├── .gitignore ├── HW1 ├── part1 │ ├── Makefile │ ├── PPintrin.cpp │ ├── PPintrin.h │ ├── def.h │ ├── logger.cpp │ ├── logger.h │ ├── main.cpp │ ├── serialOP.cpp │ └── vectorOP.cpp ├── part2 │ ├── Makefile │ ├── assembly │ │ └── test2.vec.s │ ├── fasttime.h │ ├── main.cpp │ ├── test.h │ ├── test1.cpp │ ├── test2.cpp │ ├── test2.cpp.patch │ └── test3.cpp └── submit │ ├── url.txt │ └── vectorOP.cpp ├── HW2 ├── part1 │ ├── Makefile │ ├── pi.c │ ├── report.txt │ └── shishua-avx2.h ├── part2 │ ├── Makefile │ ├── common │ │ ├── CycleTimer.h │ │ └── ppm.cpp │ ├── main.cpp │ ├── mandelbrotSerial.cpp │ ├── mandelbrotThread.cpp │ └── report.txt └── submit │ ├── part1 │ ├── Makefile │ ├── pi.c │ └── shishua-avx2.h │ ├── part2 │ └── mandelbrotThread.cpp │ └── url.txt ├── HW3 ├── part1 │ ├── Makefile │ ├── README │ ├── cg.c │ ├── cg_impl.c │ ├── cg_impl.h │ ├── common │ │ ├── c_timers.c │ │ ├── randdp.c │ │ ├── randdp.h │ │ ├── timers.h │ │ ├── type.h │ │ ├── wtime.c │ │ ├── wtime.h │ │ └── wtime_sgi64.c │ ├── def_cg.a │ ├── globals.h │ ├── grade.c │ ├── make.common │ ├── ref_cg.a │ └── report.txt ├── part2 │ ├── breadth_first_search │ │ ├── Makefile │ │ ├── bfs.cpp │ │ ├── bfs.h │ │ ├── grade.cpp │ │ ├── main.cpp │ │ ├── ref_bfs.a │ │ └── report.txt │ ├── common │ │ ├── CycleTimer.h │ │ ├── contracts.h │ │ ├── grade.h │ │ ├── graph.cpp │ │ ├── graph.h │ │ └── graph_internal.h │ ├── doc │ │ └── bfs.pdf │ ├── graphs │ │ └── README.md │ ├── page_rank │ │ ├── Makefile │ │ ├── grade.cpp │ │ ├── main.cpp │ │ ├── page_rank.cpp │ │ ├── page_rank.h │ │ ├── ref_pr.a │ │ └── report.txt │ └── tools │ │ ├── Makefile │ │ ├── graphTools.cpp │ │ └── plaintext.graph └── submit │ ├── bfs.cpp │ ├── cg_impl.c │ └── page_rank.cpp ├── HW4 ├── part1 │ ├── Makefile │ ├── hello.cc │ ├── hosts_mpi.txt │ ├── hosts_part1.txt │ ├── pi_block_linear.cc │ ├── pi_block_tree.cc │ ├── pi_gather.cc │ ├── pi_nonblock_linear.cc │ ├── pi_reduce.cc │ ├── report.txt │ └── test.py ├── part2 │ ├── Makefile │ ├── 
hosts_part2_4slots.txt │ ├── hosts_part2_7slots.txt │ ├── main.cc │ ├── matrix.cc │ ├── report.txt │ ├── test.py │ └── testdata │ │ ├── ans0_1 │ │ ├── ans0_2 │ │ ├── data0_1 │ │ └── data0_2 ├── setting │ ├── config │ └── hosts.txt └── submit │ ├── part1 │ ├── hello.cc │ ├── pi_block_linear.cc │ ├── pi_block_tree.cc │ ├── pi_gather.cc │ ├── pi_nonblock_linear.cc │ └── pi_reduce.cc │ ├── part2 │ ├── Makefile │ ├── main.cc │ └── matrix.cc │ └── url.txt ├── HW5 ├── Makefile ├── common │ ├── CycleTimer.h │ └── ppm.cpp ├── kernel.h ├── kernel1.cu ├── kernel2.cu ├── kernel3.cu ├── kernel4.cu ├── main.cpp ├── mandelbrotSerial.cpp ├── mandelbrotThread.cpp ├── mandelbrotThreadRef.a ├── mandelbrotThreadRef50.a └── mandelbrotThreadRefAll.a ├── HW6 ├── CycleTimer.h ├── Makefile ├── bmpfuncs.c ├── bmpfuncs.h ├── filter1.csv ├── filter2.csv ├── filter3.csv ├── helper.c ├── helper.h ├── hostFE.c ├── hostFE.h ├── input.bmp ├── kernel.cl ├── main.c ├── output.bmp ├── ref.bmp ├── serialConv.c └── serialConv.h └── Readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | # HW6 2 | HW6/conv 3 | 4 | # HW5 5 | HW5/mandelbrot 6 | HW5/*.ppm 7 | 8 | # HW4 9 | HW4/part1/mpi_hello 10 | HW4/part1/pi_block_linear 11 | HW4/part1/pi_block_tree 12 | HW4/part1/pi_gather 13 | HW4/part1/pi_nonblock_linear 14 | HW4/part1/pi_reduce 15 | HW4/part2/matmul 16 | 17 | # HW3 18 | HW3/part1/cg 19 | HW3/part1/cg_grader 20 | HW3/part2/breadth_first_search/bfs 21 | HW3/part2/breadth_first_search/bfs_grader 22 | HW3/part2/page_rank/pr 23 | HW3/part2/page_rank/pr_grader 24 | 25 | # HW2 26 | HW2/part1/pi.out 27 | HW2/part2/mandelbrot 28 | HW2/part2/*.ppm 29 | 30 | # HW1 31 | HW1/part1/myexp 32 | 33 | # Profiling file 34 | gmon.out 35 | profiling_result 36 | perf.data 37 | perf.data.old 38 | 39 | # Common Extensions 40 | *.o 41 | *.zip 42 | *.graph -------------------------------------------------------------------------------- /HW1/part1/Makefile: 
CXX := g++
CXXFLAGS := -I./common -O3 -std=c++17 -Wall

ifeq (/usr/bin/g++-10,$(wildcard /usr/bin/g++-10*))
CXX=g++-10
endif

all: myexp

logger.o: logger.cpp logger.h PPintrin.h PPintrin.cpp def.h
	$(CXX) $(CXXFLAGS) -c logger.cpp

PPintrin.o: PPintrin.cpp PPintrin.h logger.cpp logger.h def.h
	$(CXX) $(CXXFLAGS) -c PPintrin.cpp

# Use $(CXX)/$(CXXFLAGS) for the link step too: the rule previously
# hard-coded plain `g++ -I./common`, so the g++-10 override and the
# -O3/-std=c++17/-Wall flags were silently skipped for main.cpp,
# serialOP.cpp and vectorOP.cpp.
myexp: PPintrin.o logger.o main.cpp serialOP.cpp vectorOP.cpp
	$(CXX) $(CXXFLAGS) logger.o PPintrin.o main.cpp serialOP.cpp vectorOP.cpp -o myexp

clean:
	rm -f *.o *.s myexp *~
| // Count the number of 1s in maska 45 | int _pp_cntbits(__pp_mask &maska); 46 | 47 | // Set register to value if vector lane is active 48 | // otherwise keep the old value 49 | void _pp_vset_float(__pp_vec_float &vecResult, float value, __pp_mask &mask); 50 | void _pp_vset_int(__pp_vec_int &vecResult, int value, __pp_mask &mask); 51 | // For user's convenience, returns a vector register with all lanes initialized to value 52 | __pp_vec_float _pp_vset_float(float value); 53 | __pp_vec_int _pp_vset_int(int value); 54 | 55 | // Copy values from vector register src to vector register dest if vector lane active 56 | // otherwise keep the old value 57 | void _pp_vmove_float(__pp_vec_float &dest, __pp_vec_float &src, __pp_mask &mask); 58 | void _pp_vmove_int(__pp_vec_int &dest, __pp_vec_int &src, __pp_mask &mask); 59 | 60 | // Load values from array src to vector register dest if vector lane active 61 | // otherwise keep the old value 62 | void _pp_vload_float(__pp_vec_float &dest, float* src, __pp_mask &mask); 63 | void _pp_vload_int(__pp_vec_int &dest, int* src, __pp_mask &mask); 64 | 65 | // Store values from vector register src to array dest if vector lane active 66 | // otherwise keep the old value 67 | void _pp_vstore_float(float* dest, __pp_vec_float &src, __pp_mask &mask); 68 | void _pp_vstore_int(int* dest, __pp_vec_int &src, __pp_mask &mask); 69 | 70 | // Return calculation of (veca + vecb) if vector lane active 71 | // otherwise keep the old value 72 | void _pp_vadd_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 73 | void _pp_vadd_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 74 | 75 | // Return calculation of (veca - vecb) if vector lane active 76 | // otherwise keep the old value 77 | void _pp_vsub_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 78 | void _pp_vsub_int(__pp_vec_int &vecResult, __pp_vec_int &veca, 
__pp_vec_int &vecb, __pp_mask &mask); 79 | 80 | // Return calculation of (veca * vecb) if vector lane active 81 | // otherwise keep the old value 82 | void _pp_vmult_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 83 | void _pp_vmult_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 84 | 85 | // Return calculation of (veca / vecb) if vector lane active 86 | // otherwise keep the old value 87 | void _pp_vdiv_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 88 | void _pp_vdiv_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 89 | 90 | 91 | // Return calculation of absolute value abs(veca) if vector lane active 92 | // otherwise keep the old value 93 | void _pp_vabs_float(__pp_vec_float &vecResult, __pp_vec_float &veca, __pp_mask &mask); 94 | void _pp_vabs_int(__pp_vec_int &vecResult, __pp_vec_int &veca, __pp_mask &mask); 95 | 96 | // Return a mask of (veca > vecb) if vector lane active 97 | // otherwise keep the old value 98 | void _pp_vgt_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 99 | void _pp_vgt_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 100 | 101 | // Return a mask of (veca < vecb) if vector lane active 102 | // otherwise keep the old value 103 | void _pp_vlt_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 104 | void _pp_vlt_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 105 | 106 | // Return a mask of (veca == vecb) if vector lane active 107 | // otherwise keep the old value 108 | void _pp_veq_float(__pp_mask &vecResult, __pp_vec_float &veca, __pp_vec_float &vecb, __pp_mask &mask); 109 | void _pp_veq_int(__pp_mask &vecResult, __pp_vec_int &veca, __pp_vec_int &vecb, __pp_mask &mask); 110 | 111 | // Adds up adjacent pairs of elements, 
so 112 | // [0 1 2 3] -> [0+1 0+1 2+3 2+3] 113 | void _pp_hadd_float(__pp_vec_float &vecResult, __pp_vec_float &vec); 114 | 115 | // Performs an even-odd interleaving where all even-indexed elements move to front half 116 | // of the array and odd-indexed to the back half, so 117 | // [0 1 2 3 4 5 6 7] -> [0 2 4 6 1 3 5 7] 118 | void _pp_interleave_float(__pp_vec_float &vecResult, __pp_vec_float &vec); 119 | 120 | // Add a customized log to help debugging 121 | void addUserLog(const char * logStr); 122 | 123 | #endif 124 | -------------------------------------------------------------------------------- /HW1/part1/def.h: -------------------------------------------------------------------------------- 1 | // Define vector unit width here 2 | #define VECTOR_WIDTH 16 3 | #define EXP_MAX 10 4 | -------------------------------------------------------------------------------- /HW1/part1/logger.cpp: -------------------------------------------------------------------------------- 1 | #include "logger.h" 2 | #include "PPintrin.h" 3 | 4 | void Logger::addLog(const char *instruction, __pp_mask mask, int N) 5 | { 6 | Log newLog; 7 | strcpy(newLog.instruction, instruction); 8 | newLog.mask = 0; 9 | for (int i = 0; i < N; i++) 10 | { 11 | if (mask.value[i]) 12 | { 13 | newLog.mask |= (((unsigned long long)1) << i); 14 | stats.utilized_lane++; 15 | } 16 | } 17 | stats.total_lane += N; 18 | stats.total_instructions += (N > 0); 19 | log.push_back(newLog); 20 | } 21 | 22 | void Logger::printStats() 23 | { 24 | printf("****************** Printing Vector Unit Statistics *******************\n"); 25 | printf("Vector Width: %d\n", VECTOR_WIDTH); 26 | printf("Total Vector Instructions: %lld\n", stats.total_instructions); 27 | printf("Vector Utilization: %.1f%%\n", (double)stats.utilized_lane / stats.total_lane * 100); 28 | printf("Utilized Vector Lanes: %lld\n", stats.utilized_lane); 29 | printf("Total Vector Lanes: %lld\n", stats.total_lane); 30 | } 31 | 32 | void Logger::printLog() 33 | 
{ 34 | printf("***************** Printing Vector Unit Execution Log *****************\n"); 35 | printf(" Instruction | Vector Lane Occupancy ('*' for active, '_' for inactive)\n"); 36 | printf("------------- --------------------------------------------------------\n"); 37 | for (int i = 0; i < log.size(); i++) 38 | { 39 | printf("%12s | ", log[i].instruction); 40 | for (int j = 0; j < VECTOR_WIDTH; j++) 41 | { 42 | if (log[i].mask & (((unsigned long long)1) << j)) 43 | { 44 | printf("*"); 45 | } 46 | else 47 | { 48 | printf("_"); 49 | } 50 | } 51 | printf("\n"); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /HW1/part1/logger.h: -------------------------------------------------------------------------------- 1 | #ifndef LOGGER_H_ 2 | #define LOGGER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | using namespace std; 8 | 9 | #define MAX_INST_LEN 32 10 | 11 | struct __pp_mask; 12 | 13 | struct Log { 14 | char instruction[MAX_INST_LEN]; 15 | unsigned long long mask; // support vector width up to 64 16 | }; 17 | 18 | struct Statistics { 19 | unsigned long long utilized_lane; 20 | unsigned long long total_lane; 21 | unsigned long long total_instructions; 22 | }; 23 | 24 | class Logger { 25 | private: 26 | vector log; 27 | Statistics stats; 28 | 29 | public: 30 | void addLog(const char * instruction, __pp_mask mask, int N = 0); 31 | void printStats(); 32 | void printLog(); 33 | void refresh() { 34 | stats.total_instructions = 0; 35 | stats.total_lane = 0; 36 | stats.utilized_lane = 0; 37 | }; 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /HW1/part1/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "PPintrin.h" 6 | #include "logger.h" 7 | #include 8 | #include "def.h" 9 | using namespace std; 10 | 11 | Logger PPLogger; 12 | 13 | void usage(const char 
*progname); 14 | void initValue(float *values, int *exponents, float *output, float *gold, unsigned int N); 15 | void absSerial(float *values, float *output, int N); 16 | void absVector(float *values, float *output, int N); 17 | void clampedExpSerial(float *values, int *exponents, float *output, int N); 18 | void clampedExpVector(float *values, int *exponents, float *output, int N); 19 | float arraySumSerial(float *values, int N); 20 | float arraySumVector(float *values, int N); 21 | bool verifyResult(float *values, int *exponents, float *output, float *gold, int N); 22 | 23 | int main(int argc, char *argv[]) 24 | { 25 | int N = 16; 26 | bool printLog = false; 27 | 28 | // parse commandline options //////////////////////////////////////////// 29 | int opt; 30 | static struct option long_options[] = { 31 | {"size", 1, 0, 's'}, 32 | {"log", 0, 0, 'l'}, 33 | {"help", 0, 0, '?'}, 34 | {0, 0, 0, 0}}; 35 | 36 | while ((opt = getopt_long(argc, argv, "s:l?", long_options, NULL)) != EOF) 37 | { 38 | 39 | switch (opt) 40 | { 41 | case 's': 42 | N = atoi(optarg); 43 | if (N <= 0) 44 | { 45 | printf("Error: Workload size is set to %d (<0).\n", N); 46 | return -1; 47 | } 48 | break; 49 | case 'l': 50 | printLog = true; 51 | break; 52 | case '?': 53 | default: 54 | usage(argv[0]); 55 | return 1; 56 | } 57 | } 58 | 59 | float *values = new float[N + VECTOR_WIDTH]; 60 | int *exponents = new int[N + VECTOR_WIDTH]; 61 | float *output = new float[N + VECTOR_WIDTH]; 62 | float *gold = new float[N + VECTOR_WIDTH]; 63 | initValue(values, exponents, output, gold, N); 64 | 65 | clampedExpSerial(values, exponents, gold, N); 66 | clampedExpVector(values, exponents, output, N); 67 | 68 | //absSerial(values, gold, N); 69 | //absVector(values, output, N); 70 | 71 | printf("\e[1;31mCLAMPED EXPONENT\e[0m (required) \n"); 72 | bool clampedCorrect = verifyResult(values, exponents, output, gold, N); 73 | if (printLog) 74 | PPLogger.printLog(); 75 | PPLogger.printStats(); 76 | 77 | 
printf("************************ Result Verification *************************\n"); 78 | if (!clampedCorrect) 79 | { 80 | printf("@@@ ClampedExp Failed!!!\n"); 81 | } 82 | else 83 | { 84 | printf("ClampedExp Passed!!!\n"); 85 | } 86 | 87 | PPLogger.refresh(); 88 | 89 | printf("\n\e[1;31mARRAY SUM\e[0m (bonus) \n"); 90 | if (N % VECTOR_WIDTH == 0) 91 | { 92 | float sumGold = arraySumSerial(values, N); 93 | float sumOutput = arraySumVector(values, N); 94 | 95 | if (printLog) 96 | PPLogger.printLog(); 97 | PPLogger.printStats(); 98 | 99 | printf("************************ Result Verification *************************\n"); 100 | 101 | float epsilon = 0.1; 102 | bool sumCorrect = abs(sumGold - sumOutput) < epsilon * 2; 103 | if (!sumCorrect) 104 | { 105 | printf("Expected %f, got %f\n.", sumGold, sumOutput); 106 | printf("@@@ ArraySum Failed!!!\n"); 107 | } 108 | else 109 | { 110 | printf("ArraySum Passed!!!\n"); 111 | } 112 | } 113 | else 114 | { 115 | printf("Must have N %% VECTOR_WIDTH == 0 for this problem (VECTOR_WIDTH is %d)\n", VECTOR_WIDTH); 116 | } 117 | 118 | delete[] values; 119 | delete[] exponents; 120 | delete[] output; 121 | delete[] gold; 122 | 123 | return 0; 124 | } 125 | 126 | void usage(const char *progname) 127 | { 128 | printf("Usage: %s [options]\n", progname); 129 | printf("Program Options:\n"); 130 | printf(" -s --size Use workload size N (Default = 16)\n"); 131 | printf(" -l --log Print vector unit execution log\n"); 132 | printf(" -? 
--help This message\n"); 133 | } 134 | 135 | void initValue(float *values, int *exponents, float *output, float *gold, unsigned int N) 136 | { 137 | 138 | for (unsigned int i = 0; i < N + VECTOR_WIDTH; i++) 139 | { 140 | // random input values 141 | values[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 142 | exponents[i] = rand() % EXP_MAX; 143 | output[i] = 0.f; 144 | gold[i] = 0.f; 145 | } 146 | } 147 | 148 | bool verifyResult(float *values, int *exponents, float *output, float *gold, int N) 149 | { 150 | int incorrect = -1; 151 | float epsilon = 0.00001; 152 | for (int i = 0; i < N + VECTOR_WIDTH; i++) 153 | { 154 | if (abs(output[i] - gold[i]) > epsilon) 155 | { 156 | incorrect = i; 157 | break; 158 | } 159 | } 160 | 161 | if (incorrect != -1) 162 | { 163 | if (incorrect >= N) 164 | printf("You have written to out of bound value!\n"); 165 | printf("Wrong calculation at value[%d]!\n", incorrect); 166 | printf("value = "); 167 | for (int i = 0; i < N; i++) 168 | { 169 | printf("% f ", values[i]); 170 | } 171 | printf("\n"); 172 | 173 | printf("exp = "); 174 | for (int i = 0; i < N; i++) 175 | { 176 | printf("% 9d ", exponents[i]); 177 | } 178 | printf("\n"); 179 | 180 | printf("output = "); 181 | for (int i = 0; i < N; i++) 182 | { 183 | printf("% f ", output[i]); 184 | } 185 | printf("\n"); 186 | 187 | printf("gold = "); 188 | for (int i = 0; i < N; i++) 189 | { 190 | printf("% f ", gold[i]); 191 | } 192 | printf("\n"); 193 | return false; 194 | } 195 | printf("Results matched with answer!\n"); 196 | return true; 197 | } 198 | -------------------------------------------------------------------------------- /HW1/part1/serialOP.cpp: -------------------------------------------------------------------------------- 1 | // computes the absolute value of all elements in the input array 2 | // values, stores result in output 3 | void absSerial(float *values, float *output, int N) 4 | { 5 | for (int i = 0; i < N; i++) 6 | { 7 | float x = values[i]; 8 | if (x < 0) 9 | 
/* Serial reference: for each element, raise values[i] to the integer power
 * exponents[i] by repeated multiplication, clamping the result from above
 * at 9.999999, and store it in output[i].
 *
 * Starting the accumulator at 1.0 and multiplying `power` times yields
 * bit-identical results to the original "start at x, multiply y-1 times"
 * formulation (1.0f * x == x exactly in IEEE arithmetic), and handles the
 * exponent-zero case without a special branch. */
void clampedExpSerial(float *values, int *exponents, float *output, int N)
{
    for (int i = 0; i < N; i++)
    {
        float base = values[i];
        int power = exponents[i];

        float acc = 1.f;
        while (power-- > 0)
            acc *= base;

        if (acc > 9.999999f)
            acc = 9.999999f;

        output[i] = acc;
    }
}
14 | for (int i = 0; i < N; i += VECTOR_WIDTH) 15 | { 16 | 17 | // All ones 18 | maskAll = _pp_init_ones(); 19 | 20 | // All zeros 21 | maskIsNegative = _pp_init_ones(0); 22 | 23 | // Load vector of values from contiguous memory addresses 24 | _pp_vload_float(x, values + i, maskAll); // x = values[i]; 25 | 26 | // Set mask according to predicate 27 | _pp_vlt_float(maskIsNegative, x, zero, maskAll); // if (x < 0) { 28 | 29 | // Execute instruction using mask ("if" clause) 30 | _pp_vsub_float(result, zero, x, maskIsNegative); // output[i] = -x; 31 | 32 | // Inverse maskIsNegative to generate "else" mask 33 | maskIsNotNegative = _pp_mask_not(maskIsNegative); // } else { 34 | 35 | // Execute instruction ("else" clause) 36 | _pp_vload_float(result, values + i, maskIsNotNegative); // output[i] = x; } 37 | 38 | // Write results back to memory 39 | _pp_vstore_float(output + i, result, maskAll); 40 | } 41 | } 42 | 43 | void clampedExpVector(float *values, int *exponents, float *output, int N) 44 | { 45 | // 46 | // PP STUDENTS TODO: Implement your vectorized version of 47 | // clampedExpSerial() here. 48 | // 49 | // Your solution should work for any value of 50 | // N and VECTOR_WIDTH, not just when VECTOR_WIDTH divides N 51 | // 52 | __pp_vec_int zero, one; 53 | __pp_vec_float clampedValue; 54 | 55 | zero = _pp_vset_int(0); 56 | one = _pp_vset_int(1); 57 | clampedValue = _pp_vset_float(9.999999f); 58 | 59 | for (int i = 0; i < N; i += VECTOR_WIDTH) 60 | { 61 | __pp_vec_float x, result; 62 | __pp_vec_int y; 63 | int maskWidth; 64 | __pp_mask mask, maskEq0, maskNeq0, maskGtCV; 65 | 66 | maskWidth = i + VECTOR_WIDTH <= N ? 
VECTOR_WIDTH : N % VECTOR_WIDTH; 67 | mask = _pp_init_ones(maskWidth); 68 | 69 | maskEq0 = _pp_init_ones(0); 70 | maskNeq0 = _pp_init_ones(0); 71 | maskGtCV = _pp_init_ones(0); 72 | 73 | _pp_vload_float(x, values + i, mask); 74 | _pp_vload_int(y, exponents + i, mask); 75 | 76 | _pp_veq_int(maskEq0, y, zero, mask); 77 | _pp_vset_float(result, 1.f, maskEq0); 78 | 79 | maskNeq0 = _pp_mask_not(maskEq0); 80 | 81 | _pp_vmove_float(result, x, maskNeq0); 82 | 83 | _pp_vsub_int(y, y, one, maskNeq0); 84 | 85 | while (1) 86 | { 87 | int cnt; 88 | __pp_mask maskGt0; 89 | 90 | maskGt0 = _pp_init_ones(0); 91 | 92 | _pp_vgt_int(maskGt0, y, zero, mask); 93 | 94 | cnt = _pp_cntbits(maskGt0); 95 | 96 | if (!cnt) { 97 | break; 98 | } 99 | 100 | _pp_vmult_float(result, result, x, maskGt0); 101 | _pp_vsub_int(y, y, one, maskGt0); 102 | } 103 | 104 | _pp_vgt_float(maskGtCV, result, clampedValue, mask); 105 | _pp_vmove_float(result, clampedValue, maskGtCV); 106 | 107 | _pp_vstore_float(output + i, result, mask); 108 | } 109 | } 110 | 111 | // returns the sum of all elements in values 112 | // You can assume N is a multiple of VECTOR_WIDTH 113 | // You can assume VECTOR_WIDTH is a power of 2 114 | float arraySumVector(float *values, int N) 115 | { 116 | 117 | // 118 | // PP STUDENTS TODO: Implement your vectorized version of arraySumSerial here 119 | // 120 | __pp_vec_float sum; 121 | __pp_mask mask; 122 | 123 | sum = _pp_vset_float(0); 124 | mask = _pp_init_ones(); 125 | 126 | for (int i = 0; i < N; i += VECTOR_WIDTH) 127 | { 128 | __pp_vec_float vec; 129 | 130 | _pp_vload_float(vec, values + i, mask); 131 | _pp_vadd_float(sum, sum, vec, mask); 132 | } 133 | 134 | for (int i = VECTOR_WIDTH; i != 1; i /= 2) { 135 | _pp_hadd_float(sum, sum); 136 | _pp_interleave_float(sum, sum); 137 | } 138 | 139 | return sum.value[0]; 140 | } -------------------------------------------------------------------------------- /HW1/part2/Makefile: 
-------------------------------------------------------------------------------- 1 | TARGET := test_auto_vectorize 2 | 3 | OBJS := main.o test1.o test2.o test3.o 4 | 5 | CXX := clang++ 6 | 7 | ifeq (/usr/bin/clang++-11,$(wildcard /usr/bin/clang++-11*)) 8 | CXX=clang++-11 9 | endif 10 | 11 | CXXFLAGS := -I./common -O3 -std=c++17 -Wall 12 | 13 | ifeq ($(ASSEMBLE),1) 14 | CXXFLAGS += -S 15 | endif 16 | ifeq ($(VECTORIZE),1) 17 | CXXFLAGS += -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize 18 | SUFFIX := .vec 19 | else 20 | CXXFLAGS += -fno-vectorize 21 | SUFFIX := .novec 22 | endif 23 | ifeq ($(RESTRICT),1) 24 | SUFFIX := $(SUFFIX).restr 25 | endif 26 | ifeq ($(ALIGN),1) 27 | SUFFIX := $(SUFFIX).align 28 | endif 29 | ifeq ($(AVX2),1) 30 | CXXFLAGS += -mavx2 31 | SUFFIX := $(SUFFIX).avx2 32 | endif 33 | ifeq ($(FASTMATH),1) 34 | CXXFLAGS += -ffast-math 35 | SUFFIX := $(SUFFIX).fmath 36 | endif 37 | 38 | all: $(TARGET) 39 | 40 | %.o: %.cpp test.h 41 | ifeq ($(ASSEMBLE),1) 42 | if [ ! 
-d "./assembly" ]; then mkdir "./assembly"; fi 43 | $(CXX) $(CXXFLAGS) -c $< -o assembly/$(basename $<)$(SUFFIX).s 44 | else 45 | $(CXX) $(CXXFLAGS) -c $< 46 | endif 47 | 48 | $(TARGET): $(OBJS) 49 | ifneq ($(ASSEMBLE),1) 50 | $(CXX) $(CXXFLAGS) $(OBJS) -o $@ 51 | endif 52 | 53 | clean: 54 | rm -f *.o *.s $(TARGET) *~ 55 | 56 | cleanall: 57 | rm -rf *.o *.s $(TARGET) *~ assembly 58 | 59 | -------------------------------------------------------------------------------- /HW1/part2/fasttime.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2014 MIT License by 6.172 Staff 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to 6 | * deal in the Software without restriction, including without limitation the 7 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | * sell copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 | * IN THE SOFTWARE. 
21 | **/ 22 | 23 | #ifndef INCLUDED_FASTTIME_DOT_H 24 | #define INCLUDED_FASTTIME_DOT_H 25 | 26 | #define _POSIX_C_SOURCE 200809L 27 | 28 | #include 29 | 30 | #ifdef __MACH__ 31 | #include // mach_absolute_time 32 | 33 | typedef uint64_t fasttime_t; 34 | 35 | 36 | // Return the current time. 37 | static inline fasttime_t gettime(void) { 38 | return mach_absolute_time(); 39 | } 40 | 41 | // Return the time different between the start and the end, as a float 42 | // in units of seconds. This function does not need to be fast. 43 | // Implementation notes: See 44 | // https://developer.apple.com/library/mac/qa/qa1398/_index.html 45 | static inline double tdiff(fasttime_t start, fasttime_t end) { 46 | static mach_timebase_info_data_t timebase; 47 | int r = mach_timebase_info(&timebase); 48 | assert(r == 0); 49 | fasttime_t elapsed = end-start; 50 | double ns = (double)elapsed * timebase.numer / timebase.denom; 51 | return ns*1e-9; 52 | } 53 | 54 | static inline unsigned int random_seed_from_clock(void) { 55 | fasttime_t now = gettime(); 56 | return (now & 0xFFFFFFFF) + (now>>32); 57 | } 58 | 59 | #else // LINUX 60 | 61 | // We need _POSIX_C_SOURCE to pick up 'struct timespec' and clock_gettime. 62 | // #define _POSIX_C_SOURCE 200809L 63 | 64 | #include 65 | 66 | typedef struct timespec fasttime_t; 67 | 68 | // Return the current time. 69 | static inline fasttime_t gettime(void) { 70 | struct timespec s; 71 | #ifdef NDEBUG 72 | clock_gettime(CLOCK_MONOTONIC, &s); 73 | #else 74 | int r = clock_gettime(CLOCK_MONOTONIC, &s); 75 | assert(r == 0); 76 | #endif 77 | return s; 78 | } 79 | 80 | // Return the time different between the start and the end, as a float 81 | // in units of seconds. This function does not need to be fast. 
82 | static inline double tdiff(fasttime_t start, fasttime_t end) { 83 | return end.tv_sec - start.tv_sec + 1e-9*(end.tv_nsec - start.tv_nsec); 84 | } 85 | 86 | static inline unsigned int random_seed_from_clock(void) { 87 | fasttime_t now = gettime(); 88 | return now.tv_sec + now.tv_nsec; 89 | } 90 | 91 | // Poison these symbols to help find portability problems. 92 | int clock_gettime(clockid_t, struct timespec *) __attribute__((deprecated)); 93 | time_t time(time_t *) __attribute__((deprecated)); 94 | 95 | #endif // LINUX 96 | 97 | #endif // INCLUDED_FASTTIME_DOT_H 98 | -------------------------------------------------------------------------------- /HW1/part2/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace std; 5 | 6 | void usage(const char* progname); 7 | void initValue(float* values1, float* values2, double* value3, float* output, unsigned int N); 8 | 9 | extern void test1(float* a, float* b, float* c, int N); 10 | extern void test2(float *__restrict a, float *__restrict b, float *__restrict c, int N); 11 | extern double test3(double* __restrict a, int N) ; 12 | 13 | int main(int argc, char * argv[]) { 14 | int N = 1024; 15 | int whichTestToRun = 1; 16 | 17 | // parse commandline options //////////////////////////////////////////// 18 | int opt; 19 | static struct option long_options[] = { 20 | {"size", 1, 0, 's'}, 21 | {"test", 1, 0, 't'}, 22 | {"help", 0, 0, '?'}, 23 | {0 ,0, 0, 0} 24 | }; 25 | 26 | while ((opt = getopt_long(argc, argv, "st:?", long_options, NULL)) != EOF) { 27 | 28 | switch (opt) { 29 | case 's': 30 | N = atoi(optarg); 31 | if (N <= 0) { 32 | cout << "Error: Workload size is set to" << N << " (<0).\n"; 33 | return -1; 34 | } 35 | break; 36 | case 't': 37 | whichTestToRun = atoi(optarg); 38 | if (whichTestToRun <= 0 || whichTestToRun >= 4) { 39 | cout << "Error: test" << whichTestToRun << "() is not available.\n"; 40 | return -1; 41 | } 42 | break; 
43 | case 'h': 44 | default: 45 | usage(argv[0]); 46 | return 1; 47 | } 48 | } 49 | 50 | float* values1 = new(std::align_val_t{ 32 }) float[N]; 51 | float* values2 = new(std::align_val_t{ 32 }) float[N]; 52 | double* values3 = new(std::align_val_t{ 32 }) double[N]; 53 | float* output = new(std::align_val_t{ 32 }) float[N]; 54 | initValue(values1, values2, values3, output, N); 55 | 56 | cout << "Running test" << whichTestToRun << "()...\n"; 57 | switch (whichTestToRun) { 58 | case 1: test1(values1, values2, output, N); break; 59 | case 2: test2(values1, values2, output, N); break; 60 | case 3: test3(values3, N); break; 61 | } 62 | 63 | delete [] values1; 64 | delete [] values2; 65 | delete [] values3; 66 | delete [] output; 67 | 68 | return 0; 69 | } 70 | 71 | void usage(const char* progname) { 72 | printf("Usage: %s [options]\n", progname); 73 | printf("Program Options:\n"); 74 | printf(" -s --size Use workload size N (Default = 1024)\n"); 75 | printf(" -t --test Just run the testN function (Default = 1)\n"); 76 | printf(" -h --help This message\n"); 77 | } 78 | 79 | void initValue(float* values1, float* values2, double* values3, float* output, unsigned int N) { 80 | for (unsigned int i=0; i(rand()) / RAND_MAX; 84 | values2[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 85 | values3[i] = -1.f + 4.f * static_cast(rand()) / RAND_MAX; 86 | output[i] = 0.f; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /HW1/part2/test.h: -------------------------------------------------------------------------------- 1 | // Run for multiple experiments to reduce measurement error on gettime(). 
2 | #define I 20000000 -------------------------------------------------------------------------------- /HW1/part2/test1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | void test1(float* __restrict a, float* __restrict b, float* __restrict c, int N) { 6 | __builtin_assume(N == 1024); 7 | 8 | a = (float *)__builtin_assume_aligned(a, 32); 9 | b = (float *)__builtin_assume_aligned(b, 32); 10 | c = (float *)__builtin_assume_aligned(c, 32); 11 | 12 | fasttime_t time1 = gettime(); 13 | for (int i=0; i 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | void test2(float *__restrict a, float *__restrict b, float *__restrict c, int N) 6 | { 7 | __builtin_assume(N == 1024); 8 | a = (float *)__builtin_assume_aligned(a, 16); 9 | b = (float *)__builtin_assume_aligned(b, 16); 10 | 11 | fasttime_t time1 = gettime(); 12 | for (int i = 0; i < I; i++) 13 | { 14 | for (int j = 0; j < N; j++) 15 | { 16 | /* max() */ 17 | c[j] = a[j]; 18 | if (b[j] > a[j]) 19 | c[j] = b[j]; 20 | // if (b[j] > a[j]) c[j] = b[j]; 21 | // else c[j] = a[j]; 22 | } 23 | } 24 | 25 | fasttime_t time2 = gettime(); 26 | 27 | double elapsedf = tdiff(time1, time2); 28 | std::cout << "Elapsed execution time of the loop in test2():\n" 29 | << elapsedf << "sec (N: " << N << ", I: " << I << ")\n"; 30 | } 31 | -------------------------------------------------------------------------------- /HW1/part2/test2.cpp.patch: -------------------------------------------------------------------------------- 1 | --- test2.cpp 2 | +++ test2.cpp 3 | @@ -14,9 +14,8 @@ 4 | for (int j = 0; j < N; j++) 5 | { 6 | /* max() */ 7 | - c[j] = a[j]; 8 | - if (b[j] > a[j]) 9 | - c[j] = b[j]; 10 | + if (b[j] > a[j]) c[j] = b[j]; 11 | + else c[j] = a[j]; 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /HW1/part2/test3.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include "test.h" 3 | #include "fasttime.h" 4 | 5 | double test3(double* __restrict a, int N) { 6 | __builtin_assume(N == 1024); 7 | a = (double *)__builtin_assume_aligned(a, 16); 8 | 9 | double b = 0; 10 | 11 | fasttime_t time1 = gettime(); 12 | for (int i=0; i report.txt 27 | cat /proc/cpuinfo | grep MHz >> report.txt 28 | bash -c "{ time (./pi.out 3 100000000; ./pi.out 4 100000000) >>report.txt ; } 2>>report.txt" -------------------------------------------------------------------------------- /HW2/part1/pi.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "shishua-avx2.h" 7 | 8 | #define U32_MAX 0xffffffff 9 | 10 | typedef long int v4di __attribute__ ((vector_size (32))); 11 | typedef union { 12 | v4di v; 13 | long int e[4]; 14 | } ve4di; 15 | 16 | typedef double v4df __attribute__ ((vector_size (32))); 17 | typedef union { 18 | v4df v; 19 | double e[4]; 20 | } ve4df; 21 | 22 | typedef unsigned long u64; 23 | typedef unsigned int u32; 24 | typedef long long int s64; 25 | 26 | s64 hit; 27 | pthread_mutex_t hit_mutex = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; 28 | 29 | static inline void rand_init(prng_state *prng) 30 | { 31 | u64 seed[4]; 32 | ssize_t ret; 33 | 34 | ret = getrandom(seed, sizeof(seed), 0); 35 | 36 | if (ret < 0) { 37 | fprintf(stderr, "getrandom failed\n"); 38 | exit(0); 39 | } 40 | 41 | prng_init(prng, seed); 42 | } 43 | 44 | double get_rand(prng_state *prng, double min, double max) 45 | { 46 | uint8_t buf[0x80] __attribute__ ((aligned (64))); 47 | 48 | prng_gen(prng, buf, sizeof(buf)); 49 | 50 | return min + ((double)(*((u32 *)buf)) / U32_MAX) * (max - min); 51 | } 52 | 53 | void add_hit(int value) 54 | { 55 | pthread_mutex_lock(&hit_mutex); 56 | 57 | hit += value; 58 | 59 | pthread_mutex_unlock(&hit_mutex); 60 | } 61 | 62 | s64 
_tf_estimate_pi_4(s64 toss_cnt, prng_state *prng)
{
    s64 _hit = 0;
    /* Vector constants: 1.0 doubles for the unit-circle test, and integer
     * ones used to turn a comparison mask into a per-lane 0/1 increment. */
    ve4df fone = {1, 1, 1, 1};
    ve4di ione = {1, 1, 1, 1};
    ve4di result = {0, 0, 0, 0};

    /* toss_cnt is assumed to be a multiple of 4; the caller
     * (tf_estimate_pi) strips the remainder before calling. */
    for (s64 i = 0; i < toss_cnt; i+=4) {
        ve4df distance;
        ve4di cmp;

        ve4df x = { get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1) };
        ve4df y = { get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1),
                    get_rand(prng, -1, 1) };
        distance.v = x.v * x.v + y.v * y.v;

        /* The vector compare yields an all-ones lane mask where the toss
         * landed inside the unit circle; ANDing with ione leaves exactly
         * 0 or 1 per lane, accumulated into four independent counters. */
        cmp.v = distance.v <= fone.v;
        result.v = result.v + (ione.v & cmp.v);

        // printf("x: %f, %f, %f, %f\n", x.e[0], x.e[1], x.e[2], x.e[3]);
        // printf("y: %f, %f, %f, %f\n", y.e[0], y.e[1], y.e[2], y.e[3]);
        // printf("d: %f, %f, %f, %f\n", distance.e[0], distance.e[1], distance.e[2], distance.e[3]);
        // printf("c: %ld, %ld, %ld, %ld\n", cmp.e[0], cmp.e[1], cmp.e[2], cmp.e[3]);
        // printf("r: %ld, %ld, %ld, %ld\n", result.e[0], result.e[1], result.e[2], result.e[3]);
    }

    /* Fold the four per-lane counters into the scalar hit count. */
    for (int i = 0; i < 4; ++i) {
        _hit += result.e[i];
    }

    // printf("_hit: %lld\n", _hit);

    return _hit;
}

/* Thread entry point: performs the toss count passed (by value) through
 * the void* argument using a per-thread PRNG, then adds the hits to the
 * shared, mutex-protected global total. */
void *tf_estimate_pi(void *_toss_cnt)
{
    s64 toss_cnt = (s64)_toss_cnt;
    s64 _hit = 0;
    s64 remain;
    prng_state prng;

    rand_init(&prng);

    /* Bulk of the tosses go through the 4-wide vector path... */
    remain = toss_cnt % 4;

    _hit += _tf_estimate_pi_4(toss_cnt - remain, &prng);

    /* ...and the leftover 0-3 tosses are done scalar. */
    for (s64 i = 0; i < remain; i++) {
        double x = get_rand(&prng, -1, 1);
        double y = get_rand(&prng, -1, 1);
        double distance = x * x + y * y;
        if (distance <= 1)
            _hit++;
    }

    add_hit(_hit);

    return NULL;
}

/*
 * pi.out takes two command-line arguments, which indicate the number of
 * threads and the number of tosses, respectively.
The value of the first
 * and second arguments will not exceed the range of int and long long int,
 * respectively.
 */
int main(int argc, char **argv)
{
    int thread_cnt;
    s64 toss_cnt, remain, loading;
    double pi;
    pthread_t *tid;
    int i;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s thread_num tosses_num\n", argv[0]);
        return 0;
    }

    thread_cnt = atoi(argv[1]);
    toss_cnt = atoll(argv[2]);

    /* Guard against a non-positive thread count: thread_cnt == 0 would
     * divide by zero below and tid[thread_cnt - 1] would be out of bounds. */
    if (thread_cnt < 1 || toss_cnt < 0) {
        fprintf(stderr, "thread_num must be >= 1 and tosses_num must be >= 0\n");
        return 1;
    }

    tid = (pthread_t *)malloc(sizeof(pthread_t) * thread_cnt);
    if (tid == NULL) {
        /* Check the allocation before writing through it. */
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    remain = toss_cnt;
    loading = toss_cnt / thread_cnt;

    /* First thread_cnt-1 workers each get an equal share; the last worker
     * gets whatever remains so the total is exact. */
    for (i = 0; i < thread_cnt - 1; i++) {
        pthread_create(&tid[i], NULL, tf_estimate_pi, (void *)loading);
        remain -= loading;
    }
    pthread_create(&tid[i], NULL, tf_estimate_pi, (void *)remain);

    for (int i = 0; i < thread_cnt; i++) {
        pthread_join(tid[i], NULL);
    }

    /* pi ~= 4 * hits / tosses */
    pi = hit * (4 / ((double)toss_cnt));

    printf("%f\n", pi);

    free(tid);

    return 0;
}
-------------------------------------------------------------------------------- /HW2/part1/report.txt: --------------------------------------------------------------------------------
Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor
3.141600
| 3.141837 9 | 10 | real 0m0.660s 11 | user 0m2.107s 12 | sys 0m0.032s 13 | -------------------------------------------------------------------------------- /HW2/part1/shishua-avx2.h: -------------------------------------------------------------------------------- 1 | // Reference: https://github.com/espadrine/shishua 2 | #ifndef SHISHUA_AVX2_H 3 | #define SHISHUA_AVX2_H 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | typedef struct prng_state { 10 | __m256i state[4]; 11 | __m256i output[4]; 12 | __m256i counter; 13 | } prng_state; 14 | 15 | // buf's size must be a multiple of 128 bytes. 16 | static inline void prng_gen(prng_state *s, uint8_t buf[], size_t size) { 17 | __m256i o0 = s->output[0], o1 = s->output[1], o2 = s->output[2], o3 = s->output[3], 18 | s0 = s->state[0], s1 = s->state[1], s2 = s->state[2], s3 = s->state[3], 19 | t0, t1, t2, t3, u0, u1, u2, u3, counter = s->counter; 20 | // The following shuffles move weak (low-diffusion) 32-bit parts of 64-bit 21 | // additions to strong positions for enrichment. The low 32-bit part of a 22 | // 64-bit chunk never moves to the same 64-bit chunk as its high part. 23 | // They do not remain in the same chunk. Each part eventually reaches all 24 | // positions ringwise: A to B, B to C, …, H to A. 25 | // You may notice that they are simply 256-bit rotations (96 and 160). 26 | __m256i shu0 = _mm256_set_epi32(4, 3, 2, 1, 0, 7, 6, 5), 27 | shu1 = _mm256_set_epi32(2, 1, 0, 7, 6, 5, 4, 3); 28 | // The counter is not necessary to beat PractRand. 29 | // It sets a lower bound of 2^71 bytes = 2 ZiB to the period, 30 | // or about 7 millenia at 10 GiB/s. 31 | // The increments are picked as odd numbers, 32 | // since only coprimes of the base cover the full cycle, 33 | // and all odd numbers are coprime of 2. 34 | // I use different odd numbers for each 64-bit chunk 35 | // for a tiny amount of variation stirring. 36 | // I used the smallest odd numbers to avoid having a magic number. 
37 | __m256i increment = _mm256_set_epi64x(1, 3, 5, 7); 38 | 39 | // TODO: consider adding proper uneven write handling 40 | assert((size % 128 == 0) && "buf's size must be a multiple of 128 bytes."); 41 | 42 | for (size_t i = 0; i < size; i += 128) { 43 | if (buf != NULL) { 44 | _mm256_storeu_si256((__m256i*)&buf[i + 0], o0); 45 | _mm256_storeu_si256((__m256i*)&buf[i + 32], o1); 46 | _mm256_storeu_si256((__m256i*)&buf[i + 64], o2); 47 | _mm256_storeu_si256((__m256i*)&buf[i + 96], o3); 48 | } 49 | 50 | // I apply the counter to s1, 51 | // since it is the one whose shift loses most entropy. 52 | s1 = _mm256_add_epi64(s1, counter); 53 | s3 = _mm256_add_epi64(s3, counter); 54 | counter = _mm256_add_epi64(counter, increment); 55 | 56 | // SIMD does not support rotations. Shift is the next best thing to entangle 57 | // bits with other 64-bit positions. We must shift by an odd number so that 58 | // each bit reaches all 64-bit positions, not just half. We must lose bits 59 | // of information, so we minimize it: 1 and 3. We use different shift values 60 | // to increase divergence between the two sides. We use rightward shift 61 | // because the rightmost bits have the least diffusion in addition (the low 62 | // bit is just a XOR of the low bits). 63 | u0 = _mm256_srli_epi64(s0, 1); u1 = _mm256_srli_epi64(s1, 3); 64 | u2 = _mm256_srli_epi64(s2, 1); u3 = _mm256_srli_epi64(s3, 3); 65 | t0 = _mm256_permutevar8x32_epi32(s0, shu0); t1 = _mm256_permutevar8x32_epi32(s1, shu1); 66 | t2 = _mm256_permutevar8x32_epi32(s2, shu0); t3 = _mm256_permutevar8x32_epi32(s3, shu1); 67 | // Addition is the main source of diffusion. 68 | // Storing the output in the state keeps that diffusion permanently. 69 | s0 = _mm256_add_epi64(t0, u0); s1 = _mm256_add_epi64(t1, u1); 70 | s2 = _mm256_add_epi64(t2, u2); s3 = _mm256_add_epi64(t3, u3); 71 | 72 | // Two orthogonally grown pieces evolving independently, XORed. 
73 | o0 = _mm256_xor_si256(u0, t1); 74 | o1 = _mm256_xor_si256(u2, t3); 75 | o2 = _mm256_xor_si256(s0, s3); 76 | o3 = _mm256_xor_si256(s2, s1); 77 | } 78 | s->output[0] = o0; s->output[1] = o1; s->output[2] = o2; s->output[3] = o3; 79 | s->state [0] = s0; s->state [1] = s1; s->state [2] = s2; s->state [3] = s3; 80 | s->counter = counter; 81 | } 82 | 83 | // Nothing up my sleeve: those are the hex digits of Φ, 84 | // the least approximable irrational number. 85 | // $ echo 'scale=310;obase=16;(sqrt(5)-1)/2' | bc 86 | static uint64_t phi[16] = { 87 | 0x9E3779B97F4A7C15, 0xF39CC0605CEDC834, 0x1082276BF3A27251, 0xF86C6A11D0C18E95, 88 | 0x2767F0B153D27B7F, 0x0347045B5BF1827F, 0x01886F0928403002, 0xC1D64BA40F335E36, 89 | 0xF06AD7AE9717877E, 0x85839D6EFFBD7DC6, 0x64D325D1C5371682, 0xCADD0CCCFDFFBBE1, 90 | 0x626E33B8D04B4331, 0xBBF73C790D94F79D, 0x471C4AB3ED3D82A5, 0xFEC507705E4AE6E5, 91 | }; 92 | 93 | void prng_init(prng_state *s, uint64_t seed[4]) { 94 | memset(s, 0, sizeof(prng_state)); 95 | # define STEPS 1 96 | # define ROUNDS 13 97 | uint8_t buf[128 * STEPS]; 98 | // Diffuse first two seed elements in s0, then the last two. Same for s1. 99 | // We must keep half of the state unchanged so users cannot set a bad state. 
100 | s->state[0] = _mm256_set_epi64x(phi[ 3], phi[ 2] ^ seed[1], phi[ 1], phi[ 0] ^ seed[0]); 101 | s->state[1] = _mm256_set_epi64x(phi[ 7], phi[ 6] ^ seed[3], phi[ 5], phi[ 4] ^ seed[2]); 102 | s->state[2] = _mm256_set_epi64x(phi[11], phi[10] ^ seed[3], phi[ 9], phi[ 8] ^ seed[2]); 103 | s->state[3] = _mm256_set_epi64x(phi[15], phi[14] ^ seed[1], phi[13], phi[12] ^ seed[0]); 104 | for (size_t i = 0; i < ROUNDS; i++) { 105 | prng_gen(s, buf, 128 * STEPS); 106 | s->state[0] = s->output[3]; s->state[1] = s->output[2]; 107 | s->state[2] = s->output[1]; s->state[3] = s->output[0]; 108 | } 109 | # undef STEPS 110 | # undef ROUNDS 111 | } 112 | #endif -------------------------------------------------------------------------------- /HW2/part2/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CXX=g++ -m64 3 | CXXFLAGS=-I./common -Iobjs/ -O3 -std=c++17 -Wall 4 | 5 | APP_NAME=mandelbrot 6 | OBJDIR=objs 7 | COMMONDIR=./common 8 | 9 | PPM_CXX=$(COMMONDIR)/ppm.cpp 10 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 11 | 12 | 13 | default: $(APP_NAME) 14 | 15 | .PHONY: dirs clean 16 | 17 | dirs: 18 | /bin/mkdir -p $(OBJDIR)/ 19 | 20 | clean: 21 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 22 | 23 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ) 24 | 25 | $(APP_NAME): dirs $(OBJS) 26 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm -lpthread 27 | 28 | $(OBJDIR)/%.o: %.cpp 29 | $(CXX) $< $(CXXFLAGS) -c -o $@ 30 | 31 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 32 | $(CXX) $< $(CXXFLAGS) -c -o $@ 33 | 34 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h 35 | 36 | .PHONY: report 37 | report: clean $(APP_NAME) 38 | lscpu | grep -E "name|Flags" > report.txt 39 | cat /proc/cpuinfo | grep MHz >> report.txt 40 | bash -c "./mandelbrot -t 3 >> report.txt ; ./mandelbrot -t 4 >> report.txt" -------------------------------------------------------------------------------- 
/HW2/part2/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | #include 17 | #include 18 | #else 19 | #include 20 | #include 21 | #include 22 | #include 23 | #endif 24 | 25 | // This uses the cycle counter of the processor. Different 26 | // processors in the system will have different values for this. If 27 | // you process moves across processors, then the delta time you 28 | // measure will likely be incorrect. This is mostly for fine 29 | // grained measurements where the process is likely to be on the 30 | // same processor. For more global things you should use the 31 | // Time interface. 32 | 33 | // Also note that if you processors' speeds change (i.e. processors 34 | // scaling) or if you are in a heterogenous environment, you will 35 | // likely get spurious results. 36 | class CycleTimer 37 | { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() 45 | { 46 | #if defined(__APPLE__) && !defined(__x86_64__) 47 | return mach_absolute_time(); 48 | #elif defined(_WIN32) 49 | LARGE_INTEGER qwTime; 50 | QueryPerformanceCounter(&qwTime); 51 | return qwTime.QuadPart; 52 | #elif defined(__x86_64__) 53 | unsigned int a, d; 54 | asm volatile("rdtsc" 55 | : "=a"(a), "=d"(d)); 56 | return static_cast(a) | 57 | (static_cast(d) << 32); 58 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 
59 | unsigned int val; 60 | asm volatile("mrc p15, 0, %0, c9, c13, 0" 61 | : "=r"(val)); 62 | return val; 63 | #else 64 | timespec spec; 65 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 66 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 67 | #endif 68 | } 69 | 70 | ////////// 71 | // Return the current CPU time, in terms of seconds. 72 | // This is slower than currentTicks(). Time zero is at 73 | // some arbitrary point in the past. 74 | static double currentSeconds() 75 | { 76 | return currentTicks() * secondsPerTick(); 77 | } 78 | 79 | ////////// 80 | // Return the conversion from seconds to ticks. 81 | static double ticksPerSecond() 82 | { 83 | return 1.0 / secondsPerTick(); 84 | } 85 | 86 | static const char *tickUnits() 87 | { 88 | #if defined(__APPLE__) && !defined(__x86_64__) 89 | return "ns"; 90 | #elif defined(__WIN32__) || defined(__x86_64__) 91 | return "cycles"; 92 | #else 93 | return "ns"; // clock_gettime 94 | #endif 95 | } 96 | 97 | ////////// 98 | // Return the conversion from ticks to seconds. 
99 | static double secondsPerTick() 100 | { 101 | static bool initialized = false; 102 | static double secondsPerTick_val; 103 | if (initialized) 104 | return secondsPerTick_val; 105 | #if defined(__APPLE__) 106 | #ifdef __x86_64__ 107 | int args[] = {CTL_HW, HW_CPU_FREQ}; 108 | unsigned int Hz; 109 | size_t len = sizeof(Hz); 110 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) 111 | { 112 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 113 | exit(-1); 114 | } 115 | secondsPerTick_val = 1.0 / (double)Hz; 116 | #else 117 | mach_timebase_info_data_t time_info; 118 | mach_timebase_info(&time_info); 119 | 120 | // Scales to nanoseconds without 1e-9f 121 | secondsPerTick_val = (1e-9 * static_cast(time_info.numer)) / 122 | static_cast(time_info.denom); 123 | #endif // x86_64 or not 124 | #elif defined(_WIN32) 125 | LARGE_INTEGER qwTicksPerSec; 126 | QueryPerformanceFrequency(&qwTicksPerSec); 127 | secondsPerTick_val = 1.0 / static_cast(qwTicksPerSec.QuadPart); 128 | #else 129 | FILE *fp = fopen("/proc/cpuinfo", "r"); 130 | char input[1024]; 131 | if (!fp) 132 | { 133 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 134 | exit(-1); 135 | } 136 | // In case we don't find it, e.g. 
on the N900 137 | secondsPerTick_val = 1e-9; 138 | while (!feof(fp) && fgets(input, 1024, fp)) 139 | { 140 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 141 | // frequency scaling it's better to read the @ sign first 142 | float GHz, MHz; 143 | if (strstr(input, "model name")) 144 | { 145 | char *at_sign = strstr(input, "@"); 146 | if (at_sign) 147 | { 148 | char *after_at = at_sign + 1; 149 | char *GHz_str = strstr(after_at, "GHz"); 150 | char *MHz_str = strstr(after_at, "MHz"); 151 | if (GHz_str) 152 | { 153 | *GHz_str = '\0'; 154 | if (1 == sscanf(after_at, "%f", &GHz)) 155 | { 156 | //printf("GHz = %f\n", GHz); 157 | secondsPerTick_val = 1e-9f / GHz; 158 | break; 159 | } 160 | } 161 | else if (MHz_str) 162 | { 163 | *MHz_str = '\0'; 164 | if (1 == sscanf(after_at, "%f", &MHz)) 165 | { 166 | //printf("MHz = %f\n", MHz); 167 | secondsPerTick_val = 1e-6f / GHz; 168 | break; 169 | } 170 | } 171 | } 172 | } 173 | else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) 174 | { 175 | //printf("MHz = %f\n", MHz); 176 | secondsPerTick_val = 1e-6f / MHz; 177 | break; 178 | } 179 | } 180 | fclose(fp); 181 | #endif 182 | 183 | initialized = true; 184 | return secondsPerTick_val; 185 | } 186 | 187 | ////////// 188 | // Return the conversion from ticks to milliseconds. 
189 | static double msPerTick() 190 | { 191 | return secondsPerTick() * 1000.0; 192 | } 193 | 194 | private: 195 | CycleTimer(); 196 | }; 197 | 198 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 199 | -------------------------------------------------------------------------------- /HW2/part2/common/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | void writePPMImage(int *data, int width, int height, const char *filename, int maxIterations) 7 | { 8 | FILE *fp = fopen(filename, "wb"); 9 | 10 | // write ppm header 11 | fprintf(fp, "P6\n"); 12 | fprintf(fp, "%d %d\n", width, height); 13 | fprintf(fp, "255\n"); 14 | 15 | for (int i = 0; i < width * height; ++i) 16 | { 17 | 18 | // Clamp iteration count for this pixel, then scale the value 19 | // to 0-1 range. Raise resulting value to a power (<1) to 20 | // increase brightness of low iteration count 21 | // pixels. a.k.a. Make things look cooler. 22 | 23 | float mapped = pow(std::min(static_cast(maxIterations), 24 | static_cast(data[i])) / 25 | 256.f, 26 | .5f); 27 | 28 | // convert back into 0-255 range, 8-bit channels 29 | unsigned char result = static_cast(255.f * mapped); 30 | for (int j = 0; j < 3; ++j) 31 | fputc(result, fp); 32 | } 33 | fclose(fp); 34 | printf("Wrote image file %s\n", filename); 35 | } 36 | -------------------------------------------------------------------------------- /HW2/part2/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CycleTimer.h" 6 | 7 | extern void mandelbrotSerial( 8 | float x0, float y0, float x1, float y1, 9 | int width, int height, 10 | int startRow, int numRows, 11 | int maxIterations, 12 | int output[]); 13 | 14 | extern void mandelbrotThread( 15 | int numThreads, 16 | float x0, float y0, float x1, float y1, 17 | int width, int height, 18 | int maxIterations, 19 | int output[]); 20 | 
21 | extern void writePPMImage( 22 | int* data, 23 | int width, int height, 24 | const char *filename, 25 | int maxIterations); 26 | 27 | void 28 | scaleAndShift(float& x0, float& x1, float& y0, float& y1, 29 | float scale, 30 | float shiftX, float shiftY) 31 | { 32 | 33 | x0 *= scale; 34 | x1 *= scale; 35 | y0 *= scale; 36 | y1 *= scale; 37 | x0 += shiftX; 38 | x1 += shiftX; 39 | y0 += shiftY; 40 | y1 += shiftY; 41 | 42 | } 43 | 44 | void usage(const char* progname) { 45 | printf("Usage: %s [options]\n", progname); 46 | printf("Program Options:\n"); 47 | printf(" -t --threads Use N threads\n"); 48 | printf(" -v --view Use specified view settings\n"); 49 | printf(" -? --help This message\n"); 50 | } 51 | 52 | bool verifyResult (int *gold, int *result, int width, int height) { 53 | 54 | int i, j; 55 | 56 | for (i = 0; i < height; i++) { 57 | for (j = 0; j < width; j++) { 58 | if (gold[i * width + j] != result[i * width + j]) { 59 | printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n", 60 | i, j, gold[i * width + j], result[i * width + j]); 61 | return 0; 62 | } 63 | } 64 | } 65 | 66 | return 1; 67 | } 68 | 69 | int main(int argc, char** argv) { 70 | 71 | const unsigned int width = 1600; 72 | const unsigned int height = 1200; 73 | const int maxIterations = 256; 74 | int numThreads = 2; 75 | 76 | float x0 = -2; 77 | float x1 = 1; 78 | float y0 = -1; 79 | float y1 = 1; 80 | 81 | // parse commandline options //////////////////////////////////////////// 82 | int opt; 83 | static struct option long_options[] = { 84 | {"threads", 1, 0, 't'}, 85 | {"view", 1, 0, 'v'}, 86 | {"help", 0, 0, '?'}, 87 | {0 ,0, 0, 0} 88 | }; 89 | 90 | while ((opt = getopt_long(argc, argv, "t:v:?", long_options, NULL)) != EOF) { 91 | 92 | switch (opt) { 93 | case 't': 94 | { 95 | numThreads = atoi(optarg); 96 | break; 97 | } 98 | case 'v': 99 | { 100 | int viewIndex = atoi(optarg); 101 | // change view settings 102 | if (viewIndex == 2) { 103 | float scaleValue = .015f; 104 | float shiftX = 
-.986f; 105 | float shiftY = .30f; 106 | scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY); 107 | } else if (viewIndex > 1) { 108 | fprintf(stderr, "Invalid view index\n"); 109 | return 1; 110 | } 111 | break; 112 | } 113 | case '?': 114 | default: 115 | usage(argv[0]); 116 | return 1; 117 | } 118 | } 119 | // end parsing of commandline options 120 | 121 | 122 | int* output_serial = new int[width*height]; 123 | int* output_thread = new int[width*height]; 124 | 125 | // 126 | // Run the serial implementation. Run the code three times and 127 | // take the minimum to get a good estimate. 128 | // 129 | 130 | double minSerial = 1e30; 131 | for (int i = 0; i < 5; ++i) { 132 | memset(output_serial, 0, width * height * sizeof(int)); 133 | double startTime = CycleTimer::currentSeconds(); 134 | mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial); 135 | double endTime = CycleTimer::currentSeconds(); 136 | minSerial = std::min(minSerial, endTime - startTime); 137 | } 138 | 139 | printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000); 140 | writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations); 141 | 142 | // 143 | // Run the threaded version 144 | // 145 | 146 | double minThread = 1e30; 147 | for (int i = 0; i < 5; ++i) { 148 | memset(output_thread, 0, width * height * sizeof(int)); 149 | double startTime = CycleTimer::currentSeconds(); 150 | mandelbrotThread(numThreads, x0, y0, x1, y1, width, height, maxIterations, output_thread); 151 | double endTime = CycleTimer::currentSeconds(); 152 | minThread = std::min(minThread, endTime - startTime); 153 | } 154 | 155 | printf("[mandelbrot thread]:\t\t[%.3f] ms\n", minThread * 1000); 156 | writePPMImage(output_thread, width, height, "mandelbrot-thread.ppm", maxIterations); 157 | 158 | if (! 
verifyResult (output_serial, output_thread, width, height)) { 159 | printf ("Error : Output from threads does not match serial output\n"); 160 | 161 | delete[] output_serial; 162 | delete[] output_thread; 163 | 164 | return 1; 165 | } 166 | 167 | // compute speedup 168 | printf("\t\t\t\t(%.2fx speedup from %d threads)\n", minSerial/minThread, numThreads); 169 | 170 | delete[] output_serial; 171 | delete[] output_thread; 172 | 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /HW2/part2/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Note: This code was modified from example code 4 | originally provided by Intel. To comply with Intel's open source 5 | licensing agreement, their copyright is retained below. 6 | 7 | ----------------------------------------------------------------- 8 | 9 | Copyright (c) 2010-2011, Intel Corporation 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of Intel Corporation nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 28 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 30 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

// Escape-time iteration for one pixel: iterate z = z^2 + c starting from
// z = c, and return the iteration index at which |z|^2 first exceeds 4
// (the point escapes), or `count` if the budget is exhausted without
// escaping (the point is taken to be in the set).
static inline int mandel(float c_re, float c_im, int count)
{
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i)
    {
        // |z|^2 > 4 is equivalent to |z| > 2, the standard escape radius.
        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        // z = z^2 + c, expanded into real/imaginary parts.
        float new_re = z_re * z_re - z_im * z_im;
        float new_im = 2.f * z_re * z_im;
        z_re = c_re + new_re;
        z_im = c_im + new_im;
    }

    return i;
}

//
// MandelbrotSerial --
//
// Compute an image visualizing the mandelbrot set. The resulting
// array contains the number of iterations required before the complex
// number corresponding to a pixel could be rejected from the set.
//
// * x0, y0, x1, y1 describe the complex coordinates mapping
// into the image viewport.
68 | // * width, height describe the size of the output image 69 | // * startRow, totalRows describe how much of the image to compute 70 | void mandelbrotSerial( 71 | float x0, float y0, float x1, float y1, 72 | int width, int height, 73 | int startRow, int totalRows, 74 | int maxIterations, 75 | int output[]) 76 | { 77 | float dx = (x1 - x0) / width; 78 | float dy = (y1 - y0) / height; 79 | 80 | int endRow = startRow + totalRows; 81 | 82 | for (int j = startRow; j < endRow; j++) 83 | { 84 | for (int i = 0; i < width; ++i) 85 | { 86 | float x = x0 + i * dx; 87 | float y = y0 + j * dy; 88 | 89 | int index = (j * width + i); 90 | output[index] = mandel(x, y, maxIterations); 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /HW2/part2/report.txt: -------------------------------------------------------------------------------- 1 | Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics 2 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor 3 | cpu MHz : 1696.813 4 | cpu MHz : 1696.813 5 | cpu MHz : 1696.813 6 | cpu MHz : 1696.813 7 | [mandelbrot serial]: [543.537] ms 8 | Wrote image file mandelbrot-serial.ppm 9 | Finish Time Report: 10 | Thread 0: 0.054451 11 | Thread 1: 0.058942 12 | Thread 2: 0.059356 13 | Finish Time Report: 14 | Thread 0: 0.050994 15 | Thread 1: 0.056074 16 | Thread 2: 0.055579 17 | Finish Time Report: 18 | Thread 0: 0.055361 19 | Thread 1: 0.052566 20 | 
Thread 2: 0.054997 21 | Finish Time Report: 22 | Thread 0: 0.054042 23 | Thread 1: 0.052141 24 | Thread 2: 0.055408 25 | Finish Time Report: 26 | Thread 0: 0.053885 27 | Thread 1: 0.050878 28 | Thread 2: 0.055124 29 | [mandelbrot thread]: [55.478] ms 30 | Wrote image file mandelbrot-thread.ppm 31 | (9.80x speedup from 3 threads) 32 | [mandelbrot serial]: [556.734] ms 33 | Wrote image file mandelbrot-serial.ppm 34 | Finish Time Report: 35 | Thread 0: 0.044371 36 | Thread 1: 0.046961 37 | Thread 2: 0.043488 38 | Thread 3: 0.044690 39 | Finish Time Report: 40 | Thread 0: 0.042621 41 | Thread 1: 0.047911 42 | Thread 2: 0.043151 43 | Thread 3: 0.042625 44 | Finish Time Report: 45 | Thread 0: 0.040758 46 | Thread 1: 0.059524 47 | Thread 2: 0.046138 48 | Thread 3: 0.041580 49 | Finish Time Report: 50 | Thread 0: 0.042828 51 | Thread 1: 0.064692 52 | Thread 2: 0.043270 53 | Thread 3: 0.040743 54 | Finish Time Report: 55 | Thread 0: 0.038417 56 | Thread 1: 0.043711 57 | Thread 2: 0.046472 58 | Thread 3: 0.038204 59 | [mandelbrot thread]: [47.160] ms 60 | Wrote image file mandelbrot-thread.ppm 61 | (11.81x speedup from 4 threads) 62 | -------------------------------------------------------------------------------- /HW2/submit/part1/Makefile: -------------------------------------------------------------------------------- 1 | ../../part1/Makefile -------------------------------------------------------------------------------- /HW2/submit/part1/pi.c: -------------------------------------------------------------------------------- 1 | ../../part1/pi.c -------------------------------------------------------------------------------- /HW2/submit/part1/shishua-avx2.h: -------------------------------------------------------------------------------- 1 | ../../part1/shishua-avx2.h -------------------------------------------------------------------------------- /HW2/submit/part2/mandelbrotThread.cpp: -------------------------------------------------------------------------------- 1 | 
../../part2/mandelbrotThread.cpp -------------------------------------------------------------------------------- /HW2/submit/url.txt: -------------------------------------------------------------------------------- 1 | https://hackmd.io/@LJP/SyBbA0_rs -------------------------------------------------------------------------------- /HW3/part1/Makefile: -------------------------------------------------------------------------------- 1 | SHELL=/bin/sh 2 | BENCHMARK=cg 3 | BENCHMARKU=CG 4 | PROGRAMNAME=cg 5 | DATASIZE=MEDIUMN 6 | 7 | default: ${PROGRAMNAME} grade 8 | 9 | include make.common 10 | 11 | OBJS = cg_impl.o \ 12 | ${COMMON}/${RAND}.o \ 13 | ${COMMON}/c_timers.o \ 14 | ${COMMON}/wtime.o 15 | 16 | ${PROGRAMNAME}: config ${PROGRAMNAME}.o ${OBJS} 17 | ${CLINK} ${CLINKFLAGS} -Wl,--allow-multiple-definition -o ${PROGRAMNAME} ${PROGRAMNAME}.o ${OBJS} ${C_LIB} 18 | 19 | grade: config grade.o ${OBJS} 20 | ${CLINK} ${CLINKFLAGS} -Wl,--allow-multiple-definition -o cg_grader grade.o ${OBJS} ref_cg.a def_cg.a ${C_LIB} 21 | 22 | .c.o: 23 | ${CCOMPILE} $< -D${DATASIZE} 24 | 25 | cg.o: cg.c globals.h 26 | cg_impl.o: cg_impl.c globals.h 27 | 28 | clean: 29 | - rm -f *.o *~ 30 | rm -f ${COMMON}/*.o 31 | rm -f ${PROGRAMNAME} cg_grader 32 | rm -f gmon.out 33 | 34 | profiling: CFLAGS += -pg -Wall 35 | profiling: CLINKFLAGS += -pg -Wall 36 | profiling: clean ${PROGRAMNAME} 37 | ./${PROGRAMNAME} 38 | gprof ${PROGRAMNAME} gmon.out -b > profiling_result 39 | sudo perf record -e cpu-cycles ./${PROGRAMNAME} 40 | # sudo perf report -F+period,srcline 41 | 42 | report: clean ${PROGRAMNAME} 43 | lscpu | grep -E "name|Flags" > report.txt 44 | cat /proc/cpuinfo | grep MHz >> report.txt 45 | ./cg_grader >> report.txt -------------------------------------------------------------------------------- /HW3/part1/README: -------------------------------------------------------------------------------- 1 | Files: 2 | cg.c : main function. 
3 | cg_impl.c: the implementation of conjugate gradient method. 4 | globals.h : some data definitions. 5 | common : functions for verification and time calculation. 6 | bin : executable output directory. 7 | Makefile, make.common : make systems. 8 | 9 | Build up: 10 | make DATASIZE=[LARGE|MEDIUMN|SMALL] 11 | (MEDIUMN by default) 12 | Please make clean first if you want to change DATASIZE. 13 | 14 | Check correctness: 15 | Main function contains the verification procedure. It shows VERIFICATION SUCCESSFUL/FAILED on the screen to indicate the correctness of the program. 16 | -------------------------------------------------------------------------------- /HW3/part1/cg.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "globals.h" 6 | #include "randdp.h" 7 | #include "timers.h" 8 | #include "cg_impl.h" 9 | 10 | void init(double *zeta); 11 | void iterate(double *zeta, int *it); 12 | 13 | int main(int argc, char *argv[]) 14 | { 15 | int i, j, k, it; 16 | 17 | double zeta; 18 | 19 | double t, t_total; 20 | 21 | //char Class; 22 | logical verified; 23 | double zeta_verify_value, epsilon, err; 24 | 25 | char *t_names[T_last]; 26 | 27 | for (i = 0; i < T_last; i++) 28 | { 29 | timer_clear(i); 30 | } 31 | 32 | timer_start(T_init); 33 | 34 | zeta_verify_value = VALID_RESULT; 35 | 36 | printf("\nCG start...\n\n"); 37 | printf(" Size: %11d\n", NA); 38 | printf(" Iterations: %5d\n", NITER); 39 | printf("\n"); 40 | 41 | init(&zeta); 42 | 43 | zeta = 0.0; 44 | 45 | //--------------------------------------------------------------------- 46 | //----> 47 | // Do one iteration untimed to init all code and data page tables 48 | //----> (then reinit, start timing, to niter its) 49 | //--------------------------------------------------------------------- 50 | for (it = 1; it <= 1; it++) 51 | { 52 | iterate(&zeta, &it); 53 | } // end of do one iteration untimed 54 | 55 | 
//--------------------------------------------------------------------- 56 | // set starting vector to (1, 1, .... 1) 57 | //--------------------------------------------------------------------- 58 | for (i = 0; i < NA + 1; i++) 59 | { 60 | x[i] = 1.0; 61 | } 62 | 63 | zeta = 0.0; 64 | 65 | timer_stop(T_init); 66 | 67 | printf(" Initialization time = %15.3f seconds\n", timer_read(T_init)); 68 | t_total += timer_read(T_init); 69 | 70 | timer_start(T_bench); 71 | 72 | //--------------------------------------------------------------------- 73 | //----> 74 | // Main Iteration for inverse power method 75 | //----> 76 | //--------------------------------------------------------------------- 77 | for (it = 1; it <= NITER; it++) 78 | { 79 | iterate(&zeta, &it); 80 | } // end of main iter inv pow meth 81 | 82 | timer_stop(T_bench); 83 | 84 | //--------------------------------------------------------------------- 85 | // End of timed section 86 | //--------------------------------------------------------------------- 87 | 88 | t = timer_read(T_bench); 89 | t_total += t; 90 | 91 | printf("\nComplete...\n"); 92 | 93 | epsilon = 1.0e-10; 94 | err = fabs(zeta - zeta_verify_value) / zeta_verify_value; 95 | if (err <= epsilon) 96 | { 97 | verified = true; 98 | printf(" VERIFICATION SUCCESSFUL\n"); 99 | printf(" Zeta is %20.13E\n", zeta); 100 | printf(" Error is %20.13E\n", err); 101 | } 102 | else 103 | { 104 | verified = false; 105 | printf(" VERIFICATION FAILED\n"); 106 | printf(" Zeta %20.13E\n", zeta); 107 | printf(" The correct zeta is %20.13E\n", zeta_verify_value); 108 | } 109 | 110 | printf("\n\nExecution time : %lf seconds\n\n", t); 111 | 112 | printf("Total Time: %lf seconds\n\n", t_total); 113 | 114 | return 0; 115 | } 116 | -------------------------------------------------------------------------------- /HW3/part1/cg_impl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | 
#include "globals.h" 7 | #include "randdp.h" 8 | #include "timers.h" 9 | 10 | //--------------------------------------------------------------------- 11 | /* common / main_int_mem / */ 12 | int colidx[NZ]; 13 | int rowstr[NA + 1]; 14 | int iv[NA]; 15 | int arow[NA]; 16 | int acol[NAZ]; 17 | 18 | /* common / main_flt_mem / */ 19 | double aelt[NAZ]; 20 | double a[NZ]; 21 | double x[NA + 2]; 22 | double z[NA + 2]; 23 | double p[NA + 2]; 24 | double q[NA + 2]; 25 | double r[NA + 2]; 26 | 27 | /* common / partit_size / */ 28 | int naa; 29 | int nzz; 30 | int firstrow; 31 | int lastrow; 32 | int firstcol; 33 | int lastcol; 34 | 35 | /* common /urando/ */ 36 | double amult; 37 | double tran; 38 | 39 | /* common /timers/ */ 40 | logical timeron; 41 | //--------------------------------------------------------------------- 42 | 43 | //--------------------------------------------------------------------- 44 | void conj_grad(int colidx[], 45 | int rowstr[], 46 | double x[], 47 | double z[], 48 | double a[], 49 | double p[], 50 | double q[], 51 | double r[], 52 | double *rnorm); 53 | void makea(int n, 54 | int nz, 55 | double a[], 56 | int colidx[], 57 | int rowstr[], 58 | int firstrow, 59 | int lastrow, 60 | int firstcol, 61 | int lastcol, 62 | int arow[], 63 | int acol[][NONZER + 1], 64 | double aelt[][NONZER + 1], 65 | int iv[]); 66 | void sparse(double a[], 67 | int colidx[], 68 | int rowstr[], 69 | int n, 70 | int nz, 71 | int nozer, 72 | int arow[], 73 | int acol[][NONZER + 1], 74 | double aelt[][NONZER + 1], 75 | int firstrow, 76 | int lastrow, 77 | int nzloc[], 78 | double rcond, 79 | double shift); 80 | void sprnvc(int n, int nz, int nn1, double v[], int iv[]); 81 | int icnvrt(double x, int ipwr2); 82 | void vecset(int n, double v[], int iv[], int *nzv, int i, double val); 83 | void init(double *zeta); 84 | void iterate(double *zeta, int *it); -------------------------------------------------------------------------------- /HW3/part1/common/c_timers.c: 
-------------------------------------------------------------------------------- 1 | #include "wtime.h" 2 | #include 3 | 4 | /* Prototype */ 5 | void wtime( double * ); 6 | 7 | 8 | /*****************************************************************/ 9 | /****** E L A P S E D _ T I M E ******/ 10 | /*****************************************************************/ 11 | static double elapsed_time( void ) 12 | { 13 | double t; 14 | 15 | wtime( &t ); 16 | return( t ); 17 | } 18 | 19 | 20 | static double start[64], elapsed[64]; 21 | 22 | /*****************************************************************/ 23 | /****** T I M E R _ C L E A R ******/ 24 | /*****************************************************************/ 25 | void timer_clear( int n ) 26 | { 27 | elapsed[n] = 0.0; 28 | } 29 | 30 | 31 | /*****************************************************************/ 32 | /****** T I M E R _ S T A R T ******/ 33 | /*****************************************************************/ 34 | void timer_start( int n ) 35 | { 36 | start[n] = elapsed_time(); 37 | } 38 | 39 | 40 | /*****************************************************************/ 41 | /****** T I M E R _ S T O P ******/ 42 | /*****************************************************************/ 43 | void timer_stop( int n ) 44 | { 45 | double t, now; 46 | 47 | now = elapsed_time(); 48 | t = now - start[n]; 49 | elapsed[n] += t; 50 | 51 | } 52 | 53 | 54 | /*****************************************************************/ 55 | /****** T I M E R _ R E A D ******/ 56 | /*****************************************************************/ 57 | double timer_read( int n ) 58 | { 59 | return( elapsed[n] ); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /HW3/part1/common/randdp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | double randlc( double *x, double a ) 5 | { 6 | 
//-------------------------------------------------------------------- 7 | // 8 | // This routine returns a uniform pseudorandom double precision number in the 9 | // range (0, 1) by using the linear congruential generator 10 | // 11 | // x_{k+1} = a x_k (mod 2^46) 12 | // 13 | // where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 14 | // before repeating. The argument A is the same as 'a' in the above formula, 15 | // and X is the same as x_0. A and X must be odd double precision integers 16 | // in the range (1, 2^46). The returned value RANDLC is normalized to be 17 | // between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain 18 | // the new seed x_1, so that subsequent calls to RANDLC using the same 19 | // arguments will generate a continuous sequence. 20 | // 21 | // This routine should produce the same results on any computer with at least 22 | // 48 mantissa bits in double precision floating point data. On 64 bit 23 | // systems, double precision should be disabled. 24 | // 25 | // David H. Bailey October 26, 1990 26 | // 27 | //-------------------------------------------------------------------- 28 | 29 | // r23 = pow(0.5, 23.0); 30 | //// pow(0.5, 23.0) = 1.1920928955078125e-07 31 | // r46 = r23 * r23; 32 | // t23 = pow(2.0, 23.0); 33 | //// pow(2.0, 23.0) = 8.388608e+06 34 | // t46 = t23 * t23; 35 | 36 | const double r23 = 1.1920928955078125e-07; 37 | const double r46 = r23 * r23; 38 | const double t23 = 8.388608e+06; 39 | const double t46 = t23 * t23; 40 | 41 | double t1, t2, t3, t4, a1, a2, x1, x2, z; 42 | double r; 43 | 44 | //-------------------------------------------------------------------- 45 | // Break A into two parts such that A = 2^23 * A1 + A2. 
46 | //-------------------------------------------------------------------- 47 | t1 = r23 * a; 48 | a1 = (int) t1; 49 | a2 = a - t23 * a1; 50 | 51 | //-------------------------------------------------------------------- 52 | // Break X into two parts such that X = 2^23 * X1 + X2, compute 53 | // Z = A1 * X2 + A2 * X1 (mod 2^23), and then 54 | // X = 2^23 * Z + A2 * X2 (mod 2^46). 55 | //-------------------------------------------------------------------- 56 | t1 = r23 * (*x); 57 | x1 = (int) t1; 58 | x2 = *x - t23 * x1; 59 | t1 = a1 * x2 + a2 * x1; 60 | t2 = (int) (r23 * t1); 61 | z = t1 - t23 * t2; 62 | t3 = t23 * z + a2 * x2; 63 | t4 = (int) (r46 * t3); 64 | *x = t3 - t46 * t4; 65 | r = r46 * (*x); 66 | 67 | return r; 68 | } 69 | 70 | 71 | void vranlc( int n, double *x, double a, double y[] ) 72 | { 73 | //-------------------------------------------------------------------- 74 | // 75 | // This routine generates N uniform pseudorandom double precision numbers in 76 | // the range (0, 1) by using the linear congruential generator 77 | // 78 | // x_{k+1} = a x_k (mod 2^46) 79 | // 80 | // where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers 81 | // before repeating. The argument A is the same as 'a' in the above formula, 82 | // and X is the same as x_0. A and X must be odd double precision integers 83 | // in the range (1, 2^46). The N results are placed in Y and are normalized 84 | // to be between 0 and 1. X is updated to contain the new seed, so that 85 | // subsequent calls to VRANLC using the same arguments will generate a 86 | // continuous sequence. If N is zero, only initialization is performed, and 87 | // the variables X, A and Y are ignored. 88 | // 89 | // This routine is the standard version designed for scalar or RISC systems. 90 | // However, it should produce the same results on any single processor 91 | // computer with at least 48 mantissa bits in double precision floating point 92 | // data. 
On 64 bit systems, double precision should be disabled. 93 | // 94 | //-------------------------------------------------------------------- 95 | 96 | // r23 = pow(0.5, 23.0); 97 | //// pow(0.5, 23.0) = 1.1920928955078125e-07 98 | // r46 = r23 * r23; 99 | // t23 = pow(2.0, 23.0); 100 | //// pow(2.0, 23.0) = 8.388608e+06 101 | // t46 = t23 * t23; 102 | 103 | const double r23 = 1.1920928955078125e-07; 104 | const double r46 = r23 * r23; 105 | const double t23 = 8.388608e+06; 106 | const double t46 = t23 * t23; 107 | 108 | double t1, t2, t3, t4, a1, a2, x1, x2, z; 109 | 110 | int i; 111 | 112 | //-------------------------------------------------------------------- 113 | // Break A into two parts such that A = 2^23 * A1 + A2. 114 | //-------------------------------------------------------------------- 115 | t1 = r23 * a; 116 | a1 = (int) t1; 117 | a2 = a - t23 * a1; 118 | 119 | //-------------------------------------------------------------------- 120 | // Generate N results. This loop is not vectorizable. 121 | //-------------------------------------------------------------------- 122 | for ( i = 0; i < n; i++ ) { 123 | //-------------------------------------------------------------------- 124 | // Break X into two parts such that X = 2^23 * X1 + X2, compute 125 | // Z = A1 * X2 + A2 * X1 (mod 2^23), and then 126 | // X = 2^23 * Z + A2 * X2 (mod 2^46). 
127 | //-------------------------------------------------------------------- 128 | t1 = r23 * (*x); 129 | x1 = (int) t1; 130 | x2 = *x - t23 * x1; 131 | t1 = a1 * x2 + a2 * x1; 132 | t2 = (int) (r23 * t1); 133 | z = t1 - t23 * t2; 134 | t3 = t23 * z + a2 * x2; 135 | t4 = (int) (r46 * t3) ; 136 | *x = t3 - t46 * t4; 137 | y[i] = r46 * (*x); 138 | } 139 | 140 | return; 141 | } 142 | 143 | -------------------------------------------------------------------------------- /HW3/part1/common/randdp.h: -------------------------------------------------------------------------------- 1 | #ifndef __RANDDP_H__ 2 | #define __RANDDP_H__ 3 | 4 | double randlc( double *x, double a ); 5 | void vranlc( int n, double *x, double a, double y[] ); 6 | 7 | #endif 8 | 9 | -------------------------------------------------------------------------------- /HW3/part1/common/timers.h: -------------------------------------------------------------------------------- 1 | #ifndef __TIMERS_H__ 2 | #define __TIMERS_H__ 3 | 4 | void timer_clear( int n ); 5 | void timer_start( int n ); 6 | void timer_stop( int n ); 7 | double timer_read( int n ); 8 | 9 | #endif 10 | 11 | -------------------------------------------------------------------------------- /HW3/part1/common/type.h: -------------------------------------------------------------------------------- 1 | #ifndef __TYPE_H__ 2 | #define __TYPE_H__ 3 | 4 | typedef enum { false, true } logical; 5 | typedef struct { 6 | double real; 7 | double imag; 8 | } dcomplex; 9 | 10 | 11 | #define min(x,y) ((x) < (y) ? (x) : (y)) 12 | #define max(x,y) ((x) > (y) ? 
(x) : (y)) 13 | 14 | #endif //__TYPE_H__ 15 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime.c: -------------------------------------------------------------------------------- 1 | #include "wtime.h" 2 | #include 3 | #ifndef DOS 4 | #include 5 | #endif 6 | 7 | void wtime(double *t) 8 | { 9 | static int sec = -1; 10 | struct timeval tv; 11 | gettimeofday(&tv, (void *)0); 12 | if (sec < 0) sec = tv.tv_sec; 13 | *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec; 14 | } 15 | 16 | 17 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime.h: -------------------------------------------------------------------------------- 1 | /* C/Fortran interface is different on different machines. 2 | * You may need to tweak this. 3 | */ 4 | 5 | 6 | #if defined(IBM) 7 | #define wtime wtime 8 | #elif defined(CRAY) 9 | #define wtime WTIME 10 | #else 11 | #define wtime wtime_ 12 | #endif 13 | -------------------------------------------------------------------------------- /HW3/part1/common/wtime_sgi64.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | /* The following works on SGI Power Challenge systems */ 10 | 11 | typedef unsigned long iotimer_t; 12 | 13 | unsigned int cycleval; 14 | volatile iotimer_t *iotimer_addr, base_counter; 15 | double resolution; 16 | 17 | /* address_t is an integer type big enough to hold an address */ 18 | typedef unsigned long address_t; 19 | 20 | 21 | 22 | void timer_init() 23 | { 24 | 25 | int fd; 26 | char *virt_addr; 27 | address_t phys_addr, page_offset, pagemask, pagebase_addr; 28 | 29 | pagemask = getpagesize() - 1; 30 | errno = 0; 31 | phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval); 32 | if (errno != 0) { 33 | perror("SGI_QUERY_CYCLECNTR"); 34 | exit(1); 35 | } 36 | /* rel_addr = page offset of physical address */ 
37 | page_offset = phys_addr & pagemask; 38 | pagebase_addr = phys_addr - page_offset; 39 | fd = open("/dev/mmem", O_RDONLY); 40 | 41 | virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr); 42 | virt_addr = virt_addr + page_offset; 43 | iotimer_addr = (iotimer_t *)virt_addr; 44 | /* cycleval in picoseconds to this gives resolution in seconds */ 45 | resolution = 1.0e-12*cycleval; 46 | base_counter = *iotimer_addr; 47 | } 48 | 49 | void wtime_(double *time) 50 | { 51 | static int initialized = 0; 52 | volatile iotimer_t counter_value; 53 | if (!initialized) { 54 | timer_init(); 55 | initialized = 1; 56 | } 57 | counter_value = *iotimer_addr - base_counter; 58 | *time = (double)counter_value * resolution; 59 | } 60 | 61 | 62 | void wtime(double *time) 63 | { 64 | static int initialized = 0; 65 | volatile iotimer_t counter_value; 66 | if (!initialized) { 67 | timer_init(); 68 | initialized = 1; 69 | } 70 | counter_value = *iotimer_addr - base_counter; 71 | *time = (double)counter_value * resolution; 72 | } 73 | 74 | 75 | -------------------------------------------------------------------------------- /HW3/part1/def_cg.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part1/def_cg.a -------------------------------------------------------------------------------- /HW3/part1/globals.h: -------------------------------------------------------------------------------- 1 | #include "type.h" 2 | 3 | //small datasize 4 | #ifdef SMALL 5 | #define NA 7000 6 | #define NONZER 8 7 | #define SHIFT 12 8 | #define NITER 15 9 | #define RCOND 1.0e-1 10 | #define VALID_RESULT 10.362595087124 11 | #endif 12 | 13 | //midiumn datasize 14 | #ifdef MEDIUMN 15 | #define NA 14000 16 | #define NONZER 11 17 | #define SHIFT 20 18 | #define NITER 15 19 | #define RCOND 1.0e-1 20 | #define VALID_RESULT 17.130235054029 21 | #endif 22 | 23 
| //large datasize 24 | #ifdef LARGE 25 | #define NA 75000 26 | #define NONZER 13 27 | #define SHIFT 60 28 | #define NITER 75 29 | #define RCOND 1.0e-1 30 | #define VALID_RESULT 22.712745482631 31 | #endif 32 | 33 | #define NZ (NA*(NONZER+1)*(NONZER+1)) 34 | #define NAZ (NA*(NONZER+1)) 35 | 36 | #define T_init 0 37 | #define T_bench 1 38 | #define T_conj_grad 2 39 | #define T_last 3 40 | 41 | -------------------------------------------------------------------------------- /HW3/part1/grade.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "globals.h" 7 | #include "randdp.h" 8 | #include "timers.h" 9 | #include "cg_impl.h" 10 | 11 | void init(double *zeta); 12 | void iterate(double *zeta, int *it); 13 | void reference_init(double *zeta); 14 | void reference_iterate(double *zeta, int *it); 15 | void default_init(double *zeta); 16 | void default_iterate(double *zeta, int *it); 17 | 18 | void print_scores(double stu_time, double ref_time, logical verified) 19 | { 20 | double max_score = 30; 21 | double max_perf_score = 0.8 * max_score; 22 | double correctness_score = 0.2 * max_score; 23 | correctness_score = (verified == true) ? correctness_score : 0; 24 | 25 | double ratio = (ref_time/stu_time); 26 | 27 | double slope = max_perf_score/(0.7 - 0.3); 28 | double offset = 0.3 * slope; 29 | 30 | double perf_score = (verified == true) ? 
ratio*slope - offset : 0; 31 | 32 | if (perf_score < 0) perf_score = 0; 33 | if (perf_score > max_perf_score) perf_score = max_perf_score; 34 | 35 | printf("correctness : %lf\n", correctness_score); 36 | printf("performance : %lf\n", perf_score); 37 | printf("total : %lf\n", correctness_score + perf_score); 38 | 39 | return; 40 | } 41 | 42 | int main(int argc, char *argv[]) 43 | { 44 | int num_threads = omp_get_max_threads(); 45 | int i, j, k, it; 46 | double zeta; 47 | double t, t_total = 0, reference_total = 0, default_total = 0; 48 | logical verified; 49 | double zeta_verify_value, epsilon, err; 50 | char *t_names[T_last]; 51 | 52 | omp_set_num_threads(num_threads); 53 | zeta_verify_value = VALID_RESULT; 54 | 55 | printf("\nCG start...\n\n"); 56 | printf(" Size: %11d\n", NA); 57 | printf(" Iterations: %5d\n", NITER); 58 | printf(" Running with %d threads\n", num_threads); 59 | printf("\n"); 60 | 61 | for (i = 0; i < T_last; i++) 62 | { 63 | timer_clear(i); 64 | } 65 | timer_start(T_init); 66 | init(&zeta); 67 | zeta = 0.0; 68 | for (it = 1; it <= 1; it++) 69 | { 70 | iterate(&zeta, &it); 71 | } 72 | for (i = 0; i < NA + 1; i++) 73 | { 74 | x[i] = 1.0; 75 | } 76 | zeta = 0.0; 77 | timer_stop(T_init); 78 | t_total += timer_read(T_init); 79 | 80 | timer_start(T_bench); 81 | for (it = 1; it <= NITER; it++) 82 | { 83 | iterate(&zeta, &it); 84 | } 85 | timer_stop(T_bench); 86 | t = timer_read(T_bench); 87 | t_total += t; 88 | 89 | epsilon = 1.0e-10; 90 | err = fabs(zeta - zeta_verify_value) / zeta_verify_value; 91 | if (err <= epsilon) 92 | { 93 | verified = true; 94 | printf(" VERIFICATION SUCCESSFUL\n"); 95 | printf(" Zeta is %20.13E\n", zeta); 96 | printf(" Error is %20.13E\n", err); 97 | } 98 | else 99 | { 100 | verified = false; 101 | printf(" VERIFICATION FAILED\n"); 102 | printf(" Zeta %20.13E\n", zeta); 103 | printf(" The correct zeta is %20.13E\n", zeta_verify_value); 104 | } 105 | 106 | for (i = 0; i < T_last; i++) 107 | { 108 | timer_clear(i); 109 | } 110 | 
timer_start(T_init); 111 | reference_init(&zeta); 112 | zeta = 0.0; 113 | for (it = 1; it <= 1; it++) 114 | { 115 | reference_iterate(&zeta, &it); 116 | } 117 | for (i = 0; i < NA + 1; i++) 118 | { 119 | x[i] = 1.0; 120 | } 121 | zeta = 0.0; 122 | timer_stop(T_init); 123 | reference_total += timer_read(T_init); 124 | 125 | timer_start(T_bench); 126 | for (it = 1; it <= NITER; it++) 127 | { 128 | reference_iterate(&zeta, &it); 129 | } 130 | timer_stop(T_bench); 131 | t = timer_read(T_bench); 132 | reference_total += t; 133 | 134 | for (i = 0; i < T_last; i++) 135 | { 136 | timer_clear(i); 137 | } 138 | timer_start(T_init); 139 | default_init(&zeta); 140 | zeta = 0.0; 141 | for (it = 1; it <= 1; it++) 142 | { 143 | default_iterate(&zeta, &it); 144 | } 145 | for (i = 0; i < NA + 1; i++) 146 | { 147 | x[i] = 1.0; 148 | } 149 | zeta = 0.0; 150 | timer_stop(T_init); 151 | default_total += timer_read(T_init); 152 | 153 | timer_start(T_bench); 154 | for (it = 1; it <= NITER; it++) 155 | { 156 | default_iterate(&zeta, &it); 157 | } 158 | timer_stop(T_bench); 159 | t = timer_read(T_bench); 160 | default_total += t; 161 | 162 | printf("\nreference time : %lfs\n", reference_total); 163 | printf("default time : %lfs\n", default_total); 164 | printf("student time : %lfs\n\n", t_total); 165 | 166 | if (default_total - 0.1 < t_total) 167 | { 168 | printf("Your implementation should be faster than default - 0.1s!\n\n"); 169 | verified = false; 170 | } 171 | 172 | print_scores(t_total, reference_total, verified); 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /HW3/part1/make.common: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------------------------- 2 | # Compiler configurations 3 | #--------------------------------------------------------------------------- 4 | CC = gcc 5 | CLINK = $(CC) 6 | C_LIB = -lm 7 | C_INC = 
-Icommon 8 | CFLAGS = -g -O3 -mcmodel=medium -fopenmp 9 | CLINKFLAGS = -O3 -mcmodel=medium -fopenmp 10 | UCC = gcc 11 | BINDIR = bin 12 | RAND = randdp 13 | WTIME = wtime.c 14 | 15 | CCOMPILE = $(CC) -c $(C_INC) $(CFLAGS) 16 | CCOMPILE_pp = $(CC_pp) -c $(C_INC_pp) $(CFLAGS_pp) 17 | 18 | # Class "U" is used internally by the setparams program to mean 19 | # "unknown". This means that if you don't specify CLASS= 20 | # on the command line, you'll get an error. It would be nice 21 | # to be able to avoid this, but we'd have to get information 22 | # from the setparams back to the make program, which isn't easy. 23 | CLASS=U 24 | 25 | config: 26 | COMMON=common 27 | ${COMMON}/${RAND}.o: ${COMMON}/${RAND}.c 28 | cd ${COMMON}; ${CCOMPILE} ${RAND}.c 29 | 30 | #${COMMON}/print_results.o: ${COMMON}/print_results.c 31 | # cd ${COMMON}; ${CCOMPILE} print_results.c 32 | 33 | #${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c 34 | # cd ${COMMON}; ${CCOMPILE} c_print_results.c 35 | 36 | ${COMMON}/timers.o: ${COMMON}/timers.c 37 | cd ${COMMON}; ${CCOMPILE} timers.c 38 | 39 | ${COMMON}/c_timers.o: ${COMMON}/c_timers.c 40 | cd ${COMMON}; ${CCOMPILE} c_timers.c 41 | 42 | ${COMMON}/wtime.o: ${COMMON}/${WTIME} 43 | cd ${COMMON}; ${CCOMPILE} ${MACHINE} -o wtime.o ${WTIME} 44 | # For most machines or CRAY or IBM 45 | # cd ${COMMON}; ${CCOMPILE} ${MACHINE} ${COMMON}/wtime.c 46 | # For a precise timer on an SGI Power Challenge, try: 47 | # cd ${COMMON}; ${CCOMPILE} -o wtime.o ${COMMON}/wtime_sgi64.c 48 | 49 | ${COMMON}/c_wtime.o: ${COMMON}/${WTIME} 50 | cd ${COMMON}; ${CCOMPILE} -o c_wtime.o ${WTIME} 51 | 52 | # So that "make benchmark-name" works 53 | ${BENCHMARK}: default 54 | ${BENCHMARKU}: default 55 | 56 | 57 | -------------------------------------------------------------------------------- /HW3/part1/ref_cg.a: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part1/ref_cg.a -------------------------------------------------------------------------------- /HW3/part1/report.txt: -------------------------------------------------------------------------------- 1 | Model name: AMD Ryzen 7 PRO 4750U with Radeon Graphics 2 | Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero wbnoinvd arat umip rdpid overflow_recov succor 3 | cpu MHz : 1696.813 4 | cpu MHz : 1696.813 5 | cpu MHz : 1696.813 6 | cpu MHz : 1696.813 7 | 8 | CG start... 
9 | 10 | Size: 14000 11 | Iterations: 15 12 | Running with 4 threads 13 | 14 | 15 | iteration ||r|| zeta 16 | 1 2.60650812147631E-13 19.9997581277040 17 | 18 | iteration ||r|| zeta 19 | 1 2.60650812147631E-13 19.9997581277040 20 | 2 2.57531877367169E-15 17.1140495745506 21 | 3 2.59348789075185E-15 17.1296668946143 22 | 4 2.56262926848262E-15 17.1302113581193 23 | 5 2.51106135247005E-15 17.1302338856353 24 | 6 2.55819375820883E-15 17.1302349879482 25 | 7 2.54564770410681E-15 17.1302350498916 26 | 8 2.44940683285382E-15 17.1302350537510 27 | 9 2.48852359037289E-15 17.1302350540101 28 | 10 2.47715076108563E-15 17.1302350540284 29 | 11 2.49284410170029E-15 17.1302350540298 30 | 12 2.44437060612294E-15 17.1302350540299 31 | 13 2.47093619226119E-15 17.1302350540299 32 | 14 2.43816304501123E-15 17.1302350540299 33 | 15 2.42966732234484E-15 17.1302350540299 34 | VERIFICATION SUCCESSFUL 35 | Zeta is 1.7130235054030E+01 36 | Error is 5.1226400332279E-14 37 | 38 | reference time : 1.825870s 39 | default time : 3.216315s 40 | student time : 2.003389s 41 | 42 | correctness : 6.000000 43 | performance : 24.000000 44 | total : 30.000000 45 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: main.cpp bfs.cpp 4 | g++ -I../ -std=c++17 -fopenmp -O3 -g -o bfs main.cpp bfs.cpp ../common/graph.cpp ref_bfs.a 5 | grade: grade.cpp bfs.cpp 6 | g++ -I../ -std=c++17 -fopenmp -O3 -g -o bfs_grader grade.cpp bfs.cpp ../common/graph.cpp ref_bfs.a 7 | clean: 8 | rm -rf bfs_grader bfs *~ *.*~ 9 | report: clean all 10 | ./bfs_grader ../graphs > report.txt 11 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/bfs.h: -------------------------------------------------------------------------------- 1 | #ifndef __BFS_H__ 2 | #define __BFS_H__ 3 | 4 | 
//#define DEBUG 5 | 6 | #include "common/graph.h" 7 | 8 | struct solution 9 | { 10 | int *distances; 11 | }; 12 | 13 | struct vertex_set { 14 | // # of vertices in the set 15 | int count; 16 | // max size of buffer vertices 17 | int max_vertices; 18 | // array of vertex ids in set 19 | int *vertices; 20 | }; 21 | 22 | 23 | void bfs_top_down(Graph graph, solution* sol); 24 | void bfs_bottom_up(Graph graph, solution* sol); 25 | void bfs_hybrid(Graph graph, solution* sol); 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/ref_bfs.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/breadth_first_search/ref_bfs.a -------------------------------------------------------------------------------- /HW3/part2/breadth_first_search/report.txt: -------------------------------------------------------------------------------- 1 | Max system threads = 4 2 | Running with 4 threads 3 | 4 | Graph: grid1000x1000.graph 5 | 6 | Top down bfs 7 | ref_time: 0.0258286s 8 | stu_time: 0.0239949s 9 | 10 | Bottom up bfs 11 | ref_time: 2.97809s 12 | stu_time: 2.3487s 13 | 14 | Hybrid bfs 15 | ref_time: 1.06599s 16 | stu_time: 0.555012s 17 | 18 | Graph: soc-livejournal1_68m.graph 19 | 20 | Top down bfs 21 | ref_time: 0.428485s 22 | stu_time: 0.364067s 23 | 24 | Bottom up bfs 25 | ref_time: 0.312608s 26 | stu_time: 0.211181s 27 | 28 | Hybrid bfs 29 | ref_time: 0.154956s 30 | stu_time: 0.136973s 31 | 32 | Graph: com-orkut_117m.graph 33 | 34 | Top down bfs 35 | ref_time: 0.621974s 36 | stu_time: 0.438463s 37 | 38 | Bottom up bfs 39 | ref_time: 0.273366s 40 | stu_time: 0.230265s 41 | 42 | Hybrid bfs 43 | ref_time: 0.0907601s 44 | stu_time: 0.0687539s 45 | 46 | Graph: random_500m.graph 47 | 48 | Top down bfs 49 | ref_time: 10.3302s 50 | stu_time: 10.1746s 51 | 52 | Bottom 
up bfs 53 | ref_time: 19.7902s 54 | stu_time: 13.3703s 55 | 56 | Hybrid bfs 57 | ref_time: 4.04886s 58 | stu_time: 3.4187s 59 | 60 | Graph: rmat_200m.graph 61 | 62 | Top down bfs 63 | ref_time: 3.91783s 64 | stu_time: 3.78097s 65 | 66 | Bottom up bfs 67 | ref_time: 3.3444s 68 | stu_time: 2.08853s 69 | 70 | Hybrid bfs 71 | ref_time: 1.70545s 72 | stu_time: 1.33205s 73 | 74 | 75 | -------------------------------------------------------------------------- 76 | SCORES : | Top-Down | Bott-Up | Hybrid | 77 | -------------------------------------------------------------------------- 78 | grid1000x1000.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 79 | -------------------------------------------------------------------------- 80 | soc-livejournal1_68m.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 81 | -------------------------------------------------------------------------- 82 | com-orkut_117m.graph | 2.00 / 2 | 3.00 / 3 | 3.00 / 3 | 83 | -------------------------------------------------------------------------- 84 | random_500m.graph | 6.00 / 6 | 7.00 / 7 | 7.00 / 7 | 85 | -------------------------------------------------------------------------- 86 | rmat_200m.graph | 6.00 / 6 | 7.00 / 7 | 7.00 / 7 | 87 | -------------------------------------------------------------------------- 88 | TOTAL | 64.00 / 64 | 89 | -------------------------------------------------------------------------- 90 | -------------------------------------------------------------------------------- /HW3/part2/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 
| // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 
77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 93 | static double secondsPerTick() { 94 | static bool initialized = false; 95 | static double secondsPerTick_val; 96 | if (initialized) return secondsPerTick_val; 97 | #if defined(__APPLE__) 98 | #ifdef __x86_64__ 99 | int args[] = {CTL_HW, HW_CPU_FREQ}; 100 | unsigned int Hz; 101 | size_t len = sizeof(Hz); 102 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) { 103 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 104 | exit(-1); 105 | } 106 | secondsPerTick_val = 1.0 / (double) Hz; 107 | #else 108 | mach_timebase_info_data_t time_info; 109 | mach_timebase_info(&time_info); 110 | 111 | // Scales to nanoseconds without 1e-9f 112 | secondsPerTick_val = (1e-9*static_cast(time_info.numer))/ 113 | static_cast(time_info.denom); 114 | #endif // x86_64 or not 115 | #elif defined(_WIN32) 116 | LARGE_INTEGER qwTicksPerSec; 117 | QueryPerformanceFrequency(&qwTicksPerSec); 118 | secondsPerTick_val = 1.0/static_cast(qwTicksPerSec.QuadPart); 119 | #else 120 | FILE *fp = fopen("/proc/cpuinfo","r"); 121 | char input[1024]; 122 | if (!fp) { 123 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 124 | exit(-1); 125 | } 126 | // In case we don't find it, e.g. 
on the N900 127 | secondsPerTick_val = 1e-9; 128 | while (!feof(fp) && fgets(input, 1024, fp)) { 129 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 130 | // frequency scaling it's better to read the @ sign first 131 | float GHz, MHz; 132 | if (strstr(input, "model name")) { 133 | char* at_sign = strstr(input, "@"); 134 | if (at_sign) { 135 | char* after_at = at_sign + 1; 136 | char* GHz_str = strstr(after_at, "GHz"); 137 | char* MHz_str = strstr(after_at, "MHz"); 138 | if (GHz_str) { 139 | *GHz_str = '\0'; 140 | if (1 == sscanf(after_at, "%f", &GHz)) { 141 | //printf("GHz = %f\n", GHz); 142 | secondsPerTick_val = 1e-9f / GHz; 143 | break; 144 | } 145 | } else if (MHz_str) { 146 | *MHz_str = '\0'; 147 | if (1 == sscanf(after_at, "%f", &MHz)) { 148 | //printf("MHz = %f\n", MHz); 149 | secondsPerTick_val = 1e-6f / MHz; /* fix: was dividing by GHz, which is never set in this branch */ 150 | break; 151 | } 152 | } 153 | } 154 | } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) { 155 | //printf("MHz = %f\n", MHz); 156 | secondsPerTick_val = 1e-6f / MHz; 157 | break; 158 | } 159 | } 160 | fclose(fp); 161 | #endif 162 | 163 | initialized = true; 164 | return secondsPerTick_val; 165 | } 166 | 167 | ////////// 168 | // Return the conversion from ticks to milliseconds. 169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /HW3/part2/common/contracts.h: -------------------------------------------------------------------------------- 1 | /* Debugging with contracts; simulating cc0 -d 2 | * Enable with gcc -DDEBUG ... 3 | * 4 | * 15-122 Principles of Imperative Computation 5 | * Frank Pfenning 6 | */ 7 | 8 | #include 9 | 10 | /* Unlike typical header files, "contracts.h" may be 11 | * included multiple times, with and without DEBUG defined. 
12 | * For this to succeed we first undefine the macros in 13 | * question in order to avoid a redefinition warning. 14 | */ 15 | 16 | #undef ASSERT 17 | #undef REQUIRES 18 | #undef ENSURES 19 | 20 | #ifdef DEBUG 21 | 22 | #define ASSERT(COND) assert(COND) 23 | #define REQUIRES(COND) assert(COND) 24 | #define ENSURES(COND) assert(COND) 25 | 26 | #else 27 | 28 | #define ASSERT(COND) ((void)0) 29 | #define REQUIRES(COND) ((void)0) 30 | #define ENSURES(COND) ((void)0) 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /HW3/part2/common/grade.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRADE_H__ 2 | #define __GRADE_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "graph.h" 18 | #include "graph_internal.h" 19 | #include "contracts.h" 20 | 21 | // Epsilon for approximate float comparisons 22 | #define EPSILON 0.00000000001 23 | 24 | // Output column size 25 | #define COL_SIZE 15 26 | 27 | // Point value for apps that are not run. 28 | #define POINTS_NA -1 29 | 30 | // Point value for apps that yielded incorrect results. 
31 | #define POINTS_INCORRECT -2 32 | 33 | /* 34 | * Printing functions 35 | */ 36 | 37 | static void sep(std::ostream& out, char separator = '-', int length = 78) 38 | { 39 | for (int i = 0; i < length; i++) 40 | out << separator; 41 | out << std::endl; 42 | } 43 | 44 | static void printTimingApp(std::ostream& timing, const char* appName) 45 | { 46 | std::cout << std::endl; 47 | std::cout << "Timing results for " << appName << ":" << std::endl; 48 | sep(std::cout, '=', 75); 49 | 50 | timing << std::endl; 51 | timing << "Timing results for " << appName << ":" << std::endl; 52 | sep(timing, '=', 75); 53 | } 54 | 55 | /* 56 | * Correctness checkers 57 | */ 58 | 59 | template 60 | bool compareArrays(Graph graph, T* ref, T* stu) 61 | { 62 | for (int i = 0; i < graph->num_nodes; i++) { 63 | if (ref[i] != stu[i]) { 64 | std::cerr << "*** Results disagree at " << i << " expected " 65 | << ref[i] << " found " << stu[i] << std::endl; 66 | return false; 67 | } 68 | } 69 | return true; 70 | } 71 | 72 | template 73 | bool compareApprox(Graph graph, T* ref, T* stu) 74 | { 75 | for (int i = 0; i < graph->num_nodes; i++) { 76 | if (fabs(ref[i] - stu[i]) > EPSILON) { 77 | std::cerr << "*** Results disagree at " << i << " expected " 78 | << ref[i] << " found " << stu[i] << std::endl; 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | 85 | template 86 | bool compareArraysAndDisplay(Graph graph, T* ref, T*stu) 87 | { 88 | printf("\n----------------------------------\n"); 89 | printf("Visualization of student results"); 90 | printf("\n----------------------------------\n\n"); 91 | 92 | int grid_dim = (int)sqrt(graph->num_nodes); 93 | for (int j=0; jnum_nodes); 104 | for (int j=0; j(graph, ref, stu); 112 | } 113 | 114 | template 115 | bool compareArraysAndRadiiEst(Graph graph, T* ref, T* stu) 116 | { 117 | bool isCorrect = true; 118 | for (int i = 0; i < graph->num_nodes; i++) { 119 | if (ref[i] != stu[i]) { 120 | std::cerr << "*** Results disagree at " << i << " expected " 
121 | << ref[i] << " found " << stu[i] << std::endl; 122 | isCorrect = false; 123 | } 124 | } 125 | int stuMaxVal = -1; 126 | int refMaxVal = -1; 127 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: stuMaxVal) 128 | for (int i = 0; i < graph->num_nodes; i++) { 129 | if (stu[i] > stuMaxVal) 130 | stuMaxVal = stu[i]; 131 | } 132 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: refMaxVal) 133 | for (int i = 0; i < graph->num_nodes; i++) { 134 | if (ref[i] > refMaxVal) 135 | refMaxVal = ref[i]; 136 | } 137 | 138 | if (refMaxVal != stuMaxVal) { 139 | std::cerr << "*** Radius estimates differ. Expected: " << refMaxVal << " Got: " << stuMaxVal << std::endl; 140 | isCorrect = false; 141 | } 142 | return isCorrect; 143 | } 144 | 145 | #endif /* __GRADE_H__ */ 146 | -------------------------------------------------------------------------------- /HW3/part2/common/graph.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_H__ 2 | #define __GRAPH_H__ 3 | 4 | using Vertex = int; 5 | 6 | struct graph 7 | { 8 | // Number of edges in the graph 9 | int num_edges; 10 | // Number of vertices in the graph 11 | int num_nodes; 12 | 13 | // The node reached by vertex i's first outgoing edge is given by 14 | // outgoing_edges[outgoing_starts[i]]. To iterate over all 15 | // outgoing edges, please see the top-down bfs implementation. 
16 | int* outgoing_starts; 17 | Vertex* outgoing_edges; 18 | 19 | int* incoming_starts; 20 | Vertex* incoming_edges; 21 | }; 22 | 23 | using Graph = graph*; 24 | 25 | /* Getters */ 26 | static inline int num_nodes(const Graph); 27 | static inline int num_edges(const Graph); 28 | 29 | static inline const Vertex* outgoing_begin(const Graph, Vertex); 30 | static inline const Vertex* outgoing_end(const Graph, Vertex); 31 | static inline int outgoing_size(const Graph, Vertex); 32 | 33 | static inline const Vertex* incoming_begin(const Graph, Vertex); 34 | static inline const Vertex* incoming_end(const Graph, Vertex); 35 | static inline int incoming_size(const Graph, Vertex); 36 | 37 | 38 | /* IO */ 39 | Graph load_graph(const char* filename); 40 | Graph load_graph_binary(const char* filename); 41 | void store_graph_binary(const char* filename, Graph); 42 | 43 | void print_graph(const graph*); 44 | 45 | 46 | /* Deallocation */ 47 | void free_graph(Graph); 48 | 49 | 50 | /* Included here to enable inlining. Don't look. 
*/ 51 | #include "graph_internal.h" 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /HW3/part2/common/graph_internal.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_INTERNAL_H__ 2 | #define __GRAPH_INTERNAL_H__ 3 | 4 | #include 5 | #include "contracts.h" 6 | 7 | static inline int num_nodes(const Graph graph) 8 | { 9 | REQUIRES(graph != NULL); 10 | return graph->num_nodes; 11 | } 12 | 13 | static inline int num_edges(const Graph graph) 14 | { 15 | REQUIRES(graph != NULL); 16 | return graph->num_edges; 17 | } 18 | 19 | static inline const Vertex* outgoing_begin(const Graph g, Vertex v) 20 | { 21 | REQUIRES(g != NULL); 22 | REQUIRES(0 <= v && v < num_nodes(g)); 23 | return g->outgoing_edges + g->outgoing_starts[v]; 24 | } 25 | 26 | static inline const Vertex* outgoing_end(const Graph g, Vertex v) 27 | { 28 | REQUIRES(g != NULL); 29 | REQUIRES(0 <= v && v < num_nodes(g)); 30 | int offset = (v == g->num_nodes - 1) ? g->num_edges : g->outgoing_starts[v + 1]; 31 | return g->outgoing_edges + offset; 32 | } 33 | 34 | static inline int outgoing_size(const Graph g, Vertex v) 35 | { 36 | REQUIRES(g != NULL); 37 | REQUIRES(0 <= v && v < num_nodes(g)); 38 | if (v == g->num_nodes - 1) { 39 | return g->num_edges - g->outgoing_starts[v]; 40 | } else { 41 | return g->outgoing_starts[v + 1] - g->outgoing_starts[v]; 42 | } 43 | } 44 | 45 | static inline const Vertex* incoming_begin(const Graph g, Vertex v) 46 | { 47 | REQUIRES(g != NULL); 48 | REQUIRES(0 <= v && v < num_nodes(g)); 49 | return g->incoming_edges + g->incoming_starts[v]; 50 | } 51 | 52 | static inline const Vertex* incoming_end(const Graph g, Vertex v) 53 | { 54 | REQUIRES(g != NULL); 55 | REQUIRES(0 <= v && v < num_nodes(g)); 56 | int offset = (v == g->num_nodes - 1) ? 
g->num_edges : g->incoming_starts[v + 1]; 57 | return g->incoming_edges + offset; 58 | } 59 | 60 | static inline int incoming_size(const Graph g, Vertex v) 61 | { 62 | REQUIRES(g != NULL); 63 | REQUIRES(0 <= v && v < num_nodes(g)); 64 | if (v == g->num_nodes - 1) { 65 | return g->num_edges - g->incoming_starts[v]; 66 | } else { 67 | return g->incoming_starts[v + 1] - g->incoming_starts[v]; 68 | } 69 | } 70 | 71 | #endif // __GRAPH_INTERNAL_H__ 72 | -------------------------------------------------------------------------------- /HW3/part2/doc/bfs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/doc/bfs.pdf -------------------------------------------------------------------------------- /HW3/part2/graphs/README.md: -------------------------------------------------------------------------------- 1 | http://sslab.cs.nctu.edu.tw/~acliu/all_graphs.tgz 2 | 3 | * Be careful, this is a 3GB download 4 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: page_rank.cpp main.cpp 4 | g++ -I../ -std=c++17 -fopenmp -O3 -o pr main.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 5 | grade: page_rank.cpp grade.cpp 6 | g++ -I../ -std=c++17 -fopenmp -O3 -o pr_grader grade.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 7 | clean: 8 | rm -rf pr pr_grader *~ *.*~ 9 | report: clean all 10 | ./pr_grader ../graphs > report.txt -------------------------------------------------------------------------------- /HW3/part2/page_rank/grade.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include 
"../common/CycleTimer.h" 13 | #include "../common/graph.h" 14 | #include "../common/grade.h" 15 | #include "page_rank.h" 16 | 17 | #define USE_BINARY_GRAPH 1 18 | 19 | #define PageRankDampening 0.3f 20 | #define PageRankConvergence 1e-7d 21 | 22 | void reference_pageRank(Graph g, double* solution, double damping, 23 | double convergence); 24 | 25 | void usage(const char* binary_name) { 26 | std::cout << "Usage: " << binary_name << " [options] graphdir" << std::endl; 27 | std::cout << std::endl; 28 | std::cout << "Options:" << std::endl; 29 | std::cout << " -n INT number of threads" << std::endl; 30 | std::cout << " -r INT number of runs" << std::endl; 31 | std::cout << " -h this commandline help message" << std::endl; 32 | } 33 | 34 | graph* load_graph(std::string graph_filename) { 35 | graph* g; 36 | if (USE_BINARY_GRAPH) { 37 | g = load_graph_binary(graph_filename.c_str()); 38 | } else { 39 | g = load_graph(graph_filename); 40 | printf("storing binary form of graph!\n"); 41 | store_graph_binary(graph_filename.append(".bin").c_str(), g); 42 | free_graph(g); 43 | exit(1); 44 | } 45 | return g; 46 | } 47 | 48 | double run_on_graph(graph* g, int num_threads, int num_runs, std::string graph_name) { 49 | 50 | double* sol_stu = new double[g->num_nodes]; 51 | double* sol_ref = new double[g->num_nodes]; 52 | 53 | omp_set_num_threads(num_threads); 54 | 55 | double start, time; 56 | 57 | //Run implementation 58 | double stu_time = std::numeric_limits::max(); 59 | for (int r = 0; r < num_runs; r++) { 60 | start = CycleTimer::currentSeconds(); 61 | pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 62 | //reference_pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 63 | time = CycleTimer::currentSeconds() - start; 64 | stu_time = std::min(stu_time, time); 65 | } 66 | 67 | //Run reference implementation 68 | double ref_time = std::numeric_limits::max(); 69 | for (int r = 0; r < num_runs; r++) { 70 | start = CycleTimer::currentSeconds(); 71 | 
reference_pageRank(g, sol_ref, PageRankDampening, PageRankConvergence); 72 | time = CycleTimer::currentSeconds() - start; 73 | ref_time = std::min(ref_time, time); 74 | } 75 | 76 | bool correct = compareApprox(g, sol_ref, sol_stu); 77 | 78 | delete [] sol_stu; // fix: allocated with new[], so delete[] is required (delete(p) on a new[] array is UB) 79 | delete [] sol_ref; 80 | 81 | if (!correct) { 82 | std::cout << "Page rank incorrect" << std::endl; 83 | } else { 84 | std::cout << "ref_time: " << ref_time << "s" << std::endl; 85 | std::cout << "stu_time: " << stu_time << "s" << std::endl; 86 | } 87 | 88 | double max_score = 4; 89 | double max_perf_score = 0.8 * max_score; 90 | double correctness_score = 0.2 * max_score; 91 | correctness_score = (correct) ? correctness_score : 0; 92 | 93 | double ratio = (ref_time/stu_time); 94 | 95 | double slope = max_perf_score/(0.7 - 0.3); 96 | double offset = 0.3 * slope; 97 | 98 | double perf_score = (correct) ? ratio*slope - offset : 0; 99 | 100 | if (perf_score < 0) perf_score = 0; 101 | if (perf_score > max_perf_score) perf_score = max_perf_score; 102 | 103 | return (correctness_score + perf_score); 104 | } 105 | 106 | void print_separator_line() { 107 | for (int i = 0; i < 43; i++) { 108 | std::cout<<"-"; 109 | } 110 | std::cout< grade_graphs, std::vector scores) { 114 | 115 | std::cout.precision(5); 116 | std::cout.setf(std::ios::fixed, std:: ios::floatfield); 117 | std::cout< grade_graphs = { "soc-livejournal1_68m.graph", 191 | "com-orkut_117m.graph", 192 | "rmat_200m.graph", 193 | "random_500m.graph"}; 194 | 195 | std::vector scores(grade_graphs.size()); 196 | 197 | int i = 0; 198 | for (auto& graph_name: grade_graphs) { 199 | graph* g = load_graph(graph_dir + '/' + graph_name); 200 | std::cout << "\nGraph: " << graph_name << std::endl; 201 | scores[i] = run_on_graph(g, num_threads, num_runs, graph_name); 202 | free_graph(g); 203 | i++; 204 | } 205 | 206 | print_scores(grade_graphs, scores); 207 | 208 | return 0; 209 | } 210 | -------------------------------------------------------------------------------- 
/HW3/part2/page_rank/page_rank.cpp: -------------------------------------------------------------------------------- 1 | #include "page_rank.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include "../common/CycleTimer.h" 11 | #include "../common/graph.h" 12 | 13 | // pageRank -- 14 | // 15 | // g: graph to process (see common/graph.h) 16 | // solution: array of per-vertex vertex scores (length of array is num_nodes(g)) 17 | // damping: page-rank algorithm's damping parameter 18 | // convergence: page-rank algorithm's convergence threshold 19 | // 20 | void pageRank(Graph g, double *solution, double damping, double convergence) 21 | { 22 | 23 | // initialize vertex weights to uniform probability. Double 24 | // precision scores are used to avoid underflow for large graphs 25 | 26 | /* 27 | For PP students: Implement the page rank algorithm here. You 28 | are expected to parallelize the algorithm using openMP. Your 29 | solution may need to allocate (and free) temporary arrays. 
30 | 31 | Basic page rank pseudocode is provided below to get you started: 32 | 33 | // initialization: see example code above 34 | score_old[vi] = 1/numNodes; 35 | 36 | while (!converged) { 37 | 38 | // compute score_new[vi] for all nodes vi: 39 | score_new[vi] = sum over all nodes vj reachable from incoming edges 40 | { score_old[vj] / number of edges leaving vj } 41 | score_new[vi] = (damping * score_new[vi]) + (1.0-damping) / numNodes; 42 | 43 | score_new[vi] += sum over all nodes v in graph with no outgoing edges 44 | { damping * score_old[v] / numNodes } 45 | 46 | // compute how much per-node scores have changed 47 | // quit once algorithm has converged 48 | 49 | global_diff = sum over all nodes vi { abs(score_new[vi] - score_old[vi]) }; 50 | converged = (global_diff < convergence) 51 | } 52 | 53 | */ 54 | int num_nodes = g->num_nodes; 55 | int num_edges = g->num_edges; 56 | int *outgoing_starts = g->outgoing_starts; 57 | Vertex *outgoing_edges = g->outgoing_edges; 58 | int *incoming_starts = g->incoming_starts; 59 | Vertex *incoming_edges = g->incoming_edges; 60 | 61 | double global_diff; 62 | double *solution_old = new double[num_nodes]; 63 | double equal_prob = 1.0 / num_nodes; 64 | std::vector partial_vs[omp_get_max_threads()]; 65 | std::vector other_vs; // all nodes v in graph with no outgoing edges 66 | double partial_scores[omp_get_max_threads()]; 67 | double no_out_score; 68 | 69 | #pragma omp parallel for 70 | for (Vertex v = 0; v < num_nodes; ++v) 71 | { 72 | int start_edge = outgoing_starts[v]; 73 | int end_edge = (v == num_nodes - 1) 74 | ? 
num_edges 75 | : outgoing_starts[v + 1]; 76 | 77 | if (start_edge == end_edge) 78 | { 79 | partial_vs[omp_get_thread_num()].push_back(v); 80 | } 81 | } 82 | 83 | for (int i = 0; i < omp_get_max_threads(); ++i) 84 | { 85 | for (Vertex v : partial_vs[i]) 86 | { 87 | other_vs.push_back(v); 88 | } 89 | } 90 | 91 | #pragma omp parallel for 92 | for (Vertex i = 0; i < num_nodes; ++i) 93 | { 94 | solution_old[i] = equal_prob; 95 | } 96 | 97 | while (1) 98 | { 99 | // sum over all nodes incoming_v reachable from incoming edges 100 | // score[v] = sum(score_old[incoming_v] / 101 | // number of edges leaving incoming_v) for each incoming_v 102 | 103 | #pragma omp parallel for 104 | for (Vertex v = 0; v < num_nodes; ++v) 105 | { 106 | solution[v] = 0; 107 | int start_edge = incoming_starts[v]; 108 | int end_edge = (v == num_nodes - 1) 109 | ? num_edges 110 | : incoming_starts[v + 1]; 111 | 112 | for (int edgeidx = start_edge; edgeidx < end_edge; ++edgeidx) 113 | { 114 | Vertex incoming_v = incoming_edges[edgeidx]; 115 | 116 | int out_start_edge = outgoing_starts[incoming_v]; 117 | int out_end_edge = (incoming_v == num_nodes - 1) 118 | ? 
num_edges 119 | : outgoing_starts[incoming_v + 1]; 120 | solution[v] += (solution_old[incoming_v] / (out_end_edge - out_start_edge)); 121 | } 122 | } 123 | 124 | // damping & sum over all nodes other_v in graph with no outgoing edges 125 | // score[v] = (damping * score[v]) + (1.0-damping) / num_nodes; 126 | // score[v] += sum(damping * score_old[other_v] / 127 | // num_nodes) for each other_v 128 | // ---> 129 | // score[v] = (damping * score[v]) + 130 | // ((damping * sum(score_old[other_v]) for each other_v) + (1.0-damping)) / num_nodes 131 | 132 | for (int i = 0; i < omp_get_max_threads(); ++i) 133 | { 134 | partial_scores[i] = 0; 135 | } 136 | 137 | #pragma omp parallel for 138 | for (int i = 0; i < other_vs.size(); ++i) 139 | { 140 | Vertex other_v = other_vs[i]; 141 | partial_scores[omp_get_thread_num()] += solution_old[other_v]; 142 | } 143 | 144 | no_out_score = 0; 145 | for (int i = 0; i < omp_get_max_threads(); ++i) 146 | { 147 | no_out_score += partial_scores[i]; 148 | } 149 | 150 | no_out_score = ((damping * no_out_score) + (1.0 - damping)) / num_nodes; 151 | 152 | #pragma omp parallel for 153 | for (Vertex v = 0; v < num_nodes; ++v) 154 | { 155 | solution[v] = damping * solution[v] + no_out_score; 156 | } 157 | 158 | // compute how much per-node scores have changed 159 | // quit once algorithm has converged 160 | 161 | for (int i = 0; i < omp_get_max_threads(); ++i) 162 | { 163 | partial_scores[i] = 0; 164 | } 165 | 166 | #pragma omp parallel for 167 | for (Vertex v = 0; v < num_nodes; ++v) 168 | { 169 | double diff = solution[v] - solution_old[v]; 170 | partial_scores[omp_get_thread_num()] += diff >= 0 ? 
diff : -diff; 171 | } 172 | 173 | global_diff = 0; 174 | for (int i = 0; i < omp_get_max_threads(); ++i) 175 | { 176 | global_diff += partial_scores[i]; 177 | } 178 | 179 | if (global_diff < convergence) 180 | break; 181 | 182 | #pragma omp parallel for 183 | for (int i = 0; i < num_nodes; ++i) 184 | { 185 | solution_old[i] = solution[i]; 186 | } 187 | } 188 | 189 | delete [] solution_old; 190 | } 191 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/page_rank.h: -------------------------------------------------------------------------------- 1 | #ifndef __PAGE_RANK_H__ 2 | #define __PAGE_RANK_H__ 3 | 4 | #include "common/graph.h" 5 | 6 | void pageRank(Graph g, double* solution, double damping, double convergence); 7 | 8 | #endif /* __PAGE_RANK_H__ */ 9 | -------------------------------------------------------------------------------- /HW3/part2/page_rank/ref_pr.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW3/part2/page_rank/ref_pr.a -------------------------------------------------------------------------------- /HW3/part2/page_rank/report.txt: -------------------------------------------------------------------------------- 1 | Max system threads = 4 2 | Running with 4 threads 3 | 4 | Graph: soc-livejournal1_68m.graph 5 | ref_time: 6.61132s 6 | stu_time: 7.6062s 7 | 8 | Graph: com-orkut_117m.graph 9 | ref_time: 5.02033s 10 | stu_time: 5.51803s 11 | 12 | Graph: rmat_200m.graph 13 | ref_time: 33.6868s 14 | stu_time: 34.6728s 15 | 16 | Graph: random_500m.graph 17 | ref_time: 180.839s 18 | stu_time: 156.458s 19 | 20 | 21 | ------------------------------------------- 22 | SCORES : 23 | ------------------------------------------- 24 | soc-livejournal1_68m.graph | 4.00000 / 4 | 25 | ------------------------------------------- 26 | com-orkut_117m.graph | 4.00000 / 4 | 27 | 
------------------------------------------- 28 | rmat_200m.graph | 4.00000 / 4 | 29 | ------------------------------------------- 30 | random_500m.graph | 4.00000 / 4 | 31 | ------------------------------------------- 32 | TOTAL | 16.00000 / 16 | 33 | ------------------------------------------- 34 | -------------------------------------------------------------------------------- /HW3/part2/tools/Makefile: -------------------------------------------------------------------------------- 1 | BINARYNAME=graphTools 2 | 3 | main: 4 | g++ -std=c++11 -g -O3 -o ${BINARYNAME} graphTools.cpp ../common/graph.cpp 5 | clean: 6 | rm -rf pr *~ *.*~ ${BINARYNAME} 7 | -------------------------------------------------------------------------------- /HW3/part2/tools/plaintext.graph: -------------------------------------------------------------------------------- 1 | AdjacencyGraph 2 | # num vertices 3 | 5 4 | # num edges 5 | 8 6 | # edge starts 7 | 0 4 6 7 8 8 | # all the outgoing edges (target vertex) 9 | 1 2 3 4 10 | 2 3 11 | 0 12 | 0 13 | -------------------------------------------------------------------------------- /HW3/submit/bfs.cpp: -------------------------------------------------------------------------------- 1 | ../part2/breadth_first_search/bfs.cpp -------------------------------------------------------------------------------- /HW3/submit/cg_impl.c: -------------------------------------------------------------------------------- 1 | ../part1/cg_impl.c -------------------------------------------------------------------------------- /HW3/submit/page_rank.cpp: -------------------------------------------------------------------------------- 1 | ../part2/page_rank/page_rank.cpp -------------------------------------------------------------------------------- /HW4/part1/Makefile: -------------------------------------------------------------------------------- 1 | TARGET := mpi_hello pi_block_linear pi_block_tree pi_nonblock_linear pi_gather pi_reduce 2 | 3 | MPI_HELLO_C_FILES 
= hello.cc 4 | PI_BLOCK_LINEAR_SRC_FILES = pi_block_linear.cc 5 | PI_BLOCK_TREE_SRC_FILES = pi_block_tree.cc 6 | PI_NONBLOCK_LINEAR_SRC_FILES = pi_nonblock_linear.cc 7 | PI_GATHER_SRC_FILES = pi_gather.cc 8 | PI_REDUCE_SRC_FILES = pi_reduce.cc 9 | 10 | all: $(TARGET) 11 | # Copy to all hosts 12 | parallel-scp -A -r -h ../setting/hosts.txt ~/HW4 ~ 13 | 14 | mpi_hello: $(MPI_HELLO_C_FILES) 15 | # Compile 16 | mpicxx $< -o $@ 17 | 18 | pi_block_linear: $(PI_BLOCK_LINEAR_SRC_FILES) 19 | mpicxx $< -o $@ 20 | 21 | pi_block_tree: $(PI_BLOCK_TREE_SRC_FILES) 22 | mpicxx $< -o $@ 23 | 24 | pi_nonblock_linear: $(PI_NONBLOCK_LINEAR_SRC_FILES) 25 | mpicxx $< -o $@ 26 | 27 | pi_gather: $(PI_GATHER_SRC_FILES) 28 | mpicxx $< -o $@ 29 | 30 | pi_reduce: $(PI_REDUCE_SRC_FILES) 31 | mpicxx $< -o $@ 32 | 33 | .PHONY: clean 34 | clean: 35 | rm -f *.o $(TARGET) 36 | 37 | .PHONY: report 38 | report: $(TARGET) 39 | python3 ./test.py 40 | -------------------------------------------------------------------------------- /HW4/part1/hello.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int main(int argc, char **argv) 5 | { 6 | // Initialize the MPI environment. The two arguments to MPI Init are not 7 | // currently used by MPI implementations, but are there in case future 8 | // implementations might need the arguments. 
9 | MPI_Init(NULL, NULL); 10 | 11 | // TODO: Get the number of processes 12 | int world_size; 13 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 14 | 15 | // TODO: Get the rank of the process 16 | int world_rank; 17 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 18 | 19 | // Get the name of the processor 20 | char processor_name[MPI_MAX_PROCESSOR_NAME]; 21 | int name_len; 22 | MPI_Get_processor_name(processor_name, &name_len); 23 | 24 | // Print off a hello world message 25 | printf("Hello world from processor %s, rank %d out of %d processors\n", 26 | processor_name, world_rank, world_size); 27 | 28 | // Finalize the MPI environment. No more MPI calls can be made after this 29 | MPI_Finalize(); 30 | return 0; 31 | } -------------------------------------------------------------------------------- /HW4/part1/hosts_mpi.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=8 2 | pp3 slots=4 -------------------------------------------------------------------------------- /HW4/part1/hosts_part1.txt: -------------------------------------------------------------------------------- 1 | pp7 2 | pp2 3 | pp3 4 | pp4 5 | pp5 -------------------------------------------------------------------------------- /HW4/part1/pi_block_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | 
distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: init MPI 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | if (world_rank > 0) 47 | { 48 | // TODO: handle workers 49 | tosses /= world_size; 50 | tf_estimate_pi(tosses, world_rank); 51 | 52 | MPI_Send(&hit, 53 | 1, 54 | MPI_LONG_LONG, 55 | 0, 56 | 0, 57 | MPI_COMM_WORLD); 58 | } 59 | else if (world_rank == 0) 60 | { 61 | // TODO: master 62 | long long int part_tosses; 63 | 64 | part_tosses = (tosses / world_size) + (tosses % world_size); 65 | tf_estimate_pi(part_tosses, world_rank); 66 | 67 | for (int i = 1; i < world_size; ++i) { 68 | long long int holder; 69 | 70 | MPI_Recv(&holder, 71 | 1, 72 | MPI_LONG_LONG, 73 | i, 74 | 0, 75 | MPI_COMM_WORLD, 76 | MPI_STATUS_IGNORE); 77 | 78 | hit += holder; 79 | } 80 | } 81 | 82 | if (world_rank == 0) 83 | { 84 | // TODO: process PI result 85 | pi_result = 4 * hit / ((double)tosses); 86 | 87 | // --- DON'T TOUCH --- 88 | double end_time = MPI_Wtime(); 89 | printf("%lf\n", pi_result); 90 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 91 | // --- 92 | } 93 | 94 | MPI_Finalize(); 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /HW4/part1/pi_block_tree.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - 
min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | long long int part_tosses; 43 | int is_buddy_master; 44 | int buddy_rank; 45 | int buddy_layer = 0; 46 | 47 | // TODO: MPI init 48 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 49 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 50 | 51 | // TODO: binary tree redunction 52 | part_tosses = tosses / world_size; 53 | if (world_rank == 0) { 54 | part_tosses += (tosses % world_size); 55 | } 56 | 57 | tf_estimate_pi(part_tosses, world_rank); 58 | 59 | world_size >>= 1; 60 | 61 | while (world_size) { 62 | buddy_rank = world_rank ^ (1 << buddy_layer); 63 | is_buddy_master = !(world_rank & (1 << buddy_layer)); 64 | 65 | if (is_buddy_master) { 66 | // Recv 67 | long long int holder; 68 | 69 | MPI_Recv(&holder, 70 | 1, 71 | MPI_LONG_LONG, 72 | buddy_rank, 73 | 0, 74 | MPI_COMM_WORLD, 75 | MPI_STATUS_IGNORE); 76 | 77 | hit += holder; 78 | } else { 79 | // Send 80 | MPI_Send(&hit, 81 | 1, 82 | MPI_LONG_LONG, 83 | buddy_rank, 84 | 0, 85 | MPI_COMM_WORLD); 86 | break; 87 | } 88 | 89 | ++buddy_layer; 90 | world_size >>= 1; 91 | } 92 | 93 | if (world_rank == 0) 94 | { 95 | // TODO: PI result 96 | pi_result = 4 * hit / ((double)tosses); 97 | 98 | // --- DON'T TOUCH --- 99 | double end_time = MPI_Wtime(); 100 | printf("%lf\n", pi_result); 101 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 
102 | // --- 103 | } 104 | 105 | MPI_Finalize(); 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /HW4/part1/pi_gather.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | // TODO: use MPI_Gather 47 | long long int part_tosses; 48 | long long int hits[world_size]; 49 | 50 | part_tosses = tosses / world_size; 51 | if (world_rank == 0) { 52 | part_tosses += (tosses % world_size); 53 | } 54 | 55 | tf_estimate_pi(part_tosses, world_rank); 56 | 57 | MPI_Gather(&hit, 1, MPI_LONG_LONG, hits, 1, MPI_LONG_LONG, 0, MPI_COMM_WORLD); 58 | 59 | if (world_rank == 0) { 60 | for (int i = 1; i < world_size; ++i) { 61 | hit += hits[i]; 62 | } 63 | 64 | // TODO: PI result 65 | pi_result = 4 * hit / ((double)tosses); 66 | 67 | // --- DON'T TOUCH --- 68 | double end_time = MPI_Wtime(); 69 | 
printf("%lf\n", pi_result); 70 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 71 | // --- 72 | } 73 | 74 | MPI_Finalize(); 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /HW4/part1/pi_nonblock_linear.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | if (world_rank > 0) 47 | { 48 | // TODO: MPI workers 49 | tosses /= world_size; 50 | tf_estimate_pi(tosses, world_rank); 51 | 52 | MPI_Send(&hit, 53 | 1, 54 | MPI_LONG_LONG, 55 | 0, 56 | 0, 57 | MPI_COMM_WORLD); 58 | } 59 | else if (world_rank == 0) 60 | { 61 | // TODO: non-blocking MPI communication. 62 | // Use MPI_Irecv, MPI_Wait or MPI_Waitall. 
63 | MPI_Request requests[world_size]; 64 | long long int holders[world_size]; 65 | long long int part_tosses; 66 | 67 | part_tosses = (tosses / world_size) + (tosses % world_size); 68 | tf_estimate_pi(part_tosses, world_rank); 69 | 70 | for (int i = 1; i < world_size; ++i) { 71 | MPI_Irecv(&holders[i], 72 | 1, 73 | MPI_LONG_LONG, 74 | i, 75 | 0, 76 | MPI_COMM_WORLD, 77 | &requests[i]); 78 | } 79 | 80 | MPI_Waitall(world_size - 1, &requests[1], MPI_STATUSES_IGNORE); 81 | 82 | for (int i = 1; i < world_size; ++i) { 83 | hit += holders[i]; 84 | } 85 | } 86 | 87 | if (world_rank == 0) 88 | { 89 | // TODO: PI result 90 | pi_result = 4 * hit / ((double)tosses); 91 | 92 | // --- DON'T TOUCH --- 93 | double end_time = MPI_Wtime(); 94 | printf("%lf\n", pi_result); 95 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 96 | // --- 97 | } 98 | 99 | MPI_Finalize(); 100 | return 0; 101 | } 102 | -------------------------------------------------------------------------------- /HW4/part1/pi_reduce.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #pragma GCC optimize ("O3") 9 | 10 | long long int hit; 11 | 12 | double rand_get(double min, double max, unsigned int *seed) 13 | { 14 | int r = rand_r(seed); 15 | return min + ((double)(r) / RAND_MAX) * (max - min); 16 | } 17 | 18 | void tf_estimate_pi(long long int tosses_cnt, int rank) 19 | { 20 | unsigned int seed = getpid() ^ time(NULL) ^ (0xaed01498 << rank); 21 | 22 | for (long long int toss = 0; toss < tosses_cnt; toss ++) { 23 | double x, y, distance_squared; 24 | x = rand_get(-1, 1, &seed); 25 | y = rand_get(-1, 1, &seed); 26 | distance_squared = x * x + y * y; 27 | if (distance_squared <= 1) 28 | hit++; 29 | } 30 | } 31 | 32 | int main(int argc, char **argv) 33 | { 34 | // --- DON'T TOUCH --- 35 | MPI_Init(&argc, &argv); 36 | double start_time = MPI_Wtime(); 37 | double pi_result; 38 | long 
long int tosses = atoi(argv[1]); 39 | int world_rank, world_size; 40 | // --- 41 | 42 | // TODO: MPI init 43 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | 46 | // TODO: use MPI_Reduce 47 | long long int part_tosses; 48 | long long int total_hit; 49 | 50 | part_tosses = tosses / world_size; 51 | if (world_rank == 0) { 52 | part_tosses += (tosses % world_size); 53 | } 54 | 55 | tf_estimate_pi(part_tosses, world_rank); 56 | 57 | MPI_Reduce(&hit, &total_hit, 1, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD); 58 | 59 | if (world_rank == 0) 60 | { 61 | // TODO: PI result 62 | pi_result = 4 * total_hit / ((double)tosses); 63 | 64 | // --- DON'T TOUCH --- 65 | double end_time = MPI_Wtime(); 66 | printf("%lf\n", pi_result); 67 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 68 | // --- 69 | } 70 | 71 | MPI_Finalize(); 72 | return 0; 73 | } 74 | -------------------------------------------------------------------------------- /HW4/part1/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | 4 | def test_mpi_hello(np=8): 5 | cmd = ['mpirun', '-np', str(np), '--hostfile', 'hosts_mpi.txt', 'mpi_hello'] 6 | 7 | with open('report.txt', "a") as report: 8 | report.write('mpirun -np {} --hostfile hosts_mpi.txt mpi_hello\n'.format(np)) 9 | report.flush() 10 | subprocess.run(cmd, stdout=report) 11 | report.write('\n') 12 | report.flush() 13 | 14 | def test_pi(np=8, pi_block_linear='pi_block_linear'): 15 | cmd = ['mpirun', '-np', str(np), '--hostfile', 'hosts_part1.txt', pi_block_linear, '1000000000'] 16 | 17 | with open('report.txt', "a") as report: 18 | report.write('mpirun -np {} --hostfile hosts_part1.txt {} 1000000000\n'.format(np, pi_block_linear)) 19 | report.flush() 20 | subprocess.run(cmd, stdout=report) 21 | report.write('\n') 22 | report.flush() 23 | 24 | with open('report.txt', "w") as report: 25 | report.write('') 26 | 
27 | test_mpi_hello(8) 28 | test_mpi_hello(10) 29 | 30 | test_pi(2, 'pi_block_linear') 31 | test_pi(2, '/HW4/ref/pi_block_linear') 32 | test_pi(4, 'pi_block_linear') 33 | test_pi(4, '/HW4/ref/pi_block_linear') 34 | test_pi(8, 'pi_block_linear') 35 | test_pi(8, '/HW4/ref/pi_block_linear') 36 | test_pi(12, 'pi_block_linear') 37 | test_pi(12, '/HW4/ref/pi_block_linear') 38 | test_pi(16, 'pi_block_linear') 39 | test_pi(16, '/HW4/ref/pi_block_linear') 40 | 41 | test_pi(2, 'pi_block_tree') 42 | test_pi(2, '/HW4/ref/pi_block_tree') 43 | test_pi(4, 'pi_block_tree') 44 | test_pi(4, '/HW4/ref/pi_block_tree') 45 | test_pi(8, 'pi_block_tree') 46 | test_pi(8, '/HW4/ref/pi_block_tree') 47 | test_pi(16, 'pi_block_tree') 48 | test_pi(16, '/HW4/ref/pi_block_tree') 49 | 50 | test_pi(2, 'pi_nonblock_linear') 51 | test_pi(2, '/HW4/ref/pi_nonblock_linear') 52 | test_pi(4, 'pi_nonblock_linear') 53 | test_pi(4, '/HW4/ref/pi_nonblock_linear') 54 | test_pi(8, 'pi_nonblock_linear') 55 | test_pi(8, '/HW4/ref/pi_nonblock_linear') 56 | test_pi(12, 'pi_nonblock_linear') 57 | test_pi(12, '/HW4/ref/pi_nonblock_linear') 58 | test_pi(16, 'pi_nonblock_linear') 59 | test_pi(16, '/HW4/ref/pi_nonblock_linear') 60 | 61 | test_pi(2, 'pi_gather') 62 | test_pi(2, '/HW4/ref/pi_gather') 63 | test_pi(4, 'pi_gather') 64 | test_pi(4, '/HW4/ref/pi_gather') 65 | test_pi(8, 'pi_gather') 66 | test_pi(8, '/HW4/ref/pi_gather') 67 | test_pi(12, 'pi_gather') 68 | test_pi(12, '/HW4/ref/pi_gather') 69 | test_pi(16, 'pi_gather') 70 | test_pi(16, '/HW4/ref/pi_gather') 71 | 72 | test_pi(2, 'pi_reduce') 73 | test_pi(2, '/HW4/ref/pi_reduce') 74 | test_pi(4, 'pi_reduce') 75 | test_pi(4, '/HW4/ref/pi_reduce') 76 | test_pi(8, 'pi_reduce') 77 | test_pi(8, '/HW4/ref/pi_reduce') 78 | test_pi(12, 'pi_reduce') 79 | test_pi(12, '/HW4/ref/pi_reduce') 80 | test_pi(16, 'pi_reduce') 81 | test_pi(16, '/HW4/ref/pi_reduce') 82 | -------------------------------------------------------------------------------- /HW4/part2/Makefile: 
-------------------------------------------------------------------------------- 1 | TARGET := matmul 2 | 3 | CC_FILES = $(wildcard *.cc) 4 | O_FILES = $(CC_FILES:%.cc=%.o) 5 | 6 | all: $(TARGET) 7 | 8 | $(TARGET): $(O_FILES) 9 | mpicxx $^ -o $@ 10 | 11 | %.o: %.cc 12 | mpicxx -O3 -c $< -o $@ -Wall 13 | 14 | .PHONY: clean 15 | clean: 16 | rm -f *.o $(TARGET) 17 | 18 | .PHONY: sync 19 | sync: all 20 | # Copy to all hosts 21 | parallel-scp -A -r -h ../setting/hosts.txt ~/HW4 ~ 22 | 23 | .PHONY: report 24 | report: sync 25 | python3 ./test.py -------------------------------------------------------------------------------- /HW4/part2/hosts_part2_4slots.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=1 2 | pp3 slots=1 3 | pp5 slots=1 4 | pp7 slots=1 -------------------------------------------------------------------------------- /HW4/part2/hosts_part2_7slots.txt: -------------------------------------------------------------------------------- 1 | pp2 slots=1 2 | pp3 slots=1 3 | pp4 slots=1 4 | pp5 slots=1 5 | pp6 slots=1 6 | pp7 slots=1 7 | pp8 slots=1 -------------------------------------------------------------------------------- /HW4/part2/main.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // #define DEBUG 5 | 6 | // ********************************************* 7 | // ** ATTENTION: YOU CANNOT MODIFY THIS FILE. 
** 8 | // ********************************************* 9 | 10 | // Read size of matrix_a and matrix_b (n, m, l) and whole data of matrixes from stdin 11 | // 12 | // n_ptr: pointer to n 13 | // m_ptr: pointer to m 14 | // l_ptr: pointer to l 15 | // a_mat_ptr: pointer to matrix a (a should be a continuous memory space for placing n * m elements of int) 16 | // b_mat_ptr: pointer to matrix b (b should be a continuous memory space for placing m * l elements of int) 17 | void construct_matrices(int *n_ptr, int *m_ptr, int *l_ptr, 18 | int **a_mat_ptr, int **b_mat_ptr); 19 | 20 | // Just matrix multiplication (your should output the result in this function) 21 | // 22 | // n: row number of matrix a 23 | // m: col number of matrix a / row number of matrix b 24 | // l: col number of matrix b 25 | // a_mat: a continuous memory placing n * m elements of int 26 | // b_mat: a continuous memory placing m * l elements of int 27 | void matrix_multiply(const int n, const int m, const int l, 28 | const int *a_mat, const int *b_mat); 29 | 30 | // Remember to release your allocated memory 31 | void destruct_matrices(int *a_mat, int *b_mat); 32 | 33 | int main () { 34 | int n, m, l; 35 | int *a_mat, *b_mat; 36 | 37 | #ifdef DEBUG 38 | int world_rank, world_size; 39 | #endif 40 | 41 | MPI_Init(NULL, NULL); 42 | 43 | #ifdef DEBUG 44 | MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); 45 | MPI_Comm_size(MPI_COMM_WORLD, &world_size); 46 | #endif 47 | 48 | double start_time = MPI_Wtime(); 49 | 50 | construct_matrices(&n, &m, &l, &a_mat, &b_mat); 51 | matrix_multiply(n, m, l, a_mat, b_mat); 52 | destruct_matrices(a_mat, b_mat); 53 | 54 | double end_time = MPI_Wtime(); 55 | MPI_Finalize(); 56 | 57 | #ifndef DEBUG 58 | printf("MPI running time: %lf Seconds\n", end_time - start_time); 59 | #else 60 | printf("[%d] MPI running time: %lf Seconds\n", world_rank, end_time - start_time); 61 | #endif 62 | 63 | return 0; 64 | } 65 | 
-------------------------------------------------------------------------------- /HW4/part2/report.txt: -------------------------------------------------------------------------------- 1 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_1 2 | MPI running time: 0.000088 Seconds 3 | 4 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_2 5 | MPI running time: 0.000116 Seconds 6 | 7 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data1_1 8 | MPI running time: 0.150457 Seconds 9 | 10 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_1 11 | MPI running time: 3.448961 Seconds 12 | 13 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_8 14 | MPI running time: 3.539633 Seconds 15 | 16 | mpirun -np 1 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_10 17 | MPI running time: 3.038098 Seconds 18 | 19 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_1 20 | MPI running time: 0.032685 Seconds 21 | MPI running time: 0.032888 Seconds 22 | MPI running time: 0.032859 Seconds 23 | MPI running time: 0.032206 Seconds 24 | 25 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < ./testdata/data0_2 26 | MPI running time: 0.032295 Seconds 27 | MPI running time: 0.032178 Seconds 28 | MPI running time: 0.032376 Seconds 29 | MPI running time: 0.032043 Seconds 30 | 31 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data1_1 32 | MPI running time: 0.175286 Seconds 33 | MPI running time: 0.175577 Seconds 34 | MPI running time: 0.175579 Seconds 35 | MPI running time: 0.175084 Seconds 36 | 37 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_1 38 | MPI running time: 2.925186 Seconds 39 | MPI running time: 2.927498 Seconds 40 | MPI running time: 2.925823 Seconds 41 | MPI running time: 2.925206 Seconds 42 | 43 
| mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_8 44 | MPI running time: 2.916394 Seconds 45 | MPI running time: 2.916054 Seconds 46 | MPI running time: 2.916057 Seconds 47 | MPI running time: 2.917237 Seconds 48 | 49 | mpirun -np 4 --hostfile hosts_part2_4slots.txt matmul < /home/.grade/HW4/data-set/data2_10 50 | MPI running time: 2.470314 Seconds 51 | MPI running time: 2.469457 Seconds 52 | MPI running time: 2.469103 Seconds 53 | MPI running time: 2.469243 Seconds 54 | 55 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < ./testdata/data0_1 56 | MPI running time: 0.037167 Seconds 57 | MPI running time: 0.037349 Seconds 58 | MPI running time: 0.037437 Seconds 59 | MPI running time: 0.036933 Seconds 60 | MPI running time: 0.038714 Seconds 61 | MPI running time: 0.038760 Seconds 62 | MPI running time: 0.036833 Seconds 63 | 64 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < ./testdata/data0_2 65 | MPI running time: 0.035229 Seconds 66 | MPI running time: 0.035061 Seconds 67 | MPI running time: 0.034967 Seconds 68 | MPI running time: 0.034978 Seconds 69 | MPI running time: 0.034990 Seconds 70 | MPI running time: 0.034789 Seconds 71 | MPI running time: 0.034828 Seconds 72 | 73 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data1_1 74 | MPI running time: 0.278344 Seconds 75 | MPI running time: 0.278426 Seconds 76 | MPI running time: 0.277971 Seconds 77 | MPI running time: 0.279104 Seconds 78 | MPI running time: 0.278362 Seconds 79 | MPI running time: 0.278067 Seconds 80 | MPI running time: 0.278558 Seconds 81 | 82 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_1 83 | MPI running time: 3.072552 Seconds 84 | MPI running time: 3.072336 Seconds 85 | MPI running time: 3.072476 Seconds 86 | MPI running time: 3.072225 Seconds 87 | MPI running time: 3.072801 Seconds 88 | MPI running time: 3.072261 Seconds 89 | MPI running time: 3.073843 
Seconds 90 | 91 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_8 92 | MPI running time: 3.054006 Seconds 93 | MPI running time: 3.054290 Seconds 94 | MPI running time: 3.054202 Seconds 95 | MPI running time: 3.054022 Seconds 96 | MPI running time: 3.053785 Seconds 97 | MPI running time: 3.053801 Seconds 98 | MPI running time: 3.055214 Seconds 99 | 100 | mpirun -np 7 --hostfile hosts_part2_7slots.txt matmul < /home/.grade/HW4/data-set/data2_10 101 | MPI running time: 2.695241 Seconds 102 | MPI running time: 2.695270 Seconds 103 | MPI running time: 2.695297 Seconds 104 | MPI running time: 2.694974 Seconds 105 | MPI running time: 2.695059 Seconds 106 | MPI running time: 2.695294 Seconds 107 | MPI running time: 2.697507 Seconds 108 | 109 | -------------------------------------------------------------------------------- /HW4/part2/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import subprocess 3 | 4 | def test(np=8, hostfile='hosts_part2_4slots.txt', infile='./testdata/data1_1', ansfile='./testdata/ans1_1'): 5 | cmd = ['mpirun', '-np', str(np), '--hostfile', hostfile, 'matmul'] 6 | 7 | with open(infile, "r") as f: 8 | with subprocess.Popen(cmd, stdin=f, stdout=subprocess.PIPE) as proc: 9 | output = proc.stdout.read() 10 | 11 | with open(ansfile, "rb") as f: 12 | ans = f.read() 13 | 14 | output = output.split(b'MPI running time', 1) 15 | myans = output[0] 16 | runtime = b'MPI running time' + output[1] 17 | 18 | with open('report.txt', "a") as report: 19 | report.write('mpirun -np {} --hostfile {} matmul < {}\n'.format(np, hostfile, infile)) 20 | 21 | if ans != myans: 22 | report.write('[WA]\n') 23 | else: 24 | report.write(runtime.decode()) 25 | 26 | report.write('\n') 27 | 28 | with open('report.txt', "w") as report: 29 | report.write('') 30 | 31 | test(1, 'hosts_part2_4slots.txt', './testdata/data0_1', './testdata/ans0_1') 32 | test(1, 
'hosts_part2_4slots.txt', './testdata/data0_2', './testdata/ans0_2') 33 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 34 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 35 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 36 | test(1, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 37 | 38 | test(4, 'hosts_part2_4slots.txt', './testdata/data0_1', './testdata/ans0_1') 39 | test(4, 'hosts_part2_4slots.txt', './testdata/data0_2', './testdata/ans0_2') 40 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 41 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 42 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 43 | test(4, 'hosts_part2_4slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 44 | 45 | test(7, 'hosts_part2_7slots.txt', './testdata/data0_1', './testdata/ans0_1') 46 | test(7, 'hosts_part2_7slots.txt', './testdata/data0_2', './testdata/ans0_2') 47 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data1_1', '/home/.grade/HW4/data-set/ans1_1') 48 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_1', '/home/.grade/HW4/data-set/ans2_1') 49 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_8', '/home/.grade/HW4/data-set/ans2_8') 50 | test(7, 'hosts_part2_7slots.txt', '/home/.grade/HW4/data-set/data2_10', '/home/.grade/HW4/data-set/ans2_10') 51 | -------------------------------------------------------------------------------- /HW4/part2/testdata/ans0_1: -------------------------------------------------------------------------------- 1 | 1 8 5 2 | 0 9 6 3 | 4 23 14 4 | 
-------------------------------------------------------------------------------- /HW4/part2/testdata/ans0_2: -------------------------------------------------------------------------------- 1 | 32 40 76 52 32 48 32 68 2 | 36 49 101 70 48 41 27 64 3 | 33 42 93 60 39 33 21 42 4 | 37 54 114 81 59 35 23 63 5 | 10 14 28 20 14 12 8 20 6 | 44 43 108 55 24 45 27 21 7 | 59 64 150 85 45 63 39 51 8 | 58 58 130 70 30 76 48 62 9 | 57 72 150 99 63 69 45 93 10 | 16 17 42 23 12 15 9 9 11 | -------------------------------------------------------------------------------- /HW4/part2/testdata/data0_1: -------------------------------------------------------------------------------- 1 | 3 2 3 2 | 1 2 3 | 0 3 4 | 4 5 5 | 1 2 1 6 | 0 3 2 -------------------------------------------------------------------------------- /HW4/part2/testdata/data0_2: -------------------------------------------------------------------------------- 1 | 10 3 8 2 | 8 4 0 3 | 6 7 1 4 | 3 6 3 5 | 5 9 1 6 | 2 2 0 7 | 0 4 9 8 | 3 7 9 9 | 6 4 8 10 | 9 9 3 11 | 0 2 3 12 | 3 3 5 3 1 6 4 7 13 | 2 4 9 7 6 0 0 3 14 | 4 3 8 3 0 5 3 1 15 | -------------------------------------------------------------------------------- /HW4/setting/config: -------------------------------------------------------------------------------- 1 | Host pp2 2 | HostName 192.168.202.2 3 | User 310555003 4 | 5 | Host pp3 6 | HostName 192.168.202.3 7 | User 310555003 8 | 9 | Host pp4 10 | HostName 192.168.202.4 11 | User 310555003 12 | 13 | Host pp5 14 | HostName 192.168.202.5 15 | User 310555003 16 | 17 | Host pp6 18 | HostName 192.168.202.6 19 | User 310555003 20 | 21 | Host pp7 22 | HostName 192.168.202.7 23 | User 310555003 24 | 25 | Host pp8 26 | HostName 192.168.202.8 27 | User 310555003 28 | 29 | Host pp10 30 | HostName 192.168.202.10 31 | User 310555003 -------------------------------------------------------------------------------- /HW4/setting/hosts.txt: -------------------------------------------------------------------------------- 1 | 
192.168.202.2 2 | 192.168.202.3 3 | 192.168.202.4 4 | 192.168.202.5 5 | 192.168.202.6 6 | 192.168.202.7 7 | 192.168.202.8 8 | 192.168.202.10 -------------------------------------------------------------------------------- /HW4/submit/part1/hello.cc: -------------------------------------------------------------------------------- 1 | ../../part1/hello.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_block_linear.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_block_linear.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_block_tree.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_block_tree.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_gather.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_gather.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_nonblock_linear.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_nonblock_linear.cc -------------------------------------------------------------------------------- /HW4/submit/part1/pi_reduce.cc: -------------------------------------------------------------------------------- 1 | ../../part1/pi_reduce.cc -------------------------------------------------------------------------------- /HW4/submit/part2/Makefile: -------------------------------------------------------------------------------- 1 | ../../part2/Makefile -------------------------------------------------------------------------------- /HW4/submit/part2/main.cc: -------------------------------------------------------------------------------- 1 | ../../part2/main.cc 
-------------------------------------------------------------------------------- /HW4/submit/part2/matrix.cc: -------------------------------------------------------------------------------- 1 | ../../part2/matrix.cc -------------------------------------------------------------------------------- /HW4/submit/url.txt: -------------------------------------------------------------------------------- 1 | https://hackmd.io/@LJP/S1kpIg5Io -------------------------------------------------------------------------------- /HW5/Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | CXX=g++ 3 | CXXFLAGS=-I./common -Iobjs/ -O3 -std=c++17 -Wall -g -fPIC -lm 4 | 5 | APP_NAME=mandelbrot 6 | OBJDIR=objs 7 | COMMONDIR=./common 8 | 9 | CUDA_LINK_FLAGS = -rdc=true -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC' 10 | CUDA_COMPILE_FLAGS = --device-c -gencode=arch=compute_61,code=sm_61 -Xcompiler '-fPIC' -g -O3 11 | 12 | PPM_CXX=$(COMMONDIR)/ppm.cpp 13 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 14 | 15 | 16 | default: $(APP_NAME) 17 | 18 | .PHONY: dirs clean 19 | 20 | dirs: 21 | /bin/mkdir -p $(OBJDIR)/ 22 | 23 | clean: 24 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 25 | 26 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/kernel1.o $(OBJDIR)/kernel2.o $(OBJDIR)/kernel3.o $(OBJDIR)/kernel4.o \ 27 | $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ) 28 | 29 | $(APP_NAME): dirs $(OBJS) 30 | $(NVCC) ${CUDA_LINK_FLAGS} -o $@ $(OBJS) mandelbrotThreadRef.a 31 | 32 | $(OBJDIR)/%.o: %.cpp 33 | $(CXX) $< $(CXXFLAGS) -c -o $@ 34 | 35 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 36 | $(CXX) $< $(CXXFLAGS) -c -o $@ 37 | 38 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h kernel.h 39 | 40 | $(OBJDIR)/kernel1.o : kernel1.cu kernel.h 41 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 42 | 43 | $(OBJDIR)/kernel2.o : kernel2.cu kernel.h 44 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 45 | 46 | $(OBJDIR)/kernel3.o : 
kernel3.cu kernel.h 47 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 48 | 49 | $(OBJDIR)/kernel4.o : kernel4.cu kernel.h 50 | ${NVCC} ${CUDA_COMPILE_FLAGS} -c $< -o $@ 51 | -------------------------------------------------------------------------------- /HW5/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 
44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 
// Conversion factor from ticks to seconds.  The probe below runs once
// (sysctl on macOS, QueryPerformanceFrequency on Windows, /proc/cpuinfo
// parsing elsewhere) and the result is cached in a function-local static.
static double secondsPerTick() {
  static bool initialized = false;
  static double secondsPerTick_val;
  if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
  int args[] = {CTL_HW, HW_CPU_FREQ};
  unsigned int Hz;
  size_t len = sizeof(Hz);
  if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
    fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
    exit(-1);
  }
  secondsPerTick_val = 1.0 / (double) Hz;
#else
  mach_timebase_info_data_t time_info;
  mach_timebase_info(&time_info);

  // Scales to nanoseconds without 1e-9f
  secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
    static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
  LARGE_INTEGER qwTicksPerSec;
  QueryPerformanceFrequency(&qwTicksPerSec);
  secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#else
  FILE *fp = fopen("/proc/cpuinfo","r");
  char input[1024];
  if (!fp) {
    fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
    exit(-1);
  }
  // In case we don't find it, e.g. on the N900
  secondsPerTick_val = 1e-9;
  while (!feof(fp) && fgets(input, 1024, fp)) {
    // NOTE(boulos): Because reading cpuinfo depends on dynamic
    // frequency scaling it's better to read the @ sign first
    float GHz, MHz;
    if (strstr(input, "model name")) {
      char* at_sign = strstr(input, "@");
      if (at_sign) {
        char* after_at = at_sign + 1;
        char* GHz_str = strstr(after_at, "GHz");
        char* MHz_str = strstr(after_at, "MHz");
        if (GHz_str) {
          *GHz_str = '\0';
          if (1 == sscanf(after_at, "%f", &GHz)) {
            //printf("GHz = %f\n", GHz);
            secondsPerTick_val = 1e-9f / GHz;
            break;
          }
        } else if (MHz_str) {
          *MHz_str = '\0';
          if (1 == sscanf(after_at, "%f", &MHz)) {
            //printf("MHz = %f\n", MHz);
            // BUG FIX: this branch originally computed "1e-6f / GHz",
            // dividing by the *uninitialized* GHz local; the value just
            // parsed from the "... @ N MHz" string is MHz.
            secondsPerTick_val = 1e-6f / MHz;
            break;
          }
        }
      }
    } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
      //printf("MHz = %f\n", MHz);
      secondsPerTick_val = 1e-6f / MHz;
      break;
    }
  }
  fclose(fp);
#endif

  initialized = true;
  return secondsPerTick_val;
}

//////////
// Return the conversion from ticks to milliseconds.
169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /HW5/common/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | 8 | void 9 | writePPMImage(int* data, int width, int height, const char *filename, int maxIterations) 10 | { 11 | FILE *fp = fopen(filename, "wb"); 12 | 13 | // write ppm header 14 | fprintf(fp, "P6\n"); 15 | fprintf(fp, "%d %d\n", width, height); 16 | fprintf(fp, "255\n"); 17 | 18 | for (int i = 0; i < width*height; ++i) { 19 | 20 | // Clamp iteration count for this pixel, then scale the value 21 | // to 0-1 range. Raise resulting value to a power (<1) to 22 | // increase brightness of low iteration count 23 | // pixels. a.k.a. Make things look cooler. 24 | 25 | float mapped = pow( std::min(static_cast(maxIterations), 26 | static_cast(data[i])) / 256.f, .5f); 27 | 28 | // convert back into 0-255 range, 8-bit channels 29 | unsigned char result = static_cast(255.f * mapped); 30 | for (int j = 0; j < 3; ++j) 31 | fputc(result, fp); 32 | } 33 | fclose(fp); 34 | printf("Wrote image file %s\n", filename); 35 | } 36 | -------------------------------------------------------------------------------- /HW5/kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef KERNEL_H_ 2 | #define KERNEL_H_ 3 | 4 | #define USE_KERNEL 4 5 | 6 | //extern "C" 7 | void hostFE(float uX, float uY, float lX, float lY, int *image, int resX, int resY, int maxIterations); 8 | 9 | #endif /* KERNEL_H_ */ 10 | -------------------------------------------------------------------------------- /HW5/kernel1.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | 
#include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 1 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" << endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult, *result; 76 | float dx, dy; 77 | 78 | cudaInit(); 79 | 80 | cudaStatus = cudaMalloc((void **)&cudaResult, sizeof(int) * resX * resY); 81 | if (cudaStatus != cudaSuccess) { 82 | cerr << "cudaMalloc failed!" 
<< endl; 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | // HW required 87 | result = (int *)malloc(sizeof(int) * resX * resY); 88 | 89 | dx = (upperX - lowerX) / resX; 90 | dy = (upperY - lowerY) / resY; 91 | 92 | dim3 dimGrid(GRID_X, GRID_Y); 93 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 94 | 95 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 96 | 97 | cudaStatus = cudaDeviceSynchronize(); 98 | if (cudaStatus != cudaSuccess) { 99 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 100 | exit(EXIT_FAILURE); 101 | } 102 | 103 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 104 | if (cudaStatus != cudaSuccess) { 105 | cerr << "cudaMemcpy failed!" << endl; 106 | exit(EXIT_FAILURE); 107 | } 108 | 109 | cudaFree(cudaResult); 110 | 111 | // Copy result to output 112 | memcpy(img, result, sizeof(int) * resX * resY); 113 | 114 | free(result); 115 | } 116 | 117 | #endif -------------------------------------------------------------------------------- /HW5/kernel2.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 2 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult, *result; 76 | size_t pitch; 77 | float dx, dy; 78 | 79 | cudaInit(); 80 | 81 | cudaStatus = cudaMallocPitch((void **)&cudaResult, &pitch, sizeof(int) * resX, resY); 82 | if (cudaStatus != cudaSuccess) { 83 | cerr << "cudaMallocPitch failed!" << endl; 84 | exit(EXIT_FAILURE); 85 | } 86 | 87 | // HW required 88 | cudaStatus = cudaHostAlloc((void **)&result, sizeof(int) * resX * resY, cudaHostAllocDefault); 89 | if (cudaStatus != cudaSuccess) { 90 | cerr << "cudaHostAlloc failed!" 
<< endl; 91 | exit(EXIT_FAILURE); 92 | } 93 | 94 | dx = (upperX - lowerX) / resX; 95 | dy = (upperY - lowerY) / resY; 96 | 97 | dim3 dimGrid(GRID_X, GRID_Y); 98 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 99 | 100 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 101 | 102 | cudaStatus = cudaDeviceSynchronize(); 103 | if (cudaStatus != cudaSuccess) { 104 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 105 | exit(EXIT_FAILURE); 106 | } 107 | 108 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 109 | if (cudaStatus != cudaSuccess) { 110 | cerr << "cudaMemcpy failed!" << endl; 111 | exit(EXIT_FAILURE); 112 | } 113 | 114 | cudaFree(cudaResult); 115 | 116 | // Copy result to output 117 | memcpy(img, result, sizeof(int) * resX * resY); 118 | 119 | cudaFreeHost(result); 120 | } 121 | 122 | #endif -------------------------------------------------------------------------------- /HW5/kernel3.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 3 18 | 19 | // Group size: 4 20 | // 1600 * 1200 -> 1600 * 300 21 | #define GRID_X 100 22 | #define GRID_Y 5 23 | #define BLOCK_X 16 24 | #define BLOCK_Y 60 25 | 26 | static int cudaInited; 27 | 28 | void cudaInit() 29 | { 30 | cudaError_t cudaStatus; 31 | 32 | if (cudaInited) 33 | return; 34 | 35 | cudaStatus = cudaSetDevice(0); 36 | if (cudaStatus != cudaSuccess) { 37 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 38 | exit(EXIT_FAILURE); 39 | } 40 | 41 | cudaInited = 1; 42 | } 43 | 44 | __device__ int mandel(float c_re, float c_im, int count) 45 | { 46 | float z_re = c_re, z_im = c_im; 47 | int i; 48 | 49 | for (i = 0; i < count; ++i) { 50 | if (z_re * z_re + z_im * z_im > 4.f) 51 | break; 52 | 53 | float new_re = z_re * z_re - z_im * z_im; 54 | float new_im = 2.f * z_re * z_im; 55 | z_re = c_re + new_re; 56 | z_im = c_im + new_im; 57 | } 58 | 59 | return i; 60 | } 61 | 62 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 63 | { 64 | int i = blockIdx.x * blockDim.x + threadIdx.x; 65 | int j = (blockIdx.y * blockDim.y + threadIdx.y) * 4; 66 | float x = x0 + i * dx; 67 | 68 | #pragma unroll 4 69 | for (int loop = 0; loop < 4; ++loop) { 70 | float y = y0 + (j + loop) * dy; 71 | int index = (j + loop) * gridDim.x * blockDim.x + i; 72 | output[index] = mandel(x, y, maxIterations); 73 | } 74 | } 75 | 76 | // Host front-end function that allocates the memory and launches the GPU kernel 77 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 78 | { 79 | cudaError_t cudaStatus; 80 | int *cudaResult, *result; 81 | size_t pitch; 82 | float dx, dy; 83 | 84 | cudaInit(); 85 | 86 | cudaStatus = cudaMallocPitch((void **)&cudaResult, &pitch, sizeof(int) * resX, resY); 87 | if (cudaStatus != cudaSuccess) { 88 | cerr << "cudaMallocPitch failed!" << endl; 89 | exit(EXIT_FAILURE); 90 | } 91 | 92 | // HW required 93 | cudaStatus = cudaHostAlloc((void **)&result, sizeof(int) * resX * resY, cudaHostAllocDefault); 94 | if (cudaStatus != cudaSuccess) { 95 | cerr << "cudaHostAlloc failed!" 
<< endl; 96 | exit(EXIT_FAILURE); 97 | } 98 | 99 | dx = (upperX - lowerX) / resX; 100 | dy = (upperY - lowerY) / resY; 101 | 102 | dim3 dimGrid(GRID_X, GRID_Y); 103 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 104 | 105 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 106 | 107 | cudaStatus = cudaDeviceSynchronize(); 108 | if (cudaStatus != cudaSuccess) { 109 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" << endl; 110 | exit(EXIT_FAILURE); 111 | } 112 | 113 | cudaStatus = cudaMemcpy(result, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 114 | if (cudaStatus != cudaSuccess) { 115 | cerr << "cudaMemcpy failed!" << endl; 116 | exit(EXIT_FAILURE); 117 | } 118 | 119 | cudaFree(cudaResult); 120 | 121 | // Copy result to output 122 | memcpy(img, result, sizeof(int) * resX * resY); 123 | 124 | cudaFreeHost(result); 125 | } 126 | 127 | #endif -------------------------------------------------------------------------------- /HW5/kernel4.cu: -------------------------------------------------------------------------------- 1 | #ifdef _WIN32 2 | #include 3 | #include 4 | #else 5 | #include 6 | #endif 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "kernel.h" 14 | 15 | using namespace std; 16 | 17 | #if USE_KERNEL == 4 18 | 19 | // 1600 * 1200 20 | #define GRID_X 100 21 | #define GRID_Y 75 22 | #define BLOCK_X 16 23 | #define BLOCK_Y 16 24 | 25 | static int cudaInited; 26 | 27 | void cudaInit() 28 | { 29 | cudaError_t cudaStatus; 30 | 31 | if (cudaInited) 32 | return; 33 | 34 | cudaStatus = cudaSetDevice(0); 35 | if (cudaStatus != cudaSuccess) { 36 | cerr << "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?" 
<< endl; 37 | exit(EXIT_FAILURE); 38 | } 39 | 40 | cudaInited = 1; 41 | } 42 | 43 | __device__ int mandel(float c_re, float c_im, int count) 44 | { 45 | float z_re = c_re, z_im = c_im; 46 | int i; 47 | 48 | for (i = 0; i < count; ++i) { 49 | if (z_re * z_re + z_im * z_im > 4.f) 50 | break; 51 | 52 | float new_re = z_re * z_re - z_im * z_im; 53 | float new_im = 2.f * z_re * z_im; 54 | z_re = c_re + new_re; 55 | z_im = c_im + new_im; 56 | } 57 | 58 | return i; 59 | } 60 | 61 | __global__ void mandelKernel(int *output, float x0, float y0, float dx, float dy, int maxIterations) 62 | { 63 | int i = blockIdx.x * blockDim.x + threadIdx.x; 64 | int j = blockIdx.y * blockDim.y + threadIdx.y; 65 | float x = x0 + i * dx; 66 | float y = y0 + j * dy; 67 | int index = j * gridDim.x * blockDim.x + i; 68 | output[index] = mandel(x, y, maxIterations); 69 | } 70 | 71 | // Host front-end function that allocates the memory and launches the GPU kernel 72 | void hostFE(float upperX, float upperY, float lowerX, float lowerY, int *img, int resX, int resY, int maxIterations) 73 | { 74 | cudaError_t cudaStatus; 75 | int *cudaResult; 76 | float dx, dy; 77 | 78 | cudaInit(); 79 | 80 | cudaStatus = cudaMalloc((void **)&cudaResult, sizeof(int) * resX * resY); 81 | if (cudaStatus != cudaSuccess) { 82 | cerr << "cudaMalloc failed!" << endl; 83 | exit(EXIT_FAILURE); 84 | } 85 | 86 | dx = (upperX - lowerX) / resX; 87 | dy = (upperY - lowerY) / resY; 88 | 89 | dim3 dimGrid(GRID_X, GRID_Y); 90 | dim3 dimBlock(BLOCK_X, BLOCK_Y); 91 | 92 | mandelKernel<<>>(cudaResult, lowerX, lowerY, dx, dy, maxIterations); 93 | 94 | cudaStatus = cudaDeviceSynchronize(); 95 | if (cudaStatus != cudaSuccess) { 96 | cerr << "cudaDeviceSynchronize returned error code " << cudaStatus << " after launching addKernel!" 
<< endl; 97 | exit(EXIT_FAILURE); 98 | } 99 | 100 | cudaStatus = cudaMemcpy(img, cudaResult, sizeof(int) * resX * resY, cudaMemcpyDeviceToHost); 101 | if (cudaStatus != cudaSuccess) { 102 | cerr << "cudaMemcpy failed!" << endl; 103 | exit(EXIT_FAILURE); 104 | } 105 | 106 | cudaFree(cudaResult); 107 | } 108 | 109 | #endif -------------------------------------------------------------------------------- /HW5/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | static inline int mandel(float c_re, float c_im, int count) 4 | { 5 | float z_re = c_re, z_im = c_im; 6 | int i; 7 | for (i = 0; i < count; ++i) 8 | { 9 | 10 | if (z_re * z_re + z_im * z_im > 4.f) 11 | break; 12 | 13 | float new_re = z_re * z_re - z_im * z_im; 14 | float new_im = 2.f * z_re * z_im; 15 | z_re = c_re + new_re; 16 | z_im = c_im + new_im; 17 | } 18 | 19 | return i; 20 | } 21 | 22 | // 23 | // MandelbrotSerial -- 24 | // 25 | // Compute an image visualizing the mandelbrot set. The resulting 26 | // array contains the number of iterations required before the complex 27 | // number corresponding to a pixel could be rejected from the set. 28 | // 29 | // * x0, y0, x1, y1 describe the complex coordinates mapping 30 | // into the image viewport. 
31 | // * width, height describe the size of the output image 32 | // * startRow, totalRows describe how much of the image to compute 33 | void mandelbrotSerial( 34 | float x0, float y0, float x1, float y1, 35 | int width, int height, 36 | int startRow, int totalRows, 37 | int maxIterations, 38 | int output[]) 39 | { 40 | float dx = (x1 - x0) / width; 41 | float dy = (y1 - y0) / height; 42 | 43 | int endRow = startRow + totalRows; 44 | 45 | for (int j = startRow; j < endRow; j++) 46 | { 47 | for (int i = 0; i < width; ++i) 48 | { 49 | float x = x0 + i * dx; 50 | float y = y0 + j * dy; 51 | 52 | int index = (j * width + i); 53 | output[index] = mandel(x, y, maxIterations); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /HW5/mandelbrotThread.cpp: -------------------------------------------------------------------------------- 1 | #include "kernel.h" 2 | 3 | // 4 | // MandelbrotThread -- 5 | // 6 | // Multi-threaded implementation of mandelbrot set image generation. 
7 | // Threads of execution are created by using CUDA 8 | void mandelbrotThread( 9 | float x0, float y0, float x1, float y1, 10 | int width, int height, 11 | int maxIterations, int output[]) 12 | { 13 | hostFE(x1, y1, x0, y0, output, width, height, maxIterations); 14 | } 15 | -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRef.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRef.a -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRef50.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRef50.a -------------------------------------------------------------------------------- /HW5/mandelbrotThreadRefAll.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW5/mandelbrotThreadRefAll.a -------------------------------------------------------------------------------- /HW6/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | long long currentTicks(); 10 | static double secondsPerTick(); 11 | static double currentSeconds(); 12 | 13 | long long currentTicks() 14 | { 15 | unsigned int a, d; 16 | asm volatile("rdtsc" 17 | : "=a"(a), "=d"(d)); 18 | return (unsigned long long)a | (long long)d << 32; 19 | } 20 | 21 | // Return the conversion from ticks to seconds. 
22 | static double secondsPerTick() 23 | { 24 | static int initialized = 0; 25 | static double secondsPerTick_val; 26 | if (initialized) 27 | return secondsPerTick_val; 28 | FILE *fp = fopen("/proc/cpuinfo", "r"); 29 | char input[1024]; 30 | if (!fp) 31 | { 32 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 33 | exit(-1); 34 | } 35 | // In case we don't find it, e.g. on the N900 36 | secondsPerTick_val = 1e-9; 37 | while (!feof(fp) && fgets(input, 1024, fp)) 38 | { 39 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 40 | // frequency scaling it's better to read the @ sign first 41 | float GHz, MHz; 42 | if (strstr(input, "model name")) 43 | { 44 | char *at_sign = strstr(input, "@"); 45 | if (at_sign) 46 | { 47 | char *after_at = at_sign + 1; 48 | char *GHz_str = strstr(after_at, "GHz"); 49 | char *MHz_str = strstr(after_at, "MHz"); 50 | if (GHz_str) 51 | { 52 | *GHz_str = '\0'; 53 | if (1 == sscanf(after_at, "%f", &GHz)) 54 | { 55 | //printf("GHz = %f\n", GHz); 56 | secondsPerTick_val = 1e-9f / GHz; 57 | break; 58 | } 59 | } 60 | else if (MHz_str) 61 | { 62 | *MHz_str = '\0'; 63 | if (1 == sscanf(after_at, "%f", &MHz)) 64 | { 65 | //printf("MHz = %f\n", MHz); 66 | secondsPerTick_val = 1e-6f / GHz; 67 | break; 68 | } 69 | } 70 | } 71 | } 72 | else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) 73 | { 74 | //printf("MHz = %f\n", MHz); 75 | secondsPerTick_val = 1e-6f / MHz; 76 | break; 77 | } 78 | } 79 | fclose(fp); 80 | 81 | initialized = 1; 82 | return secondsPerTick_val; 83 | } 84 | 85 | static double currentSeconds() 86 | { 87 | return currentTicks() * secondsPerTick(); 88 | } 89 | 90 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 91 | -------------------------------------------------------------------------------- /HW6/Makefile: -------------------------------------------------------------------------------- 1 | default: conv 2 | 3 | CC = gcc-10 4 | FLAGS = -O3 -lOpenCL -m64 -ffloat-store -w -g 5 | 6 | OBJS = main.o 
bmpfuncs.o hostFE.o serialConv.o helper.o 7 | 8 | conv: $(OBJS) 9 | $(CC) -o $@ $(OBJS) $(FLAGS) 10 | 11 | %.o: %.c 12 | $(CC) -c $(FLAGS) $< -o $@ 13 | 14 | clean: 15 | rm -f conv *.o output.bmp ref.bmp -------------------------------------------------------------------------------- /HW6/bmpfuncs.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "bmpfuncs.h" 4 | 5 | //#include "bmpfuncs.h" 6 | typedef unsigned char uchar; 7 | 8 | void storeImage(float *imageOut, const char *filename, int rows, int cols, 9 | const char *refFilename) 10 | { 11 | 12 | FILE *ifp, *ofp; 13 | unsigned char tmp; 14 | int offset; 15 | unsigned char *buffer; 16 | int i, j; 17 | 18 | int bytes; 19 | 20 | int height, width; 21 | 22 | ifp = fopen(refFilename, "rb"); 23 | if (ifp == NULL) 24 | { 25 | perror(filename); 26 | exit(-1); 27 | } 28 | 29 | fseek(ifp, 10, SEEK_SET); 30 | fread(&offset, 4, 1, ifp); 31 | 32 | fseek(ifp, 18, SEEK_SET); 33 | fread(&width, 4, 1, ifp); 34 | fread(&height, 4, 1, ifp); 35 | 36 | fseek(ifp, 0, SEEK_SET); 37 | 38 | buffer = (unsigned char *)malloc(offset); 39 | if (buffer == NULL) 40 | { 41 | perror("malloc"); 42 | exit(-1); 43 | } 44 | 45 | fread(buffer, 1, offset, ifp); 46 | 47 | printf("Writing output image to %s\n", filename); 48 | ofp = fopen(filename, "wb"); 49 | if (ofp == NULL) 50 | { 51 | perror("opening output file"); 52 | exit(-1); 53 | } 54 | bytes = fwrite(buffer, 1, offset, ofp); 55 | if (bytes != offset) 56 | { 57 | printf("error writing header!\n"); 58 | exit(-1); 59 | } 60 | 61 | // NOTE bmp formats store data in reverse raster order (see comment in 62 | // readImage function), so we need to flip it upside down here. 
63 | int mod = width % 4; 64 | if (mod != 0) 65 | { 66 | mod = 4 - mod; 67 | } 68 | // printf("mod = %d\n", mod); 69 | for (i = height - 1; i >= 0; i--) 70 | { 71 | for (j = 0; j < width; j++) 72 | { 73 | tmp = (unsigned char)imageOut[i * cols + j]; 74 | fwrite(&tmp, sizeof(char), 1, ofp); 75 | } 76 | // In bmp format, rows must be a multiple of 4-bytes. 77 | // So if we're not at a multiple of 4, add junk padding. 78 | for (j = 0; j < mod; j++) 79 | { 80 | fwrite(&tmp, sizeof(char), 1, ofp); 81 | } 82 | } 83 | 84 | fclose(ofp); 85 | fclose(ifp); 86 | 87 | free(buffer); 88 | } 89 | 90 | /* 91 | * Read bmp image and convert to byte array. Also output the width and height 92 | */ 93 | float *readImage(const char *filename, int *widthOut, int *heightOut) 94 | { 95 | 96 | uchar *imageData; 97 | 98 | int height, width; 99 | uchar tmp; 100 | int offset; 101 | int i, j; 102 | 103 | printf("Reading input image from %s\n", filename); 104 | FILE *fp = fopen(filename, "rb"); 105 | if (fp == NULL) 106 | { 107 | perror(filename); 108 | exit(-1); 109 | } 110 | 111 | fseek(fp, 10, SEEK_SET); 112 | fread(&offset, 4, 1, fp); 113 | 114 | fseek(fp, 18, SEEK_SET); 115 | fread(&width, 4, 1, fp); 116 | fread(&height, 4, 1, fp); 117 | 118 | printf("width = %d\n", width); 119 | printf("height = %d\n", height); 120 | 121 | *widthOut = width; 122 | *heightOut = height; 123 | 124 | imageData = (uchar *)malloc(width * height); 125 | if (imageData == NULL) 126 | { 127 | perror("malloc"); 128 | exit(-1); 129 | } 130 | 131 | fseek(fp, offset, SEEK_SET); 132 | fflush(NULL); 133 | 134 | int mod = width % 4; 135 | if (mod != 0) 136 | { 137 | mod = 4 - mod; 138 | } 139 | 140 | // NOTE bitmaps are stored in upside-down raster order. So we begin 141 | // reading from the bottom left pixel, then going from left-to-right, 142 | // read from the bottom to the top of the image. For image analysis, 143 | // we want the image to be right-side up, so we'll modify it here. 
144 | 145 | // First we read the image in upside-down 146 | 147 | // Read in the actual image 148 | for (i = 0; i < height; i++) 149 | { 150 | 151 | // add actual data to the image 152 | for (j = 0; j < width; j++) 153 | { 154 | fread(&tmp, sizeof(char), 1, fp); 155 | imageData[i * width + j] = tmp; 156 | } 157 | // For the bmp format, each row has to be a multiple of 4, 158 | // so I need to read in the junk data and throw it away 159 | for (j = 0; j < mod; j++) 160 | { 161 | fread(&tmp, sizeof(char), 1, fp); 162 | } 163 | } 164 | 165 | // Then we flip it over 166 | int flipRow; 167 | for (i = 0; i < height / 2; i++) 168 | { 169 | flipRow = height - (i + 1); 170 | for (j = 0; j < width; j++) 171 | { 172 | tmp = imageData[i * width + j]; 173 | imageData[i * width + j] = imageData[flipRow * width + j]; 174 | imageData[flipRow * width + j] = tmp; 175 | } 176 | } 177 | 178 | fclose(fp); 179 | 180 | // Input image on the host 181 | float *floatImage = NULL; 182 | floatImage = (float *)malloc(sizeof(float) * width * height); 183 | if (floatImage == NULL) 184 | { 185 | perror("malloc"); 186 | exit(-1); 187 | } 188 | 189 | // Convert the BMP image to float (not required) 190 | for (i = 0; i < height; i++) 191 | { 192 | for (j = 0; j < width; j++) 193 | { 194 | floatImage[i * width + j] = (float)imageData[i * width + j]; 195 | } 196 | } 197 | 198 | free(imageData); 199 | return floatImage; 200 | } 201 | -------------------------------------------------------------------------------- /HW6/bmpfuncs.h: -------------------------------------------------------------------------------- 1 | #ifndef __BMPFUNCS__ 2 | #define __BMPFUNCS__ 3 | 4 | typedef unsigned char uchar; 5 | 6 | float* readImage(const char *filename, int* widthOut, int* heightOut); 7 | void storeImage(float *imageOut, const char *filename, int rows, int cols, 8 | const char* refFilename); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /HW6/filter1.csv: 
/*
 * Read the entire text file at kernelPath into a freshly malloc'd,
 * NUL-terminated string. Caller owns the returned buffer and must free() it.
 * Exits the process on any I/O or allocation failure.
 */
char *readSource(char *kernelPath)
{
    FILE *fp;
    char *source;
    long int size;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if (!fp)
    {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    if (fseek(fp, 0, SEEK_END) != 0)
    {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if (size < 0)
    {
        printf("Error getting file position\n");
        exit(-1);
    }

    rewind(fp);

    source = (char *)malloc(size + 1);
    /* Check the allocation BEFORE touching the buffer -- the original
     * zero-filled the whole buffer first and only then tested for NULL. */
    if (source == NULL)
    {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    /* One terminator write replaces the original's O(size) zero-fill loop. */
    if (fread(source, 1, size, fp) != (size_t)size)
    {
        printf("Error reading kernel file\n");
        exit(-1);
    }
    source[size] = '\0';

    fclose(fp); /* the original leaked this FILE handle */

    return source;
}
*device, cl_context *context, cl_program *program) 59 | { 60 | // Set up the OpenCL environment 61 | cl_int status; 62 | 63 | // Discovery platform 64 | cl_platform_id platform; 65 | status = clGetPlatformIDs(1, &platform, NULL); 66 | CHECK(status, "clGetPlatformIDs"); 67 | 68 | // Discover device 69 | clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, device, NULL); 70 | CHECK(status, "clGetDeviceIDs"); 71 | 72 | // CL_DEVICE_MAX_WORK_ITEM_SIZES 73 | size_t workitem_size[3]; 74 | clGetDeviceInfo(*device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); 75 | printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); 76 | 77 | // CL_DEVICE_MAX_WORK_GROUP_SIZE 78 | size_t workgroup_size; 79 | clGetDeviceInfo(*device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); 80 | printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); 81 | 82 | // Create context 83 | cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 84 | (cl_context_properties)(platform), 0}; 85 | *context = clCreateContext(props, 1, device, NULL, NULL, &status); 86 | CHECK(status, "clCreateContext"); 87 | 88 | const char *source = readSource("kernel.cl"); 89 | 90 | // Create a program object with source and build it 91 | *program = clCreateProgramWithSource(*context, 1, &source, NULL, NULL); 92 | CHECK(status, "clCreateProgramWithSource"); 93 | status = clBuildProgram(*program, 1, device, NULL, NULL, NULL); 94 | CHECK(status, "clBuildProgram"); 95 | 96 | return; 97 | } 98 | 99 | float *readFilter(const char *filename, int *filterWidth) 100 | { 101 | printf("Reading filter data from %s\n", filename); 102 | 103 | FILE *fp = fopen(filename, "r"); 104 | if (!fp) 105 | { 106 | printf("Could not open filter file\n"); 107 | exit(-1); 108 | } 109 | 110 | fscanf(fp, "%d", filterWidth); 111 | 112 | float *filter = (float *)malloc(*filterWidth * *filterWidth * sizeof(int)); 113 | 114 | float tmp; 115 
/*
 * Read a convolution filter from a text file: the filter width on the first
 * line, then width*width float coefficients. Outputs the width through
 * filterWidth; caller owns and must free() the returned array.
 * Exits the process on malformed input or allocation failure.
 */
float *readFilter(const char *filename, int *filterWidth)
{
    printf("Reading filter data from %s\n", filename);

    FILE *fp = fopen(filename, "r");
    if (!fp)
    {
        printf("Could not open filter file\n");
        exit(-1);
    }

    if (fscanf(fp, "%d", filterWidth) != 1 || *filterWidth <= 0)
    {
        printf("Invalid filter width\n");
        exit(-1);
    }

    // sizeof(float), not sizeof(int) as in the original -- they happen to
    // match on common platforms, but the element type is float.
    float *filter = (float *)malloc(*filterWidth * *filterWidth * sizeof(float));
    if (filter == NULL)
    {
        printf("Error allocating space for the filter\n");
        exit(-1);
    }

    for (int i = 0; i < *filterWidth * *filterWidth; i++)
    {
        // The original ignored fscanf's return, silently accepting short files.
        if (fscanf(fp, "%f", &filter[i]) != 1)
        {
            printf("Filter file has too few coefficients\n");
            exit(-1);
        }
    }

    printf("Filter width: %d\n", *filterWidth);

    fclose(fp);
    return filter;
}
22 | CHECK(status, "clCreateBuffer"); 23 | 24 | cl_mem filter_mem_obj = clCreateBuffer(*context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, 25 | sizeof(float) * filter_size, filter, &status); 26 | CHECK(status, "clCreateBuffer"); 27 | 28 | cl_mem output_img_mem_obj = clCreateBuffer(*context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, 29 | sizeof(float) * image_size, outputImage, &status); 30 | CHECK(status, "clCreateBuffer"); 31 | 32 | // Create the OpenCL kernel 33 | cl_kernel kernel = clCreateKernel(*program, "convolution", &status); 34 | CHECK(status, "clCreateKernel"); 35 | 36 | // Set the arguments of the kernel 37 | status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_img_mem_obj); 38 | CHECK(status, "clSetKernelArg"); 39 | 40 | status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&filter_mem_obj); 41 | CHECK(status, "clSetKernelArg"); 42 | 43 | status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_img_mem_obj); 44 | CHECK(status, "clSetKernelArg"); 45 | 46 | status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&image_height); 47 | CHECK(status, "clSetKernelArg"); 48 | 49 | status = clSetKernelArg(kernel, 4, sizeof(int), (void *)&image_width); 50 | CHECK(status, "clSetKernelArg"); 51 | 52 | status = clSetKernelArg(kernel, 5, sizeof(int), (void *)&filter_width); 53 | CHECK(status, "clSetKernelArg"); 54 | 55 | // Execute the OpenCL kernel on the list 56 | size_t global_item_size[2] = { image_width, image_height }; 57 | size_t local_item_size[2] = { 40, 25 }; 58 | status = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, 59 | global_item_size, local_item_size, 0, NULL, NULL); 60 | CHECK(status, "clEnqueueNDRangeKernel"); 61 | 62 | // After map call, host-memory area for outputImage is 63 | // automatically updated with the latest bits from the device 64 | clEnqueueMapBuffer( 65 | command_queue, 66 | output_img_mem_obj, 67 | CL_TRUE, 68 | CL_MAP_READ, 69 | 0, 70 | sizeof(float) * image_size, 71 | 0, 0, 0, 72 | &status 73 | ); 
// 2-D convolution: one work-item computes one output pixel.
// input_image / output_image are image_width x image_height, row-major.
// filter is filter_width x filter_width (filter_width assumed odd, so
// halffilter_width taps extend symmetrically around the center).
// Neighbors falling outside the image are skipped, i.e. treated as zero.
__kernel void convolution(
    __global const float *input_image,
    __global const float *filter,
    __global float *output_image,
    const int image_height,
    const int image_width,
    const int filter_width)
{
    // Global IDs map directly to pixel coordinates: x = column, y = row.
    int gx = get_global_id(0);
    int gy = get_global_id(1);
    int halffilter_width = filter_width / 2;
    float sum;
    int k, l;

    sum = 0;
    // Accumulate the weighted neighborhood; (k, l) are row/column offsets.
    for (k = -halffilter_width; k <= halffilter_width; k++)
    {
        for (l = -halffilter_width; l <= halffilter_width; l++)
        {
            // Bounds check: contributions outside the image are dropped.
            if (gy + k >= 0 && gy + k < image_height &&
                gx + l >= 0 && gx + l < image_width)
            {
                sum += input_image[(gy + k) * image_width + gx + l] *
                       filter[(k + halffilter_width) * filter_width +
                              l + halffilter_width];
            }
        }
    }
    output_image[gy * image_width + gx] = sum;
}
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "CycleTimer.h" 5 | #include "helper.h" 6 | #include "hostFE.h" 7 | #include "bmpfuncs.h" 8 | #include "serialConv.h" 9 | 10 | void usage(const char *progname) 11 | { 12 | printf("Usage: %s [options]\n", progname); 13 | printf("Program Options:\n"); 14 | printf(" -i --input Input image\n"); 15 | printf(" -f --filter Use which filter (0, 1, 2)\n"); 16 | printf(" -? --help This message\n"); 17 | } 18 | 19 | int compare(const void *a, const void *b) 20 | { 21 | double *x = (double *)a; 22 | double *y = (double *)b; 23 | if (*x < *y) 24 | return -1; 25 | else if (*x > *y) 26 | return 1; 27 | return 0; 28 | } 29 | 30 | int main(int argc, char **argv) 31 | { 32 | int i, j; 33 | 34 | // Rows and columns in the input image 35 | int imageHeight; 36 | int imageWidth; 37 | 38 | double start_time, end_time; 39 | 40 | char *inputFile = "input.bmp"; 41 | const char *outputFile = "output.bmp"; 42 | const char *refFile = "ref.bmp"; 43 | char *filterFile = "filter1.csv"; 44 | 45 | // parse commandline options //////////////////////////////////////////// 46 | int opt; 47 | static struct option long_options[] = { 48 | {"filter", 1, 0, 'f'}, 49 | {"input", 1, 0, 'i'}, 50 | {"help", 0, 0, '?'}, 51 | {0, 0, 0, 0}}; 52 | 53 | while ((opt = getopt_long(argc, argv, "i:f:?", long_options, NULL)) != EOF) 54 | { 55 | 56 | switch (opt) 57 | { 58 | case 'i': 59 | { 60 | inputFile = optarg; 61 | 62 | break; 63 | } 64 | case 'f': 65 | { 66 | int idx = atoi(optarg); 67 | if (idx == 2) 68 | filterFile = "filter2.csv"; 69 | else if (idx == 3) 70 | filterFile = "filter3.csv"; 71 | 72 | break; 73 | } 74 | case '?': 75 | default: 76 | usage(argv[0]); 77 | return 1; 78 | } 79 | } 80 | // end parsing of commandline options 81 | 82 | // read filter data 83 | int filterWidth; 84 | float *filter = readFilter(filterFile, &filterWidth); 85 | 86 | // Homegrown function to read a 
BMP from file 87 | float *inputImage = readImage(inputFile, &imageWidth, &imageHeight); 88 | // Size of the input and output images on the host 89 | int dataSize = imageHeight * imageWidth * sizeof(float); 90 | // Output image on the host 91 | float *outputImage = (float *)malloc(dataSize); 92 | 93 | // helper init CL 94 | cl_program program; 95 | cl_device_id device; 96 | cl_context context; 97 | initCL(&device, &context, &program); 98 | 99 | double minThread = 0; 100 | double recordThread[10] = {0}; 101 | for (int i = 0; i < 10; ++i) 102 | { 103 | memset(outputImage, 0, dataSize); 104 | start_time = currentSeconds(); 105 | // Run the host to execute the kernel 106 | hostFE(filterWidth, filter, imageHeight, imageWidth, inputImage, outputImage, 107 | &device, &context, &program); 108 | end_time = currentSeconds(); 109 | recordThread[i] = end_time - start_time; 110 | } 111 | qsort(recordThread, 10, sizeof(double), compare); 112 | for (int i = 3; i < 7; ++i) 113 | { 114 | minThread += recordThread[i]; 115 | } 116 | minThread /= 4; 117 | 118 | printf("\n[conv opencl]:\t\t[%.3f] ms\n\n", minThread * 1000); 119 | 120 | // Write the output image to file 121 | storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); 122 | 123 | // Output image of reference on the host 124 | float *refImage = NULL; 125 | refImage = (float *)malloc(dataSize); 126 | memset(refImage, 0, dataSize); 127 | 128 | double minSerial = 0; 129 | double recordSerial[10] = {0}; 130 | for (int i = 0; i < 10; ++i) 131 | { 132 | memset(refImage, 0, dataSize); 133 | start_time = currentSeconds(); 134 | serialConv(filterWidth, filter, imageHeight, imageWidth, inputImage, refImage); 135 | end_time = currentSeconds(); 136 | recordSerial[i] = end_time - start_time; 137 | } 138 | qsort(recordSerial, 10, sizeof(double), compare); 139 | for (int i = 3; i < 7; ++i) 140 | { 141 | minSerial += recordSerial[i]; 142 | } 143 | minSerial /= 4; 144 | 145 | printf("\n[conv serial]:\t\t[%.3f] ms\n\n", 
minSerial * 1000); 146 | 147 | storeImage(refImage, refFile, imageHeight, imageWidth, inputFile); 148 | 149 | int diff_counter = 0; 150 | for (i = 0; i < imageHeight; i++) 151 | { 152 | for (j = 0; j < imageWidth; j++) 153 | { 154 | if (abs(outputImage[i * imageWidth + j] - refImage[i * imageWidth + j]) > 10) 155 | { 156 | diff_counter += 1; 157 | } 158 | } 159 | } 160 | 161 | float diff_ratio = (float)diff_counter / (imageHeight * imageWidth); 162 | printf("Diff ratio: %f\n", diff_ratio); 163 | 164 | if (diff_ratio > 0.1) 165 | { 166 | printf("\n\033[31mFAILED:\tResults are incorrect!\033[0m\n"); 167 | return -1; 168 | } 169 | else 170 | { 171 | printf("\n\033[32mPASS:\t(%.2fx speedup over the serial version)\033[0m\n", minSerial / minThread); 172 | } 173 | 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /HW6/output.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW6/output.bmp -------------------------------------------------------------------------------- /HW6/ref.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LJP-TW/NYCU_Parallel_Programming/2d643e4befe80c827fc50d55cff5c613cd3bf9ae/HW6/ref.bmp -------------------------------------------------------------------------------- /HW6/serialConv.c: -------------------------------------------------------------------------------- 1 | #include "serialConv.h" 2 | 3 | void serialConv(int filterWidth, float *filter, int imageHeight, int imageWidth, float *inputImage, float *outputImage) 4 | { 5 | // Iterate over the rows of the source image 6 | int halffilterSize = filterWidth / 2; 7 | float sum; 8 | int i, j, k, l; 9 | 10 | for (i = 0; i < imageHeight; i++) 11 | { 12 | // Iterate over the columns of the source image 13 | for (j = 0; j < 
/*
 * Reference CPU implementation of 2-D convolution.
 * For every pixel of the imageWidth x imageHeight input (row-major), writes
 * the weighted sum of its filterWidth x filterWidth neighborhood into
 * outputImage. Neighbors outside the image contribute nothing (zero padding).
 * filterWidth is assumed odd so the taps extend symmetrically.
 */
void serialConv(int filterWidth, float *filter, int imageHeight, int imageWidth, float *inputImage, float *outputImage)
{
    const int half = filterWidth / 2;

    for (int row = 0; row < imageHeight; row++)
    {
        for (int col = 0; col < imageWidth; col++)
        {
            float acc = 0.0f;

            /* (dy, dx) sweep the filter window around the current pixel. */
            for (int dy = -half; dy <= half; dy++)
            {
                int y = row + dy;
                if (y < 0 || y >= imageHeight)
                    continue; /* off the top or bottom edge */

                for (int dx = -half; dx <= half; dx++)
                {
                    int x = col + dx;
                    if (x < 0 || x >= imageWidth)
                        continue; /* off the left or right edge */

                    acc += inputImage[y * imageWidth + x] *
                           filter[(dy + half) * filterWidth + (dx + half)];
                }
            }

            outputImage[row * imageWidth + col] = acc;
        }
    }
}
BFS_Top_Down Score | Best BFS_Top_Down Score | 37 | | ----------- | ------------- | 38 | | 18 | 18 | 39 | 40 | | My BFS_Bottom_Up Score | Best BFS_Bottom_Up Score | 41 | | ----------- | ------------- | 42 | | 23 | 23 | 43 | 44 | | My BFS_Hybrid Score | Best BFS_Hybrid Score | 45 | | ----------- | ------------- | 46 | | 23 | 23 | 47 | 48 | ## HW4 49 | | My MM_time_1 | Best MM_time_1 | 50 | | ------------ | -------------- | 51 | | 0.0874 | 0.0275 | 52 | 53 | | My MM_time_2 | Best MM_time_2 | 54 | | ------------ | -------------- | 55 | | 1.5608 | 0.6174 | 56 | 57 | ## HW5 58 | | My kernel4_view1_time | Best kernel4_view1_time | 59 | | --------------------- | ----------------------- | 60 | | 287.4263 | 206.571 | 61 | 62 | | My kernel4_view2_time | Best kernel4_view2_time | 63 | | --------------------- | ----------------------- | 64 | | 26.8847 | 18.329 | 65 | 66 | ## HW6 67 | | My Filter_time | Best Filter_time | 68 | | -------------- | ---------------- | 69 | | 1.7767 | 1.1127 | 70 | 71 | # Final Project 72 | [NBodySim](https://github.com/LJP-TW/NBodySim) 73 | --------------------------------------------------------------------------------