├── README.md
├── asst1
    ├── README.md
    ├── common
    │   ├── CycleTimer.h
    │   ├── ppm.cpp
    │   └── tasksys.cpp
    ├── imgs
    │   ├── 1.png
    │   ├── 2.png
    │   ├── 3.png
    │   ├── 4.png
    │   ├── Snipaste_2022-02-28_23-37-11.png
    │   ├── Snipaste_2022-02-28_23-49-32.png
    │   └── Snipaste_2022-02-28_23-49-52.png
    ├── prog1_mandelbrot_threads
    │   ├── Makefile
    │   ├── main.cpp
    │   ├── mandelbrot
    │   ├── mandelbrot-serial.ppm
    │   ├── mandelbrot-thread.ppm
    │   ├── mandelbrotSerial.cpp
    │   ├── mandelbrotThread.cpp
    │   └── objs
    │   │   ├── main.o
    │   │   ├── mandelbrotSerial.o
    │   │   ├── mandelbrotThread.o
    │   │   └── ppm.o
    ├── prog2_vecintrin
    │   ├── CS149intrin.cpp
    │   ├── CS149intrin.h
    │   ├── Makefile
    │   ├── logger.cpp
    │   ├── logger.h
    │   └── main.cpp
    ├── prog3_mandelbrot_ispc
    │   ├── Makefile
    │   ├── main.cpp
    │   ├── mandelbrot.ispc
    │   └── mandelbrotSerial.cpp
    ├── prog4_sqrt
    │   ├── Makefile
    │   ├── main.cpp
    │   ├── sqrt.ispc
    │   └── sqrtSerial.cpp
    └── prog5_saxpy
    │   ├── Makefile
    │   ├── main.cpp
    │   ├── saxpy.ispc
    │   └── saxpySerial.cpp
├── asst2
    ├── README.md
    ├── common
    │   ├── CycleTimer.h
    │   └── ppm.cpp
    ├── figs
    │   └── task_graph.png
    ├── part_a
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── itasksys.h
    │   ├── runtasks_ref_linux
    │   ├── runtasks_ref_osx_arm
    │   ├── runtasks_ref_osx_x86
    │   ├── tasksys.cpp
    │   └── tasksys.h
    ├── part_b
    │   ├── .gitignore
    │   ├── Makefile
    │   ├── itasksys.h
    │   ├── runtasks_ref_linux
    │   ├── runtasks_ref_osx_arm
    │   ├── runtasks_ref_osx_x86
    │   ├── tasksys.cpp
    │   └── tasksys.h
    ├── tests
    │   ├── main.cpp
    │   ├── main_ref.cpp
    │   ├── run_test_harness.py
    │   └── tests.h
    └── tutorial
    │   ├── Makefile
    │   ├── README.md
    │   └── tutorial.cpp
├── asst3
    ├── README.md
    ├── cloud_readme.md
    ├── handout
    │   ├── bug_example.jpg
    │   ├── choose_ami.png
    │   ├── choose_instance.png
    │   ├── choose_storage.png
    │   ├── dependencies.jpg
    │   ├── gpu_instance.png
    │   ├── gpu_instance.png_original
    │   ├── ip_address.png
    │   ├── location_limit.png
    │   ├── navigation_quota.png
    │   ├── new_key_pair.png
    │   ├── order.jpg
    │   ├── point_in_circle.jpg
    │   ├── public_dns.png
    │   ├── quota_request.png
    │   ├── teaser.jpg
    │   ├── vCPU_dashboard.png
    │   ├── vCPU_dashboard_2.png
    │   └── vCPU_trouble.png
    ├── install.sh
    ├── render
    │   ├── Makefile
    │   ├── benchmark.cpp
    │   ├── checker.pl
    │   ├── checker.py
    │   ├── circleBoxTest.cu_inl
    │   ├── circleRenderer.h
    │   ├── cudaRenderer.cu
    │   ├── cudaRenderer.h
    │   ├── cycleTimer.h
    │   ├── display.cpp
    │   ├── exclusiveScan.cu_inl
    │   ├── image.h
    │   ├── index.html
    │   ├── lookupColor.cu_inl
    │   ├── main.cpp
    │   ├── noise.cpp
    │   ├── noise.h
    │   ├── noiseCuda.cu_inl
    │   ├── platformgl.h
    │   ├── ppm.cpp
    │   ├── ppm.h
    │   ├── refRenderer.cpp
    │   ├── refRenderer.h
    │   ├── refTimings.txt
    │   ├── render_ref
    │   ├── sceneLoader.cpp
    │   ├── sceneLoader.h
    │   ├── snow.par
    │   └── util.h
    ├── saxpy
    │   ├── CycleTimer.h
    │   ├── Makefile
    │   ├── main.cpp
    │   ├── saxpy.cu
    │   └── tt.asm
    └── scan
    │   ├── CycleTimer.h
    │   ├── Makefile
    │   ├── checker.pl
    │   ├── cudaScan_ref
    │   ├── log.txt
    │   ├── main.cpp
    │   └── scan.cu
└── asst4
    ├── README.md
    ├── bfs
        ├── Makefile
        ├── bfs.cpp
        ├── bfs.h
        ├── grade.cpp
        ├── main.cpp
        └── ref_bfs.o
    ├── cloud_readme.md
    ├── common
        ├── CycleTimer.h
        ├── contracts.h
        ├── grade.h
        ├── graph.cpp
        ├── graph.h
        └── graph_internal.h
    ├── handout
        ├── AMI.png
        ├── instance_type.png
        ├── instance_type_big.png
        ├── ip_address.png
        ├── new_key_pair.png
        ├── storage.png
        └── storage_big.png
    ├── imgs
        ├── 1.png
        └── 2.png
    ├── pagerank
        ├── Makefile
        ├── grade.cpp
        ├── main.cpp
        ├── page_rank.cpp
        ├── page_rank.h
        └── ref_pr.a
    └── tools
        ├── Makefile
        ├── graphTools.cpp
        └── plaintext.graph


/README.md:
--------------------------------------------------------------------------------
1 | # cs149
2 | 
3 | 本项目包含自学 CS149 Parallel Computing 时完成的全部课程 lab。
4 | 
5 | 该课程对应 CMU 的 15-418 课程；斯坦福的内容比 CMU 少，授课老师是同一人。
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/asst1/common/CycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // your process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if your processors' speeds change (i.e. processor
 35 |   // scaling) or if you are in a heterogeneous environment, you will
 36 |   // likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return CycleTimer::SysClock(static_cast<float>(spec.tv_sec) * 1e9 + static_cast<float>(spec.tv_nsec));
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 | 
 81 |     static const char* tickUnits() {
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(__WIN32__) || defined(__x86_64__)
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds.
 93 |     static double secondsPerTick() {
 94 |       static bool initialized = false;
 95 |       static double secondsPerTick_val;
 96 |       if (initialized) return secondsPerTick_val;
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 | 
111 |       // Scales to nanoseconds without 1e-9f
112 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113 |         static_cast<double>(time_info.denom);
114 |   #endif // x86_64 or not
115 | #elif defined(_WIN32)
116 |       LARGE_INTEGER qwTicksPerSec;
117 |       QueryPerformanceFrequency(&qwTicksPerSec);
118 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119 | #else
120 |       FILE *fp = fopen("/proc/cpuinfo","r");
121 |       char input[1024];
122 |       if (!fp) {
123 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124 |          exit(-1);
125 |       }
126 |       // In case we don't find it, e.g. on the N900
127 |       secondsPerTick_val = 1e-9;
128 |       while (!feof(fp) && fgets(input, 1024, fp)) {
129 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
130 |         // frequency scaling it's better to read the @ sign first
131 |         float GHz, MHz;
132 |         if (strstr(input, "model name")) {
133 |           char* at_sign = strstr(input, "@");
134 |           if (at_sign) {
135 |             char* after_at = at_sign + 1;
136 |             char* GHz_str = strstr(after_at, "GHz");
137 |             char* MHz_str = strstr(after_at, "MHz");
138 |             if (GHz_str) {
139 |               *GHz_str = '\0';
140 |               if (1 == sscanf(after_at, "%f", &GHz)) {
141 |                 //printf("GHz = %f\n", GHz);
142 |                 secondsPerTick_val = 1e-9f / GHz;
143 |                 break;
144 |               }
145 |             } else if (MHz_str) {
146 |               *MHz_str = '\0';
147 |               if (1 == sscanf(after_at, "%f", &MHz)) {
148 |                 //printf("MHz = %f\n", MHz);
149 |                 secondsPerTick_val = 1e-6f / GHz;
150 |                 break;
151 |               }
152 |             }
153 |           }
154 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155 |           //printf("MHz = %f\n", MHz);
156 |           secondsPerTick_val = 1e-6f / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return secondsPerTick() * 1000.0;
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst1/common/ppm.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | #include <algorithm>
 5 | 
 6 | 
 7 | 
 8 | void
 9 | writePPMImage(int* data, int width, int height, const char *filename, int maxIterations)
10 | {
11 |     FILE *fp = fopen(filename, "wb");
12 | 
13 |     // write ppm header
14 |     fprintf(fp, "P6\n");
15 |     fprintf(fp, "%d %d\n", width, height);
16 |     fprintf(fp, "255\n");
17 | 
18 |     for (int i = 0; i < width*height; ++i) {
19 | 
20 |         // Clamp iteration count for this pixel, then scale the value
21 |         // to 0-1 range.  Raise resulting value to a power (<1) to
22 |         // increase brightness of low iteration count
23 |         // pixels. a.k.a. Make things look cooler.
24 | 
25 |         float mapped = pow( std::min(static_cast<float>(maxIterations),
26 |                                      static_cast<float>(data[i])) / 256.f, .5f);
27 | 
28 |         // convert back into 0-255 range, 8-bit channels
29 |         unsigned char result = static_cast<unsigned char>(255.f * mapped);
30 |         for (int j = 0; j < 3; ++j)
31 |             fputc(result, fp);
32 |     }
33 |     fclose(fp);
34 |     printf("Wrote image file %s\n", filename);
35 | }
36 | 


--------------------------------------------------------------------------------
/asst1/imgs/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/1.png


--------------------------------------------------------------------------------
/asst1/imgs/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/2.png


--------------------------------------------------------------------------------
/asst1/imgs/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/3.png


--------------------------------------------------------------------------------
/asst1/imgs/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/4.png


--------------------------------------------------------------------------------
/asst1/imgs/Snipaste_2022-02-28_23-37-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-37-11.png


--------------------------------------------------------------------------------
/asst1/imgs/Snipaste_2022-02-28_23-49-32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-49-32.png


--------------------------------------------------------------------------------
/asst1/imgs/Snipaste_2022-02-28_23-49-52.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-49-52.png


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | CXX=g++ -m64
 3 | CXXFLAGS=-I../common -Iobjs/ -O3 -std=c++11 -Wall -fPIC
 4 | 
 5 | APP_NAME=mandelbrot
 6 | OBJDIR=objs
 7 | COMMONDIR=../common
 8 | 
 9 | PPM_CXX=$(COMMONDIR)/ppm.cpp
10 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o)))
11 | 
12 | 
13 | default: $(APP_NAME)
14 | 
15 | .PHONY: dirs clean
16 | 
17 | dirs:
18 | 		/bin/mkdir -p $(OBJDIR)/
19 | 
20 | clean:
21 | 		/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)
22 | 
23 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ)
24 | 
25 | $(APP_NAME): dirs $(OBJS)
26 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm -lpthread
27 | 
28 | $(OBJDIR)/%.o: %.cpp
29 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
30 | 
31 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
32 | 	$(CXX) $< $(CXXFLAGS) -c -o $@
33 | 
34 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h
35 | 
36 | 


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <algorithm>
  3 | #include <getopt.h>
  4 | 
  5 | #include "CycleTimer.h"
  6 | 
  7 | extern void mandelbrotSerial(
  8 |     float x0, float y0, float x1, float y1,
  9 |     int width, int height,
 10 |     int startRow, int numRows,
 11 |     int maxIterations,
 12 |     int output[]);
 13 | 
 14 | extern void mandelbrotThread(
 15 |     int numThreads,
 16 |     float x0, float y0, float x1, float y1,
 17 |     int width, int height,
 18 |     int maxIterations,
 19 |     int output[]);
 20 | 
 21 | extern void writePPMImage(
 22 |     int* data,
 23 |     int width, int height,
 24 |     const char *filename,
 25 |     int maxIterations);
 26 | 
 27 | void
 28 | scaleAndShift(float& x0, float& x1, float& y0, float& y1,
 29 |               float scale,
 30 |               float shiftX, float shiftY)
 31 | {
 32 | 
 33 |     x0 *= scale;
 34 |     x1 *= scale;
 35 |     y0 *= scale;
 36 |     y1 *= scale;
 37 |     x0 += shiftX;
 38 |     x1 += shiftX;
 39 |     y0 += shiftY;
 40 |     y1 += shiftY;
 41 | 
 42 | }
 43 | 
 44 | void usage(const char* progname) {
 45 |     printf("Usage: %s [options]\n", progname);
 46 |     printf("Program Options:\n");
 47 |     printf("  -t  --threads <N>  Use N threads\n");
 48 |     printf("  -v  --view <INT>   Use specified view settings\n");
 49 |     printf("  -?  --help         This message\n");
 50 | }
 51 | 
 52 | bool verifyResult (int *gold, int *result, int width, int height) {
 53 | 
 54 |     int i, j;
 55 | 
 56 |     for (i = 0; i < height; i++) {
 57 |         for (j = 0; j < width; j++) {
 58 |             if (gold[i * width + j] != result[i * width + j]) {
 59 |                 printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n",
 60 |                             i, j, gold[i * width + j], result[i * width + j]);
 61 |                 return 0;
 62 |             }
 63 |         }
 64 |     }
 65 | 
 66 |     return 1;
 67 | }
 68 | 
 69 | int main(int argc, char** argv) {
 70 | 
 71 |     const unsigned int width = 1600;
 72 |     const unsigned int height = 1200;
 73 |     const int maxIterations = 256;
 74 |     int numThreads = 2;
 75 | 
 76 |     float x0 = -2;
 77 |     float x1 = 1;
 78 |     float y0 = -1;
 79 |     float y1 = 1;
 80 | 
 81 |     // parse commandline options ////////////////////////////////////////////
 82 |     int opt;
 83 |     static struct option long_options[] = {
 84 |         {"threads", 1, 0, 't'},
 85 |         {"view", 1, 0, 'v'},
 86 |         {"help", 0, 0, '?'},
 87 |         {0 ,0, 0, 0}
 88 |     };
 89 | 
 90 |     while ((opt = getopt_long(argc, argv, "t:v:?", long_options, NULL)) != EOF) {
 91 | 
 92 |         switch (opt) {
 93 |         case 't':
 94 |         {
 95 |             numThreads = atoi(optarg);
 96 |             break;
 97 |         }
 98 |         case 'v':
 99 |         {
100 |             int viewIndex = atoi(optarg);
101 |             // change view settings
102 |             if (viewIndex == 2) {
103 |                 float scaleValue = .015f;
104 |                 float shiftX = -.986f;
105 |                 float shiftY = .30f;
106 |                 scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY);
107 |             } else if (viewIndex > 1) {
108 |                 fprintf(stderr, "Invalid view index\n");
109 |                 return 1;
110 |             }
111 |             break;
112 |         }
113 |         case '?':
114 |         default:
115 |             usage(argv[0]);
116 |             return 1;
117 |         }
118 |     }
119 |     // end parsing of commandline options
120 | 
121 | 
122 |     int* output_serial = new int[width*height];
123 |     int* output_thread = new int[width*height];
124 | 
125 |     //
126 |     // Run the serial implementation.  Run the code three times and
127 |     // take the minimum to get a good estimate.
128 |     //
129 | 
130 |     double minSerial = 1e30;
131 |     for (int i = 0; i < 5; ++i) {
132 |        memset(output_serial, 0, width * height * sizeof(int));
133 |         double startTime = CycleTimer::currentSeconds();
134 |         mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial);
135 |         double endTime = CycleTimer::currentSeconds();
136 |         minSerial = std::min(minSerial, endTime - startTime);
137 |     }
138 | 
139 |     printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000);
140 |     writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations);
141 | 
142 |     //
143 |     // Run the threaded version
144 |     //
145 | 
146 |     double minThread = 1e30;
147 |     for (int i = 0; i < 5; ++i) {
148 |       memset(output_thread, 0, width * height * sizeof(int));
149 |         double startTime = CycleTimer::currentSeconds();
150 |         mandelbrotThread(numThreads, x0, y0, x1, y1, width, height, maxIterations, output_thread);
151 |         double endTime = CycleTimer::currentSeconds();
152 |         minThread = std::min(minThread, endTime - startTime);
153 |     }
154 | 
155 |     printf("[mandelbrot thread]:\t\t[%.3f] ms\n", minThread * 1000);
156 |     writePPMImage(output_thread, width, height, "mandelbrot-thread.ppm", maxIterations);
157 | 
158 |     if (! verifyResult (output_serial, output_thread, width, height)) {
159 |         printf ("Error : Output from threads does not match serial output\n");
160 | 
161 |         delete[] output_serial;
162 |         delete[] output_thread;
163 | 
164 |         return 1;
165 |     }
166 | 
167 |     // compute speedup
168 |     printf("\t\t\t\t(%.2fx speedup from %d threads)\n", minSerial/minThread, numThreads);
169 | 
170 |     delete[] output_serial;
171 |     delete[] output_thread;
172 | 
173 |     return 0;
174 | }
175 | 


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/mandelbrot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/mandelbrot-serial.ppm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot-serial.ppm


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/mandelbrot-thread.ppm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot-thread.ppm


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/mandelbrotSerial.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   Note: This code was modified from example code
 4 |   originally provided by Intel.  To comply with Intel's open source
 5 |   licensing agreement, their copyright is retained below.
 6 | 
 7 |   -----------------------------------------------------------------
 8 | 
 9 |   Copyright (c) 2010-2011, Intel Corporation
10 |   All rights reserved.
11 | 
12 |   Redistribution and use in source and binary forms, with or without
13 |   modification, are permitted provided that the following conditions are
14 |   met:
15 | 
16 |     * Redistributions of source code must retain the above copyright
17 |       notice, this list of conditions and the following disclaimer.
18 | 
19 |     * Redistributions in binary form must reproduce the above copyright
20 |       notice, this list of conditions and the following disclaimer in the
21 |       documentation and/or other materials provided with the distribution.
22 | 
23 |     * Neither the name of Intel Corporation nor the names of its
24 |       contributors may be used to endorse or promote products derived from
25 |       this software without specific prior written permission.
26 | 
27 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
28 |    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 |    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
30 |    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
31 |    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 |    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 |    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 |    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 | */
39 | 
40 | 
// Iterate z = z*z + c starting from z = c and return how many iterations
// ran before |z|^2 exceeded 4, capped at `count`.
static inline int mandel(float c_re, float c_im, int count)
{
    float re = c_re;
    float im = c_im;
    int iter = 0;

    // The test is written as !(... > 4) so that NaN values, for which every
    // comparison is false, keep iterating exactly like a `> 4` break would.
    while (iter < count && !(re * re + im * im > 4.f)) {
        const float next_re = re*re - im*im;
        const float next_im = 2.f * re * im;
        re = c_re + next_re;
        im = c_im + next_im;
        ++iter;
    }

    return iter;
}
58 | 
59 | //
60 | // MandelbrotSerial --
61 | //
62 | // Compute an image visualizing the mandelbrot set.  The resulting
63 | // array contains the number of iterations required before the complex
64 | // number corresponding to a pixel could be rejected from the set.
65 | //
66 | // * x0, y0, x1, y1 describe the complex coordinates mapping
67 | //   into the image viewport.
68 | // * width, height describe the size of the output image
69 | // * startRow, totalRows describe how much of the image to compute
70 | void mandelbrotSerial(
71 |     float x0, float y0, float x1, float y1,
72 |     int width, int height,
73 |     int startRow, int totalRows,
74 |     int maxIterations,
75 |     int output[])
76 | {
77 |     float dx = (x1 - x0) / width;
78 |     float dy = (y1 - y0) / height;
79 | 
80 |     int endRow = startRow + totalRows;
81 | 
82 |     for (int j = startRow; j < endRow; j++) {
83 |         for (int i = 0; i < width; ++i) {
84 |             float x = x0 + i * dx;
85 |             float y = y0 + j * dy;
86 | 
87 |             int index = (j * width + i);
88 |             output[index] = mandel(x, y, maxIterations);
89 |         }
90 |     }
91 | }
92 | 
93 | 


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/mandelbrotThread.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <thread>
  3 | 
  4 | #include "CycleTimer.h"
  5 | 
  6 | typedef struct {
  7 |     float x0, x1;
  8 |     float y0, y1;
  9 |     unsigned int width;
 10 |     unsigned int height;
 11 |     int maxIterations;
 12 |     int* output;
 13 |     int threadId;
 14 |     int numThreads;
 15 | } WorkerArgs;
 16 | 
 17 | 
 18 | static inline int mandel(float c_re, float c_im, int count)
 19 | {
 20 |     float z_re = c_re, z_im = c_im;
 21 |     int i;
 22 |     for (i = 0; i < count; ++i) {
 23 | 
 24 |         if (z_re * z_re + z_im * z_im > 4.f)
 25 |             break;
 26 | 
 27 |         float new_re = z_re*z_re - z_im*z_im;
 28 |         float new_im = 2.f * z_re * z_im;
 29 |         z_re = c_re + new_re;
 30 |         z_im = c_im + new_im;
 31 |     }
 32 | 
 33 |     return i;
 34 | }
 35 | 
 36 | // 每个线程按照step进行跳跃处理
 37 | static void mandelbrotSerial(
 38 |     float x0, float y0, float x1, float y1,
 39 |     int width, int height,
 40 |     int startRow, int step,
 41 |     int maxIterations,
 42 |     int output[]) {
 43 |     float dx = (x1 - x0) / width;
 44 |     float dy = (y1 - y0) / height;
 45 | 
 46 |     for (int j = startRow; j < height; j += step) {
 47 |         for (int i = 0; i < width; ++i) {
 48 |             float x = x0 + i * dx;
 49 |             float y = y0 + j * dy;
 50 |             int index = (j * width + i);
 51 |             output[index] = mandel(x, y, maxIterations);
 52 |         }
 53 |     }
 54 | }
 55 | 
 56 | 
 57 | //
 58 | // workerThreadStart --
 59 | //
 60 | // Thread entrypoint.
 61 | void workerThreadStart(WorkerArgs * const args) {
 62 | 
 63 |     // TODO FOR CS149 STUDENTS: Implement the body of the worker
 64 |     // thread here. Each thread should make a call to mandelbrotSerial()
 65 |     // to compute a part of the output image.  For example, in a
 66 |     // program that uses two threads, thread 0 could compute the top
 67 |     // half of the image and thread 1 could compute the bottom half.
 68 | 
 69 |     printf("Hello world from thread %d\n", args->threadId);
 70 |     double startTime = CycleTimer::currentSeconds();
 71 |     // method1.
 72 |     // unsigned int dh = (args->height + args->numThreads - 1) / args->numThreads;
 73 |     // mandelbrotSerial(args->x0, args->y0, args->x1, args->y1, args->width, args->height, args->threadId * dh,
 74 |     // std::min(args->height, (args->threadId + 1) * dh) - args->threadId * dh, args->maxIterations, args->output);
 75 |     // method2
 76 |     mandelbrotSerial(args->x0, args->y0, args->x1, args->y1, args->width, args->height, args->threadId,
 77 |     args->numThreads, args->maxIterations, args->output);
 78 |     double endTime = CycleTimer::currentSeconds();
 79 | 
 80 |     printf("[mandelbrot threadid %d]: [%.3lf] ms\n", args->threadId, (endTime - startTime) * 1000);
 81 | }
 82 | 
 83 | //
 84 | // MandelbrotThread --
 85 | //
 86 | // Multi-threaded implementation of mandelbrot set image generation.
 87 | // Threads of execution are created by spawning std::threads.
 88 | void mandelbrotThread(
 89 |     int numThreads,
 90 |     float x0, float y0, float x1, float y1,
 91 |     int width, int height,
 92 |     int maxIterations, int output[])
 93 | {
 94 |     static constexpr int MAX_THREADS = 32;
 95 | 
 96 |     // args[0] is always run on the calling thread below, so fewer than one thread is as invalid as too many.
 97 |     if (numThreads < 1 || numThreads > MAX_THREADS)
 98 |     {
 99 |         fprintf(stderr, "Error: thread count must be between 1 and %d\n", MAX_THREADS);
100 |         exit(1);
101 |     }
102 | 
103 |     // Creates thread objects that do not yet represent a thread.
104 |     std::thread workers[MAX_THREADS];
105 |     WorkerArgs args[MAX_THREADS];
106 | 
107 |     for (int i=0; i<numThreads; i++) {
108 |         // TODO FOR CS149 STUDENTS: You may or may not wish to modify
109 |         // the per-thread arguments here.  The code below copies the
110 |         // same arguments for each thread; only threadId differs.
111 |         args[i].x0 = x0;
112 |         args[i].y0 = y0;
113 |         args[i].x1 = x1;
114 |         args[i].y1 = y1;
115 |         args[i].width = width;
116 |         args[i].height = height;
117 |         args[i].maxIterations = maxIterations;
118 |         args[i].numThreads = numThreads;
119 |         args[i].output = output;
120 |         args[i].threadId = i;
121 |     }
122 | 
123 |     // Spawn the worker threads.  Note that only numThreads-1 std::threads
124 |     // are created and the main application thread is used as a worker
125 |     // as well.
126 |     for (int i=1; i<numThreads; i++) {
127 |         workers[i] = std::thread(workerThreadStart, &args[i]);
128 |     }
129 | 
130 |     workerThreadStart(&args[0]);
131 | 
132 |     // Wait for every spawned worker; slot 0 was never started, so it is skipped.
133 |     for (int i=1; i<numThreads; i++) {
134 |         workers[i].join();
135 |     }
136 | }
137 | 
138 | 


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/objs/main.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/objs/main.o


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/objs/mandelbrotSerial.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/objs/mandelbrotSerial.o


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/objs/mandelbrotThread.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/objs/mandelbrotThread.o


--------------------------------------------------------------------------------
/asst1/prog1_mandelbrot_threads/objs/ppm.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/objs/ppm.o


--------------------------------------------------------------------------------
/asst1/prog2_vecintrin/CS149intrin.h:
--------------------------------------------------------------------------------
  1 | // Define vector unit width here
  2 | #define VECTOR_WIDTH 16
  3 | 
  4 | #ifndef CS149INTRIN_H_
  5 | #define CS149INTRIN_H_
  6 | 
  7 | #include <cstdlib>
  8 | #include <cmath>
  9 | #include "logger.h"
 10 | 
 11 | //*******************
 12 | //* Type Definition *
 13 | //*******************
 14 | 
 15 | extern Logger CS149Logger;
 16 | 
 17 | template <typename T>
 18 | struct __cs149_vec {  // fixed-width vector register holding elements of type T
 19 |   T value[VECTOR_WIDTH];  // one element per lane, VECTOR_WIDTH lanes in total
 20 | };
 21 | 
 22 | // Declare a mask with __cs149_mask
 23 | struct __cs149_mask : __cs149_vec<bool> {};
 24 | 
 25 | // Declare a floating point vector register with __cs149_vec_float
 26 | #define __cs149_vec_float __cs149_vec<float>
 27 | 
 28 | // Declare an integer vector register with __cs149_vec_int
 29 | #define __cs149_vec_int   __cs149_vec<int>
 30 | 
 31 | //***********************
 32 | //* Function Definition *
 33 | //***********************
 34 | 
 35 | // Return a mask initialized to 1 in the first N lanes and 0 in the others
 36 | __cs149_mask _cs149_init_ones(int first = VECTOR_WIDTH);
 37 | 
 38 | // Return the inverse of maska
 39 | __cs149_mask _cs149_mask_not(__cs149_mask &maska);
 40 | 
 41 | // Return (maska | maskb)
 42 | __cs149_mask _cs149_mask_or(__cs149_mask &maska, __cs149_mask &maskb);
 43 | 
 44 | // Return (maska & maskb)
 45 | __cs149_mask _cs149_mask_and(__cs149_mask &maska, __cs149_mask &maskb);
 46 | 
 47 | // Count the number of 1s in maska
 48 | int _cs149_cntbits(__cs149_mask &maska);
 49 | 
 50 | // Set register to value if vector lane is active
 51 | //  otherwise keep the old value
 52 | void _cs149_vset_float(__cs149_vec_float &vecResult, float value, __cs149_mask &mask);
 53 | void _cs149_vset_int(__cs149_vec_int &vecResult, int value, __cs149_mask &mask);
 54 | // For user's convenience, returns a vector register with all lanes initialized to value
 55 | __cs149_vec_float _cs149_vset_float(float value);
 56 | __cs149_vec_int _cs149_vset_int(int value);
 57 | 
 58 | // Copy values from vector register src to vector register dest if vector lane active
 59 | // otherwise keep the old value
 60 | void _cs149_vmove_float(__cs149_vec_float &dest, __cs149_vec_float &src, __cs149_mask &mask);
 61 | void _cs149_vmove_int(__cs149_vec_int &dest, __cs149_vec_int &src, __cs149_mask &mask);
 62 | 
 63 | // Load values from array src to vector register dest if vector lane active
 64 | //  otherwise keep the old value
 65 | void _cs149_vload_float(__cs149_vec_float &dest, float* src, __cs149_mask &mask);
 66 | void _cs149_vload_int(__cs149_vec_int &dest, int* src, __cs149_mask &mask);
 67 | 
 68 | // Store values from vector register src to array dest if vector lane active
 69 | //  otherwise keep the old value
 70 | void _cs149_vstore_float(float* dest, __cs149_vec_float &src, __cs149_mask &mask);
 71 | void _cs149_vstore_int(int* dest, __cs149_vec_int &src, __cs149_mask &mask);
 72 | 
 73 | // Return calculation of (veca + vecb) if vector lane active
 74 | //  otherwise keep the old value
 75 | void _cs149_vadd_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
 76 | void _cs149_vadd_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
 77 | 
 78 | // Return calculation of (veca - vecb) if vector lane active
 79 | //  otherwise keep the old value
 80 | void _cs149_vsub_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
 81 | void _cs149_vsub_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
 82 | 
 83 | // Return calculation of (veca * vecb) if vector lane active
 84 | //  otherwise keep the old value
 85 | void _cs149_vmult_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
 86 | void _cs149_vmult_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
 87 | 
 88 | // Return calculation of (veca / vecb) if vector lane active
 89 | //  otherwise keep the old value
 90 | void _cs149_vdiv_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
 91 | void _cs149_vdiv_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
 92 | 
 93 | 
 94 | // Return calculation of absolute value abs(veca) if vector lane active
 95 | //  otherwise keep the old value
 96 | void _cs149_vabs_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_mask &mask);
 97 | void _cs149_vabs_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_mask &mask);
 98 | 
 99 | // Return a mask of (veca > vecb) if vector lane active
100 | //  otherwise keep the old value
101 | void _cs149_vgt_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
102 | void _cs149_vgt_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
103 | 
104 | // Return a mask of (veca < vecb) if vector lane active
105 | //  otherwise keep the old value
106 | void _cs149_vlt_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
107 | void _cs149_vlt_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
108 | 
109 | // Return a mask of (veca == vecb) if vector lane active
110 | //  otherwise keep the old value
111 | void _cs149_veq_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask);
112 | void _cs149_veq_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask);
113 | 
114 | // Adds up adjacent pairs of elements, so
115 | //  [0 1 2 3] -> [0+1 0+1 2+3 2+3]
116 | void _cs149_hadd_float(__cs149_vec_float &vecResult, __cs149_vec_float &vec);
117 | 
118 | // Performs an even-odd interleaving where all even-indexed elements move to front half
119 | //  of the array and odd-indexed to the back half, so
120 | //  [0 1 2 3 4 5 6 7] -> [0 2 4 6 1 3 5 7]
121 | void _cs149_interleave_float(__cs149_vec_float &vecResult, __cs149_vec_float &vec);
122 | 
123 | // Add a customized log to help debugging
124 | void addUserLog(const char * logStr);
125 | 
126 | #endif
127 | 


--------------------------------------------------------------------------------
/asst1/prog2_vecintrin/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all clean  # neither names a file, so always run them even if a file called all/clean exists
 2 | all: myexp  # first ordinary target, hence the default goal
 3 | logger.o: logger.cpp logger.h CS149intrin.h CS149intrin.cpp
 4 | 	g++ -c logger.cpp
 5 | 
 6 | CS149intrin.o: CS149intrin.cpp CS149intrin.h logger.cpp logger.h
 7 | 	g++ -c CS149intrin.cpp
 8 | 
 9 | myexp: CS149intrin.o logger.o main.cpp
10 | 	g++ -I../common logger.o CS149intrin.o main.cpp -o myexp
11 | 
12 | clean:
13 | 	rm -f *.o myexp *~
14 | 


--------------------------------------------------------------------------------
/asst1/prog2_vecintrin/logger.cpp:
--------------------------------------------------------------------------------
 1 | #include "logger.h"
 2 | #include "CS149intrin.h"
 3 | 
 4 | void Logger::addLog(const char * instruction, __cs149_mask mask, int N) {  // record one instruction and its first N mask lanes
 5 |   Log newLog;
 6 |   snprintf(newLog.instruction, sizeof(newLog.instruction), "%s", instruction);  // bounded copy: long names are truncated, never overflow
 7 |   newLog.mask = 0;
 8 |   for (int i=0; i<N; i++) {
 9 |     if (mask.value[i]) {
10 |       newLog.mask |= (((unsigned long long)1)<<i);  // bit i of the stored mask mirrors lane i
11 |       stats.utilized_lane++;
12 |     }
13 |   }
14 |   stats.total_lane += N;
15 |   stats.total_instructions += (N>0);  // calls with N == 0 are logged but not counted as instructions
16 |   log.push_back(newLog);
17 | }
18 | 
19 | void Logger::printStats() {  // print the accumulated lane/instruction counters to stdout
20 |   printf("****************** Printing Vector Unit Statistics *******************\n");
21 |   printf("Vector Width:              %d\n", VECTOR_WIDTH);
22 |   printf("Total Vector Instructions: %llu\n", stats.total_instructions);  // counters are unsigned long long, hence %llu
23 |   printf("Vector Utilization:        %.1f%%\n", stats.total_lane ? (double)stats.utilized_lane/stats.total_lane*100 : 0.0);  // report 0.0 instead of dividing by zero when nothing was logged
24 |   printf("Utilized Vector Lanes:     %llu\n", stats.utilized_lane);
25 |   printf("Total Vector Lanes:        %llu\n", stats.total_lane);
26 | }
27 | 
28 | 
29 | 
30 | void Logger::printLog() {
31 |   printf("***************** Printing Vector Unit Execution Log *****************\n");
32 |   printf(" Instruction | Vector Lane Occupancy ('*' for active, '_' for inactive)\n");
33 |   printf("------------- --------------------------------------------------------\n");
34 |   // One row per recorded instruction: its name, then one glyph per vector lane.
35 |   for (size_t row = 0; row < log.size(); ++row) {
36 |     const Log &entry = log[row];
37 |     printf("%12s | ", entry.instruction);
38 |     for (int lane = 0; lane < VECTOR_WIDTH; ++lane) {
39 |       const bool active = ((entry.mask >> lane) & 1ULL) != 0;  // bit `lane` of the stored mask
40 |       if (active) printf("*");
41 |       else printf("_");
42 |     }
43 |     printf("\n");
44 |   }
45 | }
46 | 
47 | 


--------------------------------------------------------------------------------
/asst1/prog2_vecintrin/logger.h:
--------------------------------------------------------------------------------
 1 | #ifndef LOGGER_H_
 2 | #define LOGGER_H_
 3 | 
 4 | #include <stdio.h>
 5 | #include <vector>
 6 | #include <string.h>
 7 | using namespace std;
 8 | 
 9 | #define MAX_INST_LEN 32
10 | 
11 | struct __cs149_mask;
12 | 
13 | struct Log {
14 |   char instruction[MAX_INST_LEN];
15 |   unsigned long long mask; // support vector width up to 64
16 | };
17 | 
18 | struct Statistics {  // running totals updated by Logger::addLog
19 |   unsigned long long utilized_lane;  // lanes whose mask entry was set
20 |   unsigned long long total_lane;  // sum of the N passed to every addLog call
21 |   unsigned long long total_instructions;  // addLog calls made with N > 0
22 | };
23 | 
24 | class Logger {  // collects a per-instruction mask log plus aggregate lane statistics
25 |   private:
26 |     vector<Log> log;  // one entry per addLog call, in call order
27 |     Statistics stats;  // NOTE(review): no constructor zeroes this; presumably relies on the logger object having static storage - confirm
28 | 
29 |   public:
30 |     void addLog(const char * instruction, __cs149_mask mask, int N = 0);  // record an instruction and the first N lanes of its mask
31 |     void printStats();  // print the aggregate counters
32 |     void printLog();  // print every logged instruction with its lane occupancy
33 | };
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/asst1/prog3_mandelbrot_ispc/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++ -m64
 2 | CXXFLAGS=-I../common -Iobjs/ -O3 -Wall -fPIC
 3 | ISPC=ispc
 4 | # note: requires AVX2
 5 | # disabling AVX2 FMA since it causes a difference in output compared to reference on Mandelbrot 
 6 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --opt=disable-fma --pic
 7 | 
 8 | APP_NAME=mandelbrot_ispc
 9 | OBJDIR=objs
10 | COMMONDIR=../common
11 | 
12 | PPM_CXX=$(COMMONDIR)/ppm.cpp
13 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o)))
14 | 
15 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp
16 | TASKSYS_LIB=-lpthread
17 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o)))
18 | 
19 | default: $(APP_NAME)
20 | 
21 | .PHONY: dirs clean
22 | 
23 | dirs:
24 | 		/bin/mkdir -p $(OBJDIR)/
25 | 
26 | clean:
27 | 		/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)
28 | 
29 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrot_ispc.o $(PPM_OBJ) $(TASKSYS_OBJ)
30 | 
31 | $(APP_NAME): dirs $(OBJS)
32 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB)
33 | 
34 | $(OBJDIR)/%.o: %.cpp
35 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
36 | 
37 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
38 | 	$(CXX) $< $(CXXFLAGS) -c -o $@
39 | 
40 | $(OBJDIR)/main.o: $(OBJDIR)/mandelbrot_ispc.h $(COMMONDIR)/CycleTimer.h
41 | 
42 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc
43 | 		$(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h
44 | 
45 | 


--------------------------------------------------------------------------------
/asst1/prog3_mandelbrot_ispc/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <algorithm>
  3 | #include <getopt.h>
  4 | 
  5 | #include "CycleTimer.h"
  6 | #include "mandelbrot_ispc.h"
  7 | 
  8 | extern void mandelbrotSerial(
  9 |     float x0, float y0, float x1, float y1,
 10 |     int width, int height,
 11 |     int startRow, int numRows,
 12 |     int maxIterations,
 13 |     int output[]);
 14 | 
 15 | extern void mandelbrotThread(
 16 |     int numThreads,
 17 |     float x0, float y0, float x1, float y1,
 18 |     int width, int height,
 19 |     int maxIterations,
 20 |     int output[]);
 21 | 
 22 | extern void writePPMImage(
 23 |     int* data,
 24 |     int width, int height,
 25 |     const char *filename,
 26 |     int maxIterations);
 27 | 
 28 | bool verifyResult (int *gold, int *result, int width, int height) {
 29 |     // Scan in row-major order and report only the first differing pixel.
 30 |     for (int row = 0; row < height; ++row) {
 31 |         for (int col = 0; col < width; ++col) {
 32 |             const int idx = row * width + col;
 33 |             const int expected = gold[idx];
 34 |             const int actual = result[idx];
 35 |             if (expected == actual)
 36 |                 continue;
 37 |             printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n", row, col, expected, actual);
 38 |             return false;
 39 |         }
 40 |     }
 41 |     return true;
 42 | }
 43 | 
 44 | void
 45 | scaleAndShift(float& x0, float& x1, float& y0, float& y1,
 46 |               float scale,
 47 |               float shiftX, float shiftY)
 48 | {
 49 |     // Scale every bound first (x0, x1, y0, y1 in that order) ...
 50 |     float* const bounds[4] = { &x0, &x1, &y0, &y1 };
 51 |     for (int k = 0; k < 4; ++k)
 52 |         *bounds[k] *= scale;
 53 | 
 54 |     // ... then translate the x bounds by shiftX and the y bounds by shiftY.
 55 |     const float shifts[4] = { shiftX, shiftX, shiftY, shiftY };
 56 |     for (int k = 0; k < 4; ++k)
 57 |         *bounds[k] += shifts[k];
 58 | 
 59 | }
 60 | 
 61 | using namespace ispc;
 62 | 
 63 | void usage(const char* progname) {  // print the command-line help text to stdout
 64 |     printf("Usage: %s [options]\n", progname);
 65 |     printf("Program Options:\n"
 66 |            "  -t  --tasks        Run ISPC code implementation with tasks\n"
 67 |            "  -v  --view <INT>   Use specified view settings\n"
 68 |            "  -?  --help         This message\n");
 69 | }
 70 | 
 71 | 
 72 | int main(int argc, char** argv) {
 73 | 
 74 |     const unsigned int width = 1200;
 75 |     const unsigned int height = 800;
 76 |     const int maxIterations = 256;
 77 | 
 78 |     float x0 = -2;
 79 |     float x1 = 1;
 80 |     float y0 = -1;
 81 |     float y1 = 1;
 82 | 
 83 |     bool useTasks = false;
 84 | 
 85 |     // parse commandline options ////////////////////////////////////////////
 86 |     int opt;
 87 |     static struct option long_options[] = {
 88 |         {"tasks", 0, 0, 't'},
 89 |         {"view",  1, 0, 'v'},
 90 |         {"help",  0, 0, '?'},
 91 |         {0 ,0, 0, 0}
 92 |     };
 93 | 
 94 |     while ((opt = getopt_long(argc, argv, "tv:?", long_options, NULL)) != EOF) {
 95 |         switch (opt) {
 96 |         case 't':
 97 |             useTasks = true;
 98 |             break;
 99 |         case 'v':
100 |         {
101 |             int viewIndex = atoi(optarg);
102 |             // View 2 zooms into a sub-region; any other index above 1 is rejected.
103 |             if (viewIndex == 2) {
104 |                 float scaleValue = .015f;
105 |                 float shiftX = -.986f;
106 |                 float shiftY = .30f;
107 |                 scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY);
108 |             } else if (viewIndex > 1) {
109 |                 fprintf(stderr, "Invalid view index\n");
110 |                 return 1;
111 |             }
112 |             break;
113 |         }
114 |         case '?':
115 |         default:
116 |             usage(argv[0]);
117 |             return 1;
118 |         }
119 |     }
120 |     // end parsing of commandline options
121 | 
122 |     int *output_serial = new int[width*height];
123 |     int *output_ispc = new int[width*height];
124 |     int *output_ispc_tasks = new int[width*height];
125 | 
126 |     for (unsigned int i = 0; i < width * height; ++i)
127 |         output_serial[i] = 0;
128 | 
129 |     //
130 |     // Run the serial implementation. Report the minimum time of three
131 |     // runs for robust timing.
132 |     //
133 |     double minSerial = 1e30;
134 |     for (int i = 0; i < 3; ++i) {
135 |         double startTime = CycleTimer::currentSeconds();
136 |         mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial);
137 |         double endTime = CycleTimer::currentSeconds();
138 |         minSerial = std::min(minSerial, endTime - startTime);
139 |     }
140 | 
141 |     printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000);
142 |     writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations);
143 | 
144 |     // Clear out the buffer
145 |     for (unsigned int i = 0; i < width * height; ++i)
146 |         output_ispc[i] = 0;
147 | 
148 |     //
149 |     // Compute the image using the ispc implementation
150 |     //
151 |     double minISPC = 1e30;
152 |     for (int i = 0; i < 3; ++i) {
153 |         double startTime = CycleTimer::currentSeconds();
154 |         mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, output_ispc);
155 |         double endTime = CycleTimer::currentSeconds();
156 |         minISPC = std::min(minISPC, endTime - startTime);
157 |     }
158 | 
159 |     printf("[mandelbrot ispc]:\t\t[%.3f] ms\n", minISPC * 1000);
160 |     writePPMImage(output_ispc, width, height, "mandelbrot-ispc.ppm", maxIterations);
161 | 
162 |     if (! verifyResult (output_serial, output_ispc, width, height)) {
163 |         printf ("Error : ISPC output differs from sequential output\n");
164 | 
165 |         delete[] output_serial;
166 |         delete[] output_ispc;
167 |         delete[] output_ispc_tasks;
168 | 
169 |         return 1;
170 |     }
171 | 
172 |     // Clear out the buffer
173 |     for (unsigned int i = 0; i < width * height; ++i) {
174 |         output_ispc_tasks[i] = 0;
175 |     }
176 | 
177 |     double minTaskISPC = 1e30;
178 |     if (useTasks) {
179 |         //
180 |         // Tasking version of the ISPC code
181 |         //
182 |         for (int i = 0; i < 3; ++i) {
183 |             double startTime = CycleTimer::currentSeconds();
184 |             mandelbrot_ispc_withtasks(x0, y0, x1, y1, width, height, maxIterations, output_ispc_tasks);
185 |             double endTime = CycleTimer::currentSeconds();
186 |             minTaskISPC = std::min(minTaskISPC, endTime - startTime);
187 |         }
188 | 
189 |         printf("[mandelbrot multicore ispc]:\t[%.3f] ms\n", minTaskISPC * 1000);
190 |         writePPMImage(output_ispc_tasks, width, height, "mandelbrot-task-ispc.ppm", maxIterations);
191 | 
192 |         if (! verifyResult (output_serial, output_ispc_tasks, width, height)) {
193 |             printf ("Error : ISPC output differs from sequential output\n");
194 |             delete[] output_serial;      // free the buffers on this failure path too
195 |             delete[] output_ispc;
196 |             delete[] output_ispc_tasks;
197 |             return 1;
198 |         }
199 |     }
200 | 
201 |     printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
202 |     if (useTasks) {
203 |         printf("\t\t\t\t(%.2fx speedup from task ISPC)\n", minSerial/minTaskISPC);
204 |     }
205 | 
206 |     delete[] output_serial;
207 |     delete[] output_ispc;
208 |     delete[] output_ispc_tasks;
209 | 
210 |     return 0;
211 | }
212 | 


--------------------------------------------------------------------------------
/asst1/prog3_mandelbrot_ispc/mandelbrot.ispc:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | static inline int mandel(float c_re, float c_im, int count) {
 4 |     float z_re = c_re, z_im = c_im;
 5 |     int i;
 6 |     for (i = 0; i < count; ++i) {
 7 |         // Stop as soon as |z|^2 exceeds 4; i then holds the number of completed steps.
 8 |         if (z_re * z_re + z_im * z_im > 4.f)
 9 |            break;
10 |         // z <- z*z + c, written out in real and imaginary parts.
11 |         float new_re = z_re*z_re - z_im*z_im;
12 |         float new_im = 2.f * z_re * z_im;
13 |         z_re = c_re + new_re;
14 |         z_im = c_im + new_im;
15 |     }
16 |     // Equals count when the threshold was never exceeded.
17 |     return i;
18 | }
19 | 
20 | export void mandelbrot_ispc(uniform float x0, uniform float y0, 
21 |                             uniform float x1, uniform float y1,
22 |                             uniform int width, uniform int height, 
23 |                             uniform int maxIterations,
24 |                             uniform int output[])
25 | {
26 |     float dx = (x1 - x0) / width;   // x step per pixel column
27 |     float dy = (y1 - y0) / height;  // y step per pixel row
28 |     // Visit every pixel (row j, column i) of the width x height image.
29 |     foreach (j = 0 ... height, i = 0 ... width) {
30 |             float x = x0 + i * dx;
31 |             float y = y0 + j * dy;
32 |             // Results are stored row-major: row j starts at offset j * width.
33 |             int index = j * width + i;
34 |             output[index] = mandel(x, y, maxIterations);
35 |     }
36 | }
37 | 
38 | // slightly different kernel to support tasking
39 | task void mandelbrot_ispc_task(uniform float x0, uniform float y0, 
40 |                                uniform float x1, uniform float y1,
41 |                                uniform int width, uniform int height,
42 |                                uniform int rowsPerTask,
43 |                                uniform int maxIterations,
44 |                                uniform int output[])
45 | {
46 |     // Each task owns the band of rows [ystart, yend). taskIndex is an ISPC
47 |     // built-in; both ends are clamped to height so that a final, partial band
48 |     // (or an empty one) never touches rows outside the image.
49 |     uniform int ystart = min(taskIndex * rowsPerTask, height);
50 |     uniform int yend = min(ystart + rowsPerTask, height);
51 |     
52 |     uniform float dx = (x1 - x0) / width;
53 |     uniform float dy = (y1 - y0) / height;
54 |     
55 |     foreach (j = ystart ... yend, i = 0 ... width) {
56 |             float x = x0 + i * dx;
57 |             float y = y0 + j * dy;
58 |             
59 |             int index = j * width + i;
60 |             output[index] = mandel(x, y, maxIterations);
61 |     }
62 | }
63 | 
64 | export void mandelbrot_ispc_withtasks(uniform float x0, uniform float y0,
65 |                                       uniform float x1, uniform float y1,
66 |                                       uniform int width, uniform int height,
67 |                                       uniform int maxIterations,
68 |                                       uniform int output[])
69 | {
70 |     // Number of ISPC tasks the image rows are divided among.
71 |     uniform int numTasks = 16;
72 |     // Round up so the bands cover every row even when height % numTasks != 0.
73 |     uniform int rowsPerTask = (height + numTasks - 1) / numTasks;
74 |     launch[numTasks] mandelbrot_ispc_task(x0, y0, x1, y1,
75 |                                      width, height,
76 |                                      rowsPerTask,
77 |                                      maxIterations,
78 |                                      output); 
79 | }
80 | 


--------------------------------------------------------------------------------
/asst1/prog3_mandelbrot_ispc/mandelbrotSerial.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | 
 3 |   15418 Spring 2012 note: This code was modified from example code
 4 |   originally provided by Intel.  To comply with Intel's open source
 5 |   licensing agreement, their copyright is retained below.
 6 | 
 7 |   -----------------------------------------------------------------
 8 | 
 9 |   Copyright (c) 2010-2011, Intel Corporation
10 |   All rights reserved.
11 | 
12 |   Redistribution and use in source and binary forms, with or without
13 |   modification, are permitted provided that the following conditions are
14 |   met:
15 | 
16 |     * Redistributions of source code must retain the above copyright
17 |       notice, this list of conditions and the following disclaimer.
18 | 
19 |     * Redistributions in binary form must reproduce the above copyright
20 |       notice, this list of conditions and the following disclaimer in the
21 |       documentation and/or other materials provided with the distribution.
22 | 
23 |     * Neither the name of Intel Corporation nor the names of its
24 |       contributors may be used to endorse or promote products derived from
25 |       this software without specific prior written permission.
26 | 
27 |    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
28 |    IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29 |    TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
30 |    PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
31 |    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 |    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 |    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 |    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 |    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 |    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 | */
39 | 
40 | 
41 | static inline int mandel(float c_re, float c_im, int count)
42 | {
43 |     // Iterate z <- z*z + c starting from z = c; the result is how many steps
44 |     // ran before |z|^2 exceeded 4, capped at count.
45 |     float re = c_re;
46 |     float im = c_im;
47 |     int steps = 0;
48 |     while (steps < count && !(re * re + im * im > 4.f)) {
49 |         const float sq_re = re*re - im*im;
50 |         const float sq_im = 2.f * re * im;
51 |         re = c_re + sq_re;
52 |         im = c_im + sq_im;
53 |         ++steps;
54 |     }
55 | 
56 |     return steps;
57 | }
58 | 
59 | //
60 | // MandelbrotSerial --
61 | //
62 | // Compute an image visualizing the mandelbrot set.  The resulting
63 | // array contains the number of iterations required before the complex
64 | // number corresponding to a pixel could be rejected from the set.
65 | //
66 | // * x0, y0, x1, y1 describe the complex coordinates mapping
67 | //   into the image viewport.
68 | // * width, height describe the size of the output image
69 | // * startRow, totalRows describe how much of the image to compute
70 | void mandelbrotSerial(
71 |     float x0, float y0, float x1, float y1,
72 |     int width, int height,
73 |     int startRow, int totalRows,
74 |     int maxIterations,
75 |     int output[])
76 | {
77 |     // Size of one pixel step along each axis of the complex plane.
78 |     const float stepX = (x1 - x0) / width;
79 |     const float stepY = (y1 - y0) / height;
80 |     const int rowLimit = startRow + totalRows;
81 | 
82 |     // Fill rows [startRow, rowLimit); the image is stored row-major.
83 |     for (int row = startRow; row < rowLimit; ++row) {
84 |         // The imaginary coordinate depends only on the row.
85 |         const float im = y0 + row * stepY;
86 |         for (int col = 0; col < width; ++col) {
87 |             const float re = x0 + col * stepX;
88 |             output[row * width + col] = mandel(re, im, maxIterations);
89 |         }
90 |     }
91 | }
92 | 
93 | 


--------------------------------------------------------------------------------
/asst1/prog4_sqrt/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++ -m64 -march=native
 2 | CXXFLAGS=-I../common -Iobjs/ -O3 -Wall
 3 | ISPC=ispc
 4 | # note: requires AVX2 capable machine
 5 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --pic
 6 | 
 7 | 
 8 | APP_NAME=sqrt
 9 | OBJDIR=objs
10 | COMMONDIR=../common
11 | 
12 | PPM_CXX=$(COMMONDIR)/ppm.cpp
13 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o)))
14 | 
15 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp
16 | TASKSYS_LIB=-lpthread
17 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o)))
18 | 
19 | default: $(APP_NAME)
20 | 
21 | .PHONY: dirs clean
22 | 
23 | dirs:
24 | 		/bin/mkdir -p $(OBJDIR)/
25 | 
26 | clean:
27 | 		/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)
28 | 
29 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/sqrtSerial.o $(OBJDIR)/sqrt_ispc.o $(PPM_OBJ) $(TASKSYS_OBJ)
30 | 
31 | $(APP_NAME): dirs $(OBJS)
32 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB)
33 | 
34 | $(OBJDIR)/%.o: %.cpp
35 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
36 | 
37 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
38 | 	$(CXX) $< $(CXXFLAGS) -c -o $@
39 | 
40 | $(OBJDIR)/main.o: $(OBJDIR)/$(APP_NAME)_ispc.h $(COMMONDIR)/CycleTimer.h
41 | 
42 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc
43 | 		$(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h
44 | 
45 | 


--------------------------------------------------------------------------------
/asst1/prog4_sqrt/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <algorithm>
  3 | #include <pthread.h>
  4 | #include <math.h>
  5 | 
  6 | #include "CycleTimer.h"
  7 | #include "sqrt_ispc.h"
  8 | 
  9 | using namespace ispc;
 10 | 
 11 | extern void sqrtSerial(int N, float startGuess, float* values, float* output);
 12 | 
 13 | static void verifyResult(int N, float* result, float* gold) {  // print every element of result that is not within 1e-4 of gold
 14 |     for (int i=0; i<N; i++) {
 15 |         if (result[i] != gold[i] && !(fabs(result[i] - gold[i]) <= 1e-4)) {  // negated <= so a NaN result is reported instead of slipping through
 16 |             printf("Error: [%d] Got %f expected %f\n", i, result[i], gold[i]);
 17 |         }
 18 |     }
 19 | }
 20 | 
 21 | int main() {
 22 |     const unsigned int N = 20 * 1000 * 1000;
 23 |     const float initialGuess = 1.0f;
 24 | 
 25 |     float* values = new float[N];
 26 |     float* output = new float[N];
 27 |     float* gold = new float[N];
 28 | 
 29 |     for (unsigned int i=0; i<N; i++)
 30 |     {
 31 |         // TODO: CS149 students.  Attempt to change the values in the
 32 |         // array here to meet the instructions in the handout: we want
 33 |         // you to generate best- and worst-case speedups
 34 |         // starter code populates array with random input values
 35 |         // original starter input (disabled):
 36 |         // values[i] = .001f + 2.998f * static_cast<float>(rand()) / RAND_MAX;
 37 |         // Q2 input (disabled): every element is 2.998f
 38 |         // values[i] = 2.998f;
 39 |         // Q3 input (active): every 8th element is 2.998f, all others are 1.0f
 40 |         if (i % 8) {
 41 |             values[i] = 1.0f;
 42 |         } else {
 43 |             values[i] = 2.998f;
 44 |         }
 45 |     }
 46 | 
 47 |     // generate a gold version to check results
 48 |     for (unsigned int i=0; i<N; i++)
 49 |         gold[i] = sqrt(values[i]);
 50 | 
 51 |     //
 52 |     // And run the serial implementation 3 times, again reporting the
 53 |     // minimum time.
 54 |     //
 55 |     double minSerial = 1e30;
 56 |     for (int i = 0; i < 3; ++i) {
 57 |         double startTime = CycleTimer::currentSeconds();
 58 |         sqrtSerial(N, initialGuess, values, output);
 59 |         double endTime = CycleTimer::currentSeconds();
 60 |         minSerial = std::min(minSerial, endTime - startTime);
 61 |     }
 62 | 
 63 |     printf("[sqrt serial]:\t\t[%.3f] ms\n", minSerial * 1000);
 64 | 
 65 |     verifyResult(N, output, gold);
 66 | 
 67 |     for (unsigned int i = 0; i < N; ++i)  // clear so the ISPC check cannot pass on leftover serial results
 68 |         output[i] = 0;
 69 |     //
 70 |     // Compute the square roots using the ispc implementation; report the minimum
 71 |     // time of three runs.
 72 |     //
 73 |     double minISPC = 1e30;
 74 |     for (int i = 0; i < 3; ++i) {
 75 |         double startTime = CycleTimer::currentSeconds();
 76 |         sqrt_ispc(N, initialGuess, values, output);
 77 |         double endTime = CycleTimer::currentSeconds();
 78 |         minISPC = std::min(minISPC, endTime - startTime);
 79 |     }
 80 | 
 81 |     printf("[sqrt ispc]:\t\t[%.3f] ms\n", minISPC * 1000);
 82 | 
 83 |     verifyResult(N, output, gold);
 84 | 
 85 |     // Clear out the buffer
 86 |     for (unsigned int i = 0; i < N; ++i)
 87 |         output[i] = 0;
 88 | 
 89 |     //
 90 |     // Tasking version of the ISPC code
 91 |     //
 92 |     double minTaskISPC = 1e30;
 93 |     for (int i = 0; i < 3; ++i) {
 94 |         double startTime = CycleTimer::currentSeconds();
 95 |         sqrt_ispc_withtasks(N, initialGuess, values, output);
 96 |         double endTime = CycleTimer::currentSeconds();
 97 |         minTaskISPC = std::min(minTaskISPC, endTime - startTime);
 98 |     }
 99 | 
100 |     printf("[sqrt task ispc]:\t[%.3f] ms\n", minTaskISPC * 1000);
101 | 
102 |     verifyResult(N, output, gold);
103 | 
104 |     printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
105 |     printf("\t\t\t\t(%.2fx speedup from task ISPC)\n", minSerial/minTaskISPC);
106 | 
107 |     delete [] values;
108 |     delete [] output;
109 |     delete [] gold;
110 | 
111 |     return 0;
112 | }
113 | 


--------------------------------------------------------------------------------
/asst1/prog4_sqrt/sqrt.ispc:
--------------------------------------------------------------------------------
 1 | 
 2 | static const float kThreshold = 0.00001f; 
 3 | 
 4 | export void sqrt_ispc(uniform int N,
 5 |                       uniform float initialGuess,
 6 |                       uniform float values[],
 7 |                       uniform float output[])
 8 | {
 9 |     foreach (i = 0 ... N) {
10 | 
11 |         float x = values[i];
12 |         float guess = initialGuess;
13 | 
14 |         float pred = abs(guess * guess * x - 1.f);
15 | 
16 |         while (pred > kThreshold) {
17 |             guess = (3.f * guess - x * guess * guess * guess) * 0.5f;
18 |             pred = abs(guess * guess * x - 1.f);
19 |         }
20 | 
21 |         output[i] = x * guess;
22 |         
23 |     }
24 | }
25 | 
// Task body for the multi-core square root: task number taskIndex handles
// the half-open index range [taskIndex*span, min(N, taskIndex*span + span)).
// The per-element iteration is the same as in sqrt_ispc.
task void sqrt_ispc_task(uniform int N,
                         uniform int span,
                         uniform float initialGuess,
                         uniform float values[],
                         uniform float output[])
{
    uniform int first = taskIndex * span;
    uniform int last = min(N, first + span);   // clamp the final chunk to N

    foreach (idx = first ... last) {

        const float v = values[idx];
        float est = initialGuess;

        // Loop until |est^2 * v - 1| is within the threshold.
        while (abs(est * est * v - 1.f) > kThreshold) {
            est = 0.5f * (3.f * est - v * est * est * est);
        }

        output[idx] = v * est;
    }
}
52 | 
// Multi-core ISPC square root: splits [0, N) into at most 64 contiguous
// chunks and launches one sqrt_ispc_task per chunk.
//
//  - N:            number of elements in values/output
//  - initialGuess: starting value for the per-element iteration
//  - values:       input array (N floats)
//  - output:       result array (N floats)
//
// The chunk size is rounded up so that the tasks cover every element even
// when N is not a multiple of 64, and is never zero when N < 64 (the
// previous `N / 64` span divided by zero in that case and dropped the tail
// elements when N was not evenly divisible).
export void sqrt_ispc_withtasks(uniform int N,
                                uniform float initialGuess,
                                uniform float values[],
                                uniform float output[])
{
    // Nothing to compute; also avoids a zero-sized chunk below.
    if (N <= 0)
        return;

    uniform int span = (N + 63) / 64;               // ceil(N / 64), >= 1
    uniform int numTasks = (N + span - 1) / span;   // ceil(N / span), <= 64

    // Launched tasks are implicitly synchronized when this function returns.
    launch[numTasks] sqrt_ispc_task(N, span, initialGuess, values, output);
}
63 | 


--------------------------------------------------------------------------------
/asst1/prog4_sqrt/sqrtSerial.cpp:
--------------------------------------------------------------------------------
 1 | #include <math.h>
 2 | #include <stdio.h>
 3 | #include <stdlib.h>
 4 | 
 5 | 
 6 | void sqrtSerial(int N,
 7 |                 float initialGuess,
 8 |                 float values[],
 9 |                 float output[])
10 | {
11 | 
12 |     static const float kThreshold = 0.00001f;
13 | 
14 |     for (int i=0; i<N; i++) {
15 | 
16 |         float x = values[i];
17 |         float guess = initialGuess;
18 | 
19 |         float error = fabs(guess * guess * x - 1.f);
20 | 
21 |         while (error > kThreshold) {
22 |             guess = (3.f * guess - x * guess * guess * guess) * 0.5f;
23 |             error = fabs(guess * guess * x - 1.f);
24 |         }
25 | 
26 |         output[i] = x * guess;
27 |     }
28 | }
29 | 
30 | 


--------------------------------------------------------------------------------
/asst1/prog5_saxpy/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++ -m64
 2 | CXXFLAGS=-I../common -Iobjs/ -O2 -Wall 
 3 | ISPC=ispc
 4 | # note: requires AVX2
 5 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --pic
 6 | 
 7 | APP_NAME=saxpy
 8 | OBJDIR=objs
 9 | COMMONDIR=../common
10 | 
11 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp
12 | TASKSYS_LIB=-lpthread
13 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o)))
14 | 
15 | default: $(APP_NAME)
16 | 
17 | .PHONY: dirs clean
18 | 
19 | dirs:
20 | 		/bin/mkdir -p $(OBJDIR)/
21 | 
22 | clean:
23 | 		/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)
24 | 
25 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/saxpySerial.o $(OBJDIR)/saxpy_ispc.o $(TASKSYS_OBJ)
26 | 
27 | $(APP_NAME): dirs $(OBJS)
28 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB)
29 | 
30 | $(OBJDIR)/%.o: %.cpp
31 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
32 | 
33 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
34 | 	$(CXX) $< $(CXXFLAGS) -c -o $@
35 | 
36 | $(OBJDIR)/main.o: $(OBJDIR)/$(APP_NAME)_ispc.h $(COMMONDIR)/CycleTimer.h
37 | 
38 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc
39 | 		$(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h
40 | 
41 | 


--------------------------------------------------------------------------------
/asst1/prog5_saxpy/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <algorithm>
  3 | 
  4 | #include "CycleTimer.h"
  5 | #include "saxpy_ispc.h"
  6 | 
  7 | extern void saxpySerial(int N, float a, float* X, float* Y, float* result);
  8 | 
  9 | 
 10 | // return GB/s
 11 | static float
 12 | toBW(int bytes, float sec) {
 13 |     return static_cast<float>(bytes) / (1024. * 1024. * 1024.) / sec;
 14 | }
 15 | 
 16 | static float
 17 | toGFLOPS(int ops, float sec) {
 18 |     return static_cast<float>(ops) / 1e9 / sec;
 19 | }
 20 | 
 21 | static void verifyResult(int N, float* result, float* gold) {
 22 |     for (int i=0; i<N; i++) {
 23 |         if (result[i] != gold[i]) {
 24 |             printf("Error: [%d] Got %f expected %f\n", i, result[i], gold[i]);
 25 |         }
 26 |     }
 27 | }
 28 | 
 29 | using namespace ispc;
 30 | 
 31 | 
// Benchmark driver for saxpy (result = scale * X + Y).
//
// Times the serial, single-core ISPC and multi-core (task) ISPC versions,
// taking the minimum of three runs each.  The serial result is used as the
// reference for verifying both ISPC outputs; its own timing line is
// currently commented out.
int main() {

    const unsigned int N = 20 * 1000 * 1000; // 20 M element vectors (~80 MB)
    // Traffic is counted as four floats per element.
    // NOTE(review): presumably the loads of X and Y, the store to result,
    // and the fetch of result's cache line before it is written -- confirm
    // against the assignment handout.
    const unsigned int TOTAL_BYTES = 4 * N * sizeof(float);
    // One multiply and one add per element.
    const unsigned int TOTAL_FLOPS = 2 * N;

    float scale = 2.f;

    float* arrayX = new float[N];
    float* arrayY = new float[N];
    float* resultSerial = new float[N];
    float* resultISPC = new float[N];
    float* resultTasks = new float[N];

    // initialize array values
    for (unsigned int i=0; i<N; i++)
    {
        arrayX[i] = i;
        arrayY[i] = i;
        resultSerial[i] = 0.f;
        resultISPC[i] = 0.f;
        resultTasks[i] = 0.f;
    }

    //
    // Run the serial implementation. Repeat three times for robust
    // timing.
    //
    double minSerial = 1e30;
    for (int i = 0; i < 3; ++i) {
        double startTime =CycleTimer::currentSeconds();
        saxpySerial(N, scale, arrayX, arrayY, resultSerial);
        double endTime = CycleTimer::currentSeconds();
        minSerial = std::min(minSerial, endTime - startTime);
    }

    // Serial timing report, currently disabled:
    // printf("[saxpy serial]:\t\t[%.3f] ms\t[%.3f] GB/s\t[%.3f] GFLOPS\n",
    //       minSerial * 1000,
    //       toBW(TOTAL_BYTES, minSerial),
    //       toGFLOPS(TOTAL_FLOPS, minSerial));

    //
    // Run the ISPC (single core) implementation
    //
    double minISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        double startTime = CycleTimer::currentSeconds();
        saxpy_ispc(N, scale, arrayX, arrayY, resultISPC);
        double endTime = CycleTimer::currentSeconds();
        minISPC = std::min(minISPC, endTime - startTime);
    }

    // The serial output is the reference for the ISPC output
    verifyResult(N, resultISPC, resultSerial);

    printf("[saxpy ispc]:\t\t[%.3f] ms\t[%.3f] GB/s\t[%.3f] GFLOPS\n",
           minISPC * 1000,
           toBW(TOTAL_BYTES, minISPC),
           toGFLOPS(TOTAL_FLOPS, minISPC));

    //
    // Run the ISPC (multi-core) implementation
    //
    double minTaskISPC = 1e30;
    for (int i = 0; i < 3; ++i) {
        double startTime = CycleTimer::currentSeconds();
        saxpy_ispc_withtasks(N, scale, arrayX, arrayY, resultTasks);
        double endTime = CycleTimer::currentSeconds();
        minTaskISPC = std::min(minTaskISPC, endTime - startTime);
    }

    // The serial output is the reference for the task output as well
    verifyResult(N, resultTasks, resultSerial);

    printf("[saxpy task ispc]:\t[%.3f] ms\t[%.3f] GB/s\t[%.3f] GFLOPS\n",
           minTaskISPC * 1000,
           toBW(TOTAL_BYTES, minTaskISPC),
           toGFLOPS(TOTAL_FLOPS, minTaskISPC));

    // Only the task-vs-single-core ISPC speedup is reported; the two
    // comparisons against the serial run are disabled.
    printf("\t\t\t\t(%.2fx speedup from use of tasks)\n", minISPC/minTaskISPC);
    //printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC);
    //printf("\t\t\t\t(%.2fx speedup from task ISPC)\n", minSerial/minTaskISPC);

    delete[] arrayX;
    delete[] arrayY;
    delete[] resultSerial;
    delete[] resultISPC;
    delete[] resultTasks;

    return 0;
}
121 | 


--------------------------------------------------------------------------------
/asst1/prog5_saxpy/saxpy.ispc:
--------------------------------------------------------------------------------
 1 | 
 2 | export void saxpy_ispc(uniform int N,
 3 |                        uniform float scale,
 4 |                             uniform float X[],
 5 |                             uniform float Y[],
 6 |                             uniform float result[])
 7 | {
 8 |     foreach (i = 0 ... N) {           
 9 |         result[i] = scale * X[i] + Y[i];
10 |     }
11 | }
12 | 
// Task body for the multi-core saxpy: task number taskIndex computes
// result[i] = scale * X[i] + Y[i] for i in
// [taskIndex*span, min(N, taskIndex*span + span)).
task void saxpy_ispc_task(uniform int N,
                          uniform int span,
                          uniform float scale,
                          uniform float X[],
                          uniform float Y[],
                          uniform float result[])
{
    uniform int first = taskIndex * span;
    uniform int last = min(N, first + span);   // clamp the final chunk to N

    foreach (idx = first ... last) {
        result[idx] = scale * X[idx] + Y[idx];
    }
}
28 | 
// Multi-core ISPC saxpy: splits [0, N) into at most 64 contiguous chunks
// and launches one saxpy_ispc_task per chunk.
//
//  - N:      number of elements in X, Y and result
//  - scale:  scalar multiplier applied to X
//  - X, Y:   input arrays (N floats each)
//  - result: output array (N floats)
//
// The chunk size is rounded up so that the tasks cover every element even
// when N is not a multiple of 64, and is never zero when N < 64 (the
// previous `N / 64` span divided by zero in that case and dropped the tail
// elements when N was not evenly divisible).
export void saxpy_ispc_withtasks(uniform int N,
                               uniform float scale,
                               uniform float X[],
                               uniform float Y[],
                               uniform float result[])
{
    // Nothing to compute; also avoids a zero-sized chunk below.
    if (N <= 0)
        return;

    uniform int span = (N + 63) / 64;               // ceil(N / 64), >= 1
    uniform int numTasks = (N + span - 1) / span;   // ceil(N / span), <= 64

    // Launched tasks are implicitly synchronized when this function returns.
    launch[numTasks] saxpy_ispc_task(N, span, scale, X, Y, result);
}
40 | 


--------------------------------------------------------------------------------
/asst1/prog5_saxpy/saxpySerial.cpp:
--------------------------------------------------------------------------------
 1 | 
 2 | void saxpySerial(int N,
 3 |                        float scale,
 4 |                        float X[],
 5 |                        float Y[],
 6 |                        float result[])
 7 | {
 8 | 
 9 |     for (int i=0; i<N; i++) {
10 |         result[i] = scale * X[i] + Y[i];
11 |     }
12 | }
13 | 
14 | 


--------------------------------------------------------------------------------
/asst2/common/CycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // you process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if you processors' speeds change (i.e. processors
 35 |   // scaling) or if you are in a heterogenous environment, you will
 36 |   // likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return CycleTimer::SysClock(static_cast<float>(spec.tv_sec) * 1e9 + static_cast<float>(spec.tv_nsec));
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 | 
 81 |     static const char* tickUnits() {
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(__WIN32__) || defined(__x86_64__)
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds.
 93 |     static double secondsPerTick() {
 94 |       static bool initialized = false;
 95 |       static double secondsPerTick_val;
 96 |       if (initialized) return secondsPerTick_val;
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 | 
111 |       // Scales to nanoseconds without 1e-9f
112 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113 |         static_cast<double>(time_info.denom);
114 |   #endif // x86_64 or not
115 | #elif defined(_WIN32)
116 |       LARGE_INTEGER qwTicksPerSec;
117 |       QueryPerformanceFrequency(&qwTicksPerSec);
118 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119 | #else
120 |       FILE *fp = fopen("/proc/cpuinfo","r");
121 |       char input[1024];
122 |       if (!fp) {
123 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124 |          exit(-1);
125 |       }
126 |       // In case we don't find it, e.g. on the N900
127 |       secondsPerTick_val = 1e-9;
128 |       while (!feof(fp) && fgets(input, 1024, fp)) {
129 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
130 |         // frequency scaling it's better to read the @ sign first
131 |         float GHz, MHz;
132 |         if (strstr(input, "model name")) {
133 |           char* at_sign = strstr(input, "@");
134 |           if (at_sign) {
135 |             char* after_at = at_sign + 1;
136 |             char* GHz_str = strstr(after_at, "GHz");
137 |             char* MHz_str = strstr(after_at, "MHz");
138 |             if (GHz_str) {
139 |               *GHz_str = '\0';
140 |               if (1 == sscanf(after_at, "%f", &GHz)) {
141 |                 //printf("GHz = %f\n", GHz);
142 |                 secondsPerTick_val = 1e-9f / GHz;
143 |                 break;
144 |               }
145 |             } else if (MHz_str) {
146 |               *MHz_str = '\0';
147 |               if (1 == sscanf(after_at, "%f", &MHz)) {
148 |                 //printf("MHz = %f\n", MHz);
149 |                 secondsPerTick_val = 1e-6f / GHz;
150 |                 break;
151 |               }
152 |             }
153 |           }
154 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155 |           //printf("MHz = %f\n", MHz);
156 |           secondsPerTick_val = 1e-6f / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return secondsPerTick() * 1000.0;
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst2/common/ppm.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | #include <algorithm>
 5 | 
 6 | 
 7 | 
 8 | void
 9 | writePPMImage(int* data, int width, int height, const char *filename, int maxIterations)
10 | {
11 |     FILE *fp = fopen(filename, "wb");
12 | 
13 |     // write ppm header
14 |     fprintf(fp, "P6\n");
15 |     fprintf(fp, "%d %d\n", width, height);
16 |     fprintf(fp, "255\n");
17 | 
18 |     for (int i = 0; i < width*height; ++i) {
19 | 
20 |         // Clamp iteration count for this pixel, then scale the value
21 |         // to 0-1 range.  Raise resulting value to a power (<1) to
22 |         // increase brightness of low iteration count
23 |         // pixels. a.k.a. Make things look cooler.
24 | 
25 |         float mapped = pow( std::min(static_cast<float>(maxIterations),
26 |                                      static_cast<float>(data[i])) / 256.f, .5f);
27 | 
28 |         // convert back into 0-255 range, 8-bit channels
29 |         unsigned char result = static_cast<unsigned char>(255.f * mapped);
30 |         for (int j = 0; j < 3; ++j)
31 |             fputc(result, fp);
32 |     }
33 |     fclose(fp);
34 |     printf("Wrote image file %s\n", filename);
35 | }
36 | 


--------------------------------------------------------------------------------
/asst2/figs/task_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/figs/task_graph.png


--------------------------------------------------------------------------------
/asst2/part_a/.gitignore:
--------------------------------------------------------------------------------
1 | objs/
2 | runtasks
3 | 


--------------------------------------------------------------------------------
/asst2/part_a/Makefile:
--------------------------------------------------------------------------------
# Build script for the part_a task-system test harness (runtasks).
CXX=g++ -m64
CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -O3 -std=c++11 -Wall

APP_NAME=runtasks
OBJDIR=objs
COMMONDIR=../common

# ../common/ppm.cpp is compiled into $(OBJDIR)/ppm.o
PPM_CXX=$(COMMONDIR)/ppm.cpp
PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o)))

default: $(APP_NAME)

.PHONY: dirs clean

dirs:
	/bin/mkdir -p $(OBJDIR)/

clean:
	/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)

OBJS=$(PPM_OBJ) $(OBJDIR)/tasksys.o

# `clean` is a prerequisite, so every build starts from scratch.
# ../tests/main.cpp is compiled directly on the link line; of $(OBJS), only
# tasksys.o is passed to the linker.
$(APP_NAME): clean dirs $(OBJS)
	$(CXX) ../tests/main.cpp $(CXXFLAGS) -o $@ $(OBJDIR)/tasksys.o -lm -lpthread

# Sources shared through ../common
$(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@

# Sources in this directory
$(OBJDIR)/%.o: %.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@
31 | 


--------------------------------------------------------------------------------
/asst2/part_a/itasksys.h:
--------------------------------------------------------------------------------
 1 | #ifndef _ITASKSYS_H
 2 | #define _ITASKSYS_H
 3 | #include <vector>
 4 | 
 5 | typedef int TaskID;
 6 | 
/*
  Interface for the unit of work in a bulk task launch: runTask() is
  declared pure virtual and is called with a task index and the size of
  the launch.
 */
class IRunnable {
    public:
        virtual ~IRunnable();

        /*
          Executes an instance of the task as part of a bulk task launch.
          
           - task_id: the current task identifier. This value will be
              between 0 and num_total_tasks-1.
              
           - num_total_tasks: the total number of tasks in the bulk
             task launch.
         */
        virtual void runTask(int task_id, int num_total_tasks) = 0;
};
22 | 
/*
  Abstract interface of a task execution engine: synchronous bulk launches
  via run(), asynchronous launches with dependencies via
  runAsyncWithDeps(), and sync() to wait for outstanding work.
 */
class ITaskSystem {
    public:
        /*
          Instantiates a task system.

           - num_threads: the maximum number of threads that the task system
             can use.
         */
        ITaskSystem(int num_threads);
        virtual ~ITaskSystem();

        /*
          Returns a C string naming the task system.
         */
        virtual const char* name() = 0;

        /*
          Executes a bulk task launch of num_total_tasks.  Task
          execution is synchronous with the calling thread, so run()
          will return only when the execution of all tasks is
          complete.
        */
        virtual void run(IRunnable* runnable, int num_total_tasks) = 0;

        /*
          Executes an asynchronous bulk task launch of
          num_total_tasks, but with a dependency on prior launched
          tasks.


          The task runtime must complete execution of the tasks
          associated with all bulk task launches referenced in the
          array `deps` before beginning execution of *any* task in
          this bulk task launch.

          The caller must invoke sync() to guarantee completion of the
          tasks in this bulk task launch.
 
          Returns an identifier that can be used in subsequent calls to
          runAsyncWithDeps() to specify a dependency of some future
          bulk task launch on this bulk task launch.
         */
        virtual TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
                                        const std::vector<TaskID>& deps) = 0;

        /*
          Blocks until all tasks created as a result of **any prior**
          runXXX calls are done.
         */
        virtual void sync() = 0;
};
70 | #endif
71 | 


--------------------------------------------------------------------------------
/asst2/part_a/runtasks_ref_linux:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_linux


--------------------------------------------------------------------------------
/asst2/part_a/runtasks_ref_osx_arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_osx_arm


--------------------------------------------------------------------------------
/asst2/part_a/runtasks_ref_osx_x86:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_osx_x86


--------------------------------------------------------------------------------
/asst2/part_a/tasksys.h:
--------------------------------------------------------------------------------
  1 | #ifndef _TASKSYS_H
  2 | #define _TASKSYS_H
  3 | 
  4 | #include "itasksys.h"
  5 | #include <atomic>
  6 | #include <queue>
  7 | #include <vector>
  8 | #include <thread>
  9 | #include <mutex>
 10 | #include <condition_variable>
 11 | #include <unordered_map>
 12 | 
 13 | /*
 14 |  * TaskSystemSerial: This class is the student's implementation of a
 15 |  * serial task execution engine.  See definition of ITaskSystem in
 16 |  * itasksys.h for documentation of the ITaskSystem interface.
 17 |  */
class TaskSystemSerial: public ITaskSystem {
    public:
        // num_threads is part of the common ITaskSystem constructor signature.
        TaskSystemSerial(int num_threads);
        ~TaskSystemSerial();
        // ITaskSystem interface; see itasksys.h for the contract of each call.
        const char* name();
        void run(IRunnable* runnable, int num_total_tasks);
        TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
                                const std::vector<TaskID>& deps);
        void sync();
};
 28 | 
 29 | /*
 30 |  * TaskSystemParallelSpawn: This class is the student's implementation of a
 31 |  * parallel task execution engine that spawns threads in every run()
 32 |  * call.  See definition of ITaskSystem in itasksys.h for documentation
 33 |  * of the ITaskSystem interface.
 34 |  */
class TaskSystemParallelSpawn: public ITaskSystem {
    public:
        TaskSystemParallelSpawn(int num_threads);
        ~TaskSystemParallelSpawn();
        // ITaskSystem interface; see itasksys.h for the contract of each call.
        const char* name();
        void run(IRunnable* runnable, int num_total_tasks);
        TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
                                const std::vector<TaskID>& deps);
        void sync();
private:
    // NOTE(review): presumably the number of threads spawned per run() call,
    // taken from the constructor argument -- confirm in tasksys.cpp.
    int num_thread_;
    // NOTE(review): presumably the shared counter from which the spawned
    // threads claim the next task index -- confirm in tasksys.cpp.
    std::atomic<int> task_idx_;
};
 48 | 
 49 | /*
 50 |  * TaskSystemParallelThreadPoolSpinning: This class is the student's
 51 |  * implementation of a parallel task execution engine that uses a
 52 |  * thread pool. See definition of ITaskSystem in itasksys.h for
 53 |  * documentation of the ITaskSystem interface.
 54 |  */
class TaskSystemParallelThreadPoolSpinning: public ITaskSystem {
    public:
        TaskSystemParallelThreadPoolSpinning(int num_threads);
        ~TaskSystemParallelThreadPoolSpinning();
        // ITaskSystem interface; see itasksys.h for the contract of each call.
        const char* name();
        void run(IRunnable* runnable, int num_total_tasks);
        TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
                                const std::vector<TaskID>& deps);
        void sync();
private:
    // The usage notes below are inferred from the member names and types
    // only; confirm each against tasksys.cpp.
    std::vector<std::thread> threads_;  // presumably the pool's worker threads
    int num_total_tasks_;               // presumably the size of the current bulk launch
    IRunnable *runnable_;               // presumably the runnable of the current bulk launch
    std::queue<int> task_index_;        // presumably task indices waiting to be run
    std::mutex lk_;                     // presumably guards the shared state above
    bool stop_;                         // presumably tells the workers to exit
    std::atomic<int> task_done_;        // presumably counts finished tasks
};
 73 | 
 74 | /*
 75 |  * TaskSystemParallelThreadPoolSleeping: This class is the student's
 76 |  * optimized implementation of a parallel task execution engine that uses
 77 |  * a thread pool. See definition of ITaskSystem in
 78 |  * itasksys.h for documentation of the ITaskSystem interface.
 79 |  */
class TaskSystemParallelThreadPoolSleeping: public ITaskSystem {
    public:
        TaskSystemParallelThreadPoolSleeping(int num_threads);
        ~TaskSystemParallelThreadPoolSleeping();
        // ITaskSystem interface; see itasksys.h for the contract of each call.
        const char* name();
        void run(IRunnable* runnable, int num_total_tasks);
        TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
                                const std::vector<TaskID>& deps);
        void sync();
private:

    // Bookkeeping for one bulk task launch.
    struct TaskInfo {
        TaskID id; // ID of the task
        IRunnable* runnable;
        int num_total_task; // total number of tasks
        int num_done_task; // number of tasks already completed
        // True once every task of the launch has been completed.
        bool done() const {
            return num_total_task == num_done_task;
        }
    };


    // One unit of work: a single task index of a bulk launch.
    struct WorkInfo {
        TaskID id; // the task this work item belongs to
        IRunnable* runnable;
        int cur_index;
        int num_total_task;
    };

    // NOTE(review): presumably tells the worker threads to exit -- confirm
    // in tasksys.cpp.
    bool stop_;

    // NOTE(review): presumably the source of the IDs returned by
    // runAsyncWithDeps() -- confirm in tasksys.cpp.
    TaskID global_task_id_;
    std::mutex lk_;
    std::condition_variable cv_worker_; // queue that the worker threads wait on
    std::condition_variable cv_main_; // queue that the sync() thread waits on

    // Total number of tasks that have been added but not yet completed,
    // including those whose conditions are not yet satisfied
    int num_all_undone_task;

    std::vector<std::thread> threads_; // all worker threads
    std::unordered_map<TaskID, std::vector<int>> graph_; // maintains the current graph
    std::unordered_map<TaskID, int> in_degree_; // in-degree of each task
    std::queue<WorkInfo> tasks_; // all tasks that need to be executed
    std::unordered_map<TaskID, TaskInfo> task_info_; // information about each task

};
126 | 
127 | #endif
128 | 


--------------------------------------------------------------------------------
/asst2/part_b/.gitignore:
--------------------------------------------------------------------------------
1 | objs/
2 | runtasks
3 | 


--------------------------------------------------------------------------------
/asst2/part_b/Makefile:
--------------------------------------------------------------------------------
# Build script for the part_b task-system test harness (runtasks).
CXX=g++ -m64
CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -O3 -std=c++11 -Wall
# Debug alternative: swap the line above for this one (-g instead of -O3).
#CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -g -std=c++11 -Wall

APP_NAME=runtasks
OBJDIR=objs
COMMONDIR=../common

# ../common/ppm.cpp is compiled into $(OBJDIR)/ppm.o
PPM_CXX=$(COMMONDIR)/ppm.cpp
PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o)))

default: $(APP_NAME)

.PHONY: dirs clean

dirs:
	/bin/mkdir -p $(OBJDIR)/

clean:
	/bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME)

OBJS=$(PPM_OBJ) $(OBJDIR)/tasksys.o

# `clean` is a prerequisite, so every build starts from scratch.
# ../tests/main.cpp is compiled directly on the link line; of $(OBJS), only
# tasksys.o is passed to the linker.
$(APP_NAME): clean dirs $(OBJS)
	$(CXX) ../tests/main.cpp $(CXXFLAGS) -o $@ $(OBJDIR)/tasksys.o -lm -lpthread

# Sources shared through ../common
$(OBJDIR)/%.o: $(COMMONDIR)/%.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@

# Sources in this directory
$(OBJDIR)/%.o: %.cpp
	$(CXX) $< $(CXXFLAGS) -c -o $@
32 | 


--------------------------------------------------------------------------------
/asst2/part_b/itasksys.h:
--------------------------------------------------------------------------------
 1 | #ifndef _ITASKSYS_H
 2 | #define _ITASKSYS_H
 3 | #include <vector>
 4 | 
 5 | typedef int TaskID;
 6 | 
 7 | class IRunnable { // Abstract task interface: runTask() is pure virtual and must be overridden.
 8 | public:
 9 |   virtual ~IRunnable(); // virtual, so implementations can be destroyed through an IRunnable pointer
10 | 
11 |   /*
12 |     Executes an instance of the task as part of a bulk task launch.
13 | 
14 |      - task_id: the current task identifier. This value will be
15 |         between 0 and num_total_tasks-1 (inclusive).
16 | 
17 |      - num_total_tasks: the total number of tasks in the bulk
18 |        task launch.
19 |    */
20 |   virtual void runTask(int task_id, int num_total_tasks) = 0;
21 | };
22 | 
23 | class ITaskSystem { // Abstract interface: name(), run(), runAsyncWithDeps() and sync() are pure virtual.
24 | public:
25 |   /*
26 |     Instantiates a task system.
27 | 
28 |      - num_threads: the maximum number of threads that the task system
29 |        can use.
30 |    */
31 |   ITaskSystem(int num_threads);
32 |   virtual ~ITaskSystem();
33 |   virtual const char *name() = 0; // NOTE(review): presumably a human-readable name of the implementation — confirm
34 | 
35 |   /*
36 |     Executes a bulk task launch of num_total_tasks.  Task
37 |     execution is synchronous with the calling thread, so run()
38 |     will return only when the execution of all tasks is
39 |     complete.
40 |   */
41 |   virtual void run(IRunnable *runnable, int num_total_tasks) = 0;
42 | 
43 |   /*
44 |     Executes an asynchronous bulk task launch of
45 |     num_total_tasks, but with a dependency on prior launched
46 |     tasks.
47 | 
48 | 
49 |     The task runtime must complete execution of the tasks
50 |     associated with all bulk task launches referenced in the
51 |     vector `deps` before beginning execution of *any* task in
52 |     this bulk task launch.
53 | 
54 |     The caller must invoke sync() to guarantee completion of the
55 |     tasks in this bulk task launch.
56 | 
57 |     Returns an identifier that can be used in subsequent calls to
58 |     runAsyncWithDeps() to specify a dependency of some future
59 |     bulk task launch on this bulk task launch.
60 |    */
61 |   virtual TaskID runAsyncWithDeps(IRunnable *runnable, int num_total_tasks,
62 |                                   const std::vector<TaskID> &deps) = 0;
63 | 
64 |   /*
65 |     Blocks until all tasks created as a result of **any prior**
66 |     runXXX calls are done.
67 |    */
68 |   virtual void sync() = 0;
69 | };
70 | #endif
71 | 


--------------------------------------------------------------------------------
/asst2/part_b/runtasks_ref_linux:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_linux


--------------------------------------------------------------------------------
/asst2/part_b/runtasks_ref_osx_arm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_osx_arm


--------------------------------------------------------------------------------
/asst2/part_b/runtasks_ref_osx_x86:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_osx_x86


--------------------------------------------------------------------------------
/asst2/part_b/tasksys.h:
--------------------------------------------------------------------------------
  1 | #ifndef _TASKSYS_H
  2 | #define _TASKSYS_H
  3 | 
  4 | #include "itasksys.h"
  5 | #include <atomic>
  6 | #include <queue>
  7 | #include <vector>
  8 | #include <thread>
  9 | #include <mutex>
 10 | #include <condition_variable>
 11 | #include <unordered_map>
 12 | 
 13 | /*
 14 |  * TaskSystemSerial: This class is the student's implementation of a
 15 |  * serial task execution engine.  It declares every pure virtual member
 16 |  * of ITaskSystem; see itasksys.h for the interface documentation.
 17 |  */
 18 | class TaskSystemSerial: public ITaskSystem {
 19 |     public:
 20 |         TaskSystemSerial(int num_threads);
 21 |         ~TaskSystemSerial();
 22 |         const char* name();
 23 |         void run(IRunnable* runnable, int num_total_tasks);
 24 |         TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
 25 |                                 const std::vector<TaskID>& deps);
 26 |         void sync();
 27 | }; // no data members are declared for the serial variant
 28 | 
 29 | /*
 30 |  * TaskSystemParallelSpawn: This class is the student's implementation of a
 31 |  * parallel task execution engine that spawns threads in every run()
 32 |  * call.  See definition of ITaskSystem in itasksys.h for documentation
 33 |  * of the ITaskSystem interface.
 34 |  */
 35 | class TaskSystemParallelSpawn: public ITaskSystem {
 36 |     public:
 37 |         TaskSystemParallelSpawn(int num_threads);
 38 |         ~TaskSystemParallelSpawn();
 39 |         const char* name();
 40 |         void run(IRunnable* runnable, int num_total_tasks);
 41 |         TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
 42 |                                 const std::vector<TaskID>& deps);
 43 |         void sync();
 44 | private:
 45 |     int num_thread_; // NOTE(review): presumably the num_threads given to the constructor — confirm in tasksys.cpp
 46 |     std::atomic<int> task_idx_; // NOTE(review): presumably the next task index handed to spawned threads — confirm
 47 | };
 48 | 
 49 | /*
 50 |  * TaskSystemParallelThreadPoolSpinning: This class is the student's
 51 |  * implementation of a parallel task execution engine that uses a
 52 |  * thread pool. See definition of ITaskSystem in itasksys.h for
 53 |  * documentation of the ITaskSystem interface.
 54 |  */
 55 | class TaskSystemParallelThreadPoolSpinning: public ITaskSystem {
 56 |     public:
 57 |         TaskSystemParallelThreadPoolSpinning(int num_threads);
 58 |         ~TaskSystemParallelThreadPoolSpinning();
 59 |         const char* name();
 60 |         void run(IRunnable* runnable, int num_total_tasks);
 61 |         TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
 62 |                                 const std::vector<TaskID>& deps);
 63 |         void sync();
 64 | private:
 65 |     std::vector<std::thread> threads_; // thread objects owned by this task system
 66 |     int num_total_tasks_; // NOTE(review): presumably the size of the current bulk launch — confirm in tasksys.cpp
 67 |     IRunnable *runnable_; // NOTE(review): presumably the runnable of the current bulk launch — confirm
 68 |     std::queue<int> task_index_; // NOTE(review): presumably pending task indices — confirm
 69 |     std::mutex lk_; // NOTE(review): presumably guards the shared members above — confirm which ones
 70 |     bool stop_; // NOTE(review): presumably tells worker threads to exit — confirm
 71 |     std::atomic<int> task_done_; // NOTE(review): presumably counts finished tasks of the current launch — confirm
 72 | };
 73 | 
 74 | /*
 75 |  * TaskSystemParallelThreadPoolSleeping: This class is the student's
 76 |  * optimized implementation of a parallel task execution engine that uses
 77 |  * a thread pool. See definition of ITaskSystem in
 78 |  * itasksys.h for documentation of the ITaskSystem interface.
 79 |  */
 80 | class TaskSystemParallelThreadPoolSleeping: public ITaskSystem {
 81 |     public:
 82 |         TaskSystemParallelThreadPoolSleeping(int num_threads);
 83 |         ~TaskSystemParallelThreadPoolSleeping();
 84 |         const char* name();
 85 |         void run(IRunnable* runnable, int num_total_tasks);
 86 |         TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks,
 87 |                                 const std::vector<TaskID>& deps);
 88 |         void sync();
 89 | private:
 90 |     // Bookkeeping record for one task.
 91 |     struct TaskInfo {
 92 |         TaskID id; // ID of the task
 93 |         IRunnable* runnable;
 94 |         int num_total_task; // total number of tasks (work items) in this task
 95 |         int num_done_work; // work items of this task that have already completed
 96 |         TaskInfo(TaskID _id, IRunnable* _runnable, int _num_total_task, int _num_donw_work):
 97 |             id(_id), runnable(_runnable), num_total_task(_num_total_task), num_done_work(_num_donw_work) {}
 98 |         // std::atomic<int> num_done_work; // work items of this task that have already completed
 99 |         // TaskInfo(TaskID _id, IRunnable* _runnable, int _num_total_task):
100 |         //     id(_id), runnable(_runnable), num_total_task(_num_total_task){}
101 |     };
102 | 
103 |     // One entry of the work queue.
104 |     struct WorkInfo {
105 |         TaskID id; // task that this work item belongs to
106 |         int cur_index; // NOTE(review): presumably the index passed to runTask — confirm in tasksys.cpp
107 |     };
108 | 
109 |     bool stop_; // NOTE(review): presumably tells worker threads to exit — confirm
110 | 
111 |     TaskID global_task_id_; // NOTE(review): presumably the next TaskID to hand out — confirm
112 |     std::mutex lk_;
113 |     std::condition_variable cv_worker_; // queue that the worker threads wait on
114 |     std::condition_variable cv_main_; // queue that the thread calling sync waits on
115 | 
116 |     // Total number of tasks that have been added but not yet finished, including those whose conditions are not yet met
117 |     int num_all_undone_task;
118 | 
119 |     std::vector<std::thread> threads_; // all worker threads
120 |     std::vector<std::vector<int>> graph_; // vector-based replacement for the commented-out map below
121 |     // std::unordered_map<TaskID, std::vector<int>> graph_; // maintains the current graph
122 |     std::vector<int> in_degree_; // vector-based replacement for the commented-out map below
123 |     // std::unordered_map<TaskID, int> in_degree_; // in-degree of each task
124 |     std::queue<WorkInfo> tasks_; //  all work items that need to be executed
125 |     std::vector<TaskInfo> task_info_; // vector-based replacement for the commented-out map below
126 |     // std::unordered_map<TaskID, TaskInfo> task_info_; // information about each task
127 |     static constexpr int N = 1024; // NOTE(review): presumably the capacity used to size the vectors above — confirm
128 | };
129 | 
130 | #endif
131 | 


--------------------------------------------------------------------------------
/asst2/tutorial/Makefile:
--------------------------------------------------------------------------------
 1 | CXX=g++ -m64
 2 | CXXFLAGS=-O3 -std=c++11 -Wall
 3 | 
 4 | APP_NAME=tutorial
 5 | # Default goal: build the $(APP_NAME) binary.
 6 | default: $(APP_NAME)
 7 | # Note: no `dirs` target is defined in this Makefile; it is only listed as phony.
 8 | .PHONY: dirs clean
 9 | 
10 | clean:
11 | 	/bin/rm -rf $(APP_NAME)
12 | # `clean` is a prerequisite, so the binary is rebuilt from $(APP_NAME).cpp on every invocation.
13 | $(APP_NAME): clean
14 | 	$(CXX) $(CXXFLAGS) -o $@ $@.cpp -lpthread
15 | 


--------------------------------------------------------------------------------
/asst2/tutorial/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # A Primer on C++ Synchronization #
 3 | 
 4 | Your programming assignment 2 solutions will certainly need to create threads, and may need to make use of two types of synchronization primitives: mutexes and condition variables.  The following notes explain these two types of synchronization.
 5 | 
 6 | We provide you basic examples of creating C++ threads, locking/unlocking mutexes, and using condition variables in the file `tutorial/tutorial.cpp` provided in the starter code.
 7 | 
 8 | ## Creating C++ Threads ##
 9 | 
10 | Creating new threads in C++ is simple.  To create threads, an application constructs new instances of the `std::thread` object.  For example, in the code below, the main thread creates two threads that run the function `my_func`. (Observe that the function `my_func` is used as an argument to the `std::thread` constructor.)  The main thread invokes `join()` to determine when the execution of a spawned thread has completed.
11 | 
12 |     #include <thread>
13 |     #include <stdio.h>
14 | 
15 |     void my_func(int thread_id, int num_threads) {
16 |     	printf("Hello from spawned thread %d of %d\n", thread_id, num_threads);
17 |     }
18 | 
19 |     int main(int argc, char** argv) {
20 |     
21 |       std::thread t0 = std::thread(my_func, 0, 2);
22 |       std::thread t1 = std::thread(my_func, 1, 2);
23 |     
24 |       printf("The main thread is running concurrently with spawned threads.\n");
25 |     
26 |       t0.join();
27 |       t1.join();
28 |     
29 |       printf("Spawned threads have terminated at this point.\n");
30 |     
31 |       return 0;
32 |     }
33 | 
34 | Full documentation of `std::thread` can be found here: <https://en.cppreference.com/w/cpp/thread/thread>.
35 | 
36 | Useful tutorials on creating threads in C++ 11:
37 | 
38 |  * <https://www.geeksforgeeks.org/multithreading-in-cpp/>
39 |  * <https://thispointer.com/c-11-multithreading-part-1-three-different-ways-to-create-threads/>
40 | 
41 | ## Mutexes ##
42 | 
43 | C++ standard library provides a mutex synchronization primitive, `std::mutex`, for protecting shared data from simultaneous access by multiple application threads. (Note: mutex is short for "mutual exclusion").  
44 | 
45 | <https://en.cppreference.com/w/cpp/thread/mutex>
46 | 
47 | You have already encountered mutexes in prior courses like CS110.  A thread locks the mutex using `mutex::lock()`. The calling thread will block until the mutex lock can be acquired.  When `lock()` returns to the caller, the calling thread is guaranteed to have the lock.  A thread unlocks the mutex using `mutex::unlock()`.
48 | 
49 | For those interested, C++ provides a number of wrapper classes that are designed to reduce bugs when using locks (e.g., forgetting to unlock a mutex).  You may wish to look at the definitions of [`std::unique_lock`](https://en.cppreference.com/w/cpp/thread/unique_lock) and [`std::lock_guard`](https://en.cppreference.com/w/cpp/thread/lock_guard).  For example, `lock_guard` automatically locks a specified mutex on construction, and unlocks the mutex when it goes out of scope.
50 | 
51 | We recommend that you take a look at the function `mutex_example()` in `tutorial/tutorial.cpp` for a simple example of using a mutex to protect updates to a shared counter.  In this example, the mutex is used to ensure the read-modify-write to the counter is performed atomically.
52 | 
53 | ## Condition Variables ##
54 | 
55 | A condition variable manages a list of threads waiting for a condition to hold (e.g., an event to occur), and allows other threads to notify the waiting threads that the event of interest has occurred. A condition variable, when used in conjunction with a mutex, provides an easy way to send notifications between threads.
56 | 
57 | There are two major operations on a condition variable: waiting (`wait()`) and notifying (`notify_one()` / `notify_all()`).
58 | 
59 | A thread calls `wait(lock)` to indicate it wishes to wait until a notification from another thread.  Notice that a mutex (wrapped in a `std::unique_lock`) is passed to the call to `wait()`.  When the thread is notified, the condition variable will re-acquire the lock.  This means that when a call to `wait()` returns, the calling thread is the current holder of the lock.  Typically the lock is used to protect a shared variable that the thread now needs to check to ensure the condition it is waiting for is true.  
60 | 
61 | For example, the code in `tutorial/tutorial.cpp` creates N threads.  N-1 of the threads wait for notification from thread 0, and then when notified, atomically increment a counter that is protected by a shared mutex.
62 | 
63 | A thread calls `notify_one()` on a condition variable to notify one thread waiting on the condition variable and `notify_all()` to notify all threads waiting on the condition variable.  Notice how in `tutorial/tutorial.cpp`, thread 0 releases the lock protecting the counter prior to signaling all the waiting threads.
64 | 
65 | In your task execution system implementation, how might you use `notify_all()`?  Consider a situation where all worker threads are currently waiting for a new bulk task launch, and the application makes a call to `run()`, providing new tasks to execute.
66 | 
67 | 
68 | Additional references:
69 | 
70 | * <https://thispointer.com/c11-multithreading-part-7-condition-variables-explained>
71 | * <https://www.modernescpp.com/index.php/condition-variables>
72 | 
73 | ## C++ Atomics ##
74 | 
75 | C++ also provides a simple way to make operations on a variable atomic---just create a variable of the type `std::atomic<T>`. For example to create an integer that supports atomic increment, just create a variable of type:
76 | 
77 |     std::atomic<int> my_counter;
78 | 
79 | Now operations on `my_counter`, like `my_counter++` are guaranteed to be performed atomically.  For more detail see: <https://en.cppreference.com/w/cpp/atomic/atomic>.
80 | 


--------------------------------------------------------------------------------
/asst2/tutorial/tutorial.cpp:
--------------------------------------------------------------------------------
  1 | #include <condition_variable>
  2 | #include <mutex>
  3 | #include <thread>
  4 | 
  5 | #include <stdio.h>
  6 | 
  7 | /*
  8 |  Wrapper class around an integer counter and a heap-allocated mutex.
  9 |  */
 10 | class Counter {
 11 |     public:
 12 |         int counter_; // starts at 0
 13 |         std::mutex* mutex_; // owned: allocated in the constructor, freed in the destructor
 14 |         Counter() {
 15 |             counter_ = 0;
 16 |             mutex_ = new std::mutex();
 17 |         }
 18 |         ~Counter() {
 19 |             delete mutex_;
 20 |         }
 21 | };
 22 | 
 23 | void increment_counter_fn(Counter* counter) { // Thread body: performs 10000 locked increments of counter->counter_.
 24 |     for (int i = 0; i < 10000; i++) {
 25 |         // Call lock() method to acquire lock.
 26 |         counter->mutex_->lock();
 27 |         // Since multiple threads are trying to perform an increment, the
 28 |         // increment needs to be protected by a mutex.
 29 |         counter->counter_++;
 30 |         // Call unlock() method to release lock.
 31 |         counter->mutex_->unlock();
 32 |     }
 33 | }
 34 | 
 35 | /*
 36 |  * Spawns 8 threads that each increment a shared counter 10,000 times.
 37 |  */
 38 | void mutex_example() {
 39 |     int num_threads = 8;
 40 | 
 41 |     printf("==============================================================\n");
 42 |     printf("Starting %d threads to increment counter...\n", num_threads);
 43 |     std::thread* threads = new std::thread[num_threads];
 44 |     Counter* counter = new Counter();
 45 |     // `num_threads` threads will call `increment_counter_fn`, trying to
 46 |     // increment `counter`.
 47 |     for (int i = 0; i < num_threads; i++) {
 48 |         threads[i] = std::thread(increment_counter_fn, counter);
 49 |     }
 50 |     // Wait for spawned threads to complete.
 51 |     for (int i = 0; i < num_threads; i++) {
 52 |         threads[i].join();
 53 |     }
 54 |     // Print the final counter value; it should be (10000 * `num_threads`).
 55 |     printf("Final counter value: %d...\n", counter->counter_);
 56 |     printf("==============================================================\n");
 57 |     // Release the heap-allocated counter and thread array.
 58 |     delete counter;
 59 |     delete[] threads;
 60 | }
 61 | 
 62 | /*
 63 |  * Wrapper class around a counter, a condition variable, and a mutex.
 64 |  */
 65 | class ThreadState {
 66 |     public:
 67 |         std::condition_variable* condition_variable_; // owned: allocated in the constructor, freed in the destructor
 68 |         std::mutex* mutex_; // owned: allocated in the constructor, freed in the destructor
 69 |         int counter_; // starts at 0
 70 |         int num_waiting_threads_; // set from the constructor argument
 71 |         ThreadState(int num_waiting_threads) {
 72 |             condition_variable_ = new std::condition_variable();
 73 |             mutex_ = new std::mutex();
 74 |             counter_ = 0;
 75 |             num_waiting_threads_ = num_waiting_threads;
 76 |         }
 77 |         ~ThreadState() {
 78 |             delete condition_variable_;
 79 |             delete mutex_;
 80 |         }
 81 | };
 82 | 
 83 | void signal_fn(ThreadState* thread_state) { // Thread body: calls notify_all() repeatedly until counter_ reaches num_waiting_threads_.
 84 |     // Acquire mutex to make sure the shared counter is read in a
 85 |     // consistent state.
 86 |     thread_state->mutex_->lock();
 87 |     while (thread_state->counter_ < thread_state->num_waiting_threads_) {
 88 |         thread_state->mutex_->unlock();
 89 |         // Release the mutex before calling `notify_all()` to make sure
 90 |         // waiting threads have a chance to make progress.
 91 |         thread_state->condition_variable_->notify_all();
 92 |         // Re-acquire the mutex to read the shared counter again.
 93 |         thread_state->mutex_->lock();
 94 |     }
 95 |     thread_state->mutex_->unlock(); // the loop always exits with the mutex held
 96 | }
 97 | 
 98 | void wait_fn(ThreadState* thread_state) { // Thread body: waits once on the condition variable, then increments counter_.
 99 |     // A lock must be held in order to wait on a condition variable.
100 |     // This lock is atomically released before the thread goes to sleep
101 |     // when `wait()` is called. The lock is re-acquired before `wait()`
102 |     // returns, i.e. after the thread is woken up using `notify_all()`.
103 |     std::unique_lock<std::mutex> lk(*thread_state->mutex_);
104 |     thread_state->condition_variable_->wait(lk); // no predicate is passed, so any wakeup proceeds to the increment
105 |     // Increment the shared counter with the lock re-acquired to inform the
106 |     // signaling thread that this waiting thread has successfully been
107 |     // woken up.
108 |     thread_state->counter_++;
109 |     printf("Lock re-acquired after wait()...\n");
110 |     lk.unlock(); // explicit release; the unique_lock destructor would otherwise unlock at scope exit
111 | }
112 | 
113 | /*
114 |  * Signaling thread spins until each waiting thread increments a shared
115 |  * counter after being woken up from the `wait()` method.
116 |  */
117 | void condition_variable_example() {
118 |     int num_threads = 3;
119 | 
120 |     printf("==============================================================\n");
121 |     printf("Starting %d threads for signal-and-waiting...\n", num_threads);
122 |     std::thread* threads = new std::thread[num_threads];
123 |     ThreadState* thread_state = new ThreadState(num_threads-1); // every thread except thread 0 is a waiter
124 |     threads[0] = std::thread(signal_fn, thread_state); // thread 0 is the signaling thread
125 |     for (int i = 1; i < num_threads; i++) {
126 |         threads[i] = std::thread(wait_fn, thread_state);
127 |     }
128 |     for (int i = 0; i < num_threads; i++) {
129 |         threads[i].join();
130 |     }
131 |     printf("==============================================================\n");
132 |     // Release the heap-allocated state and thread array.
133 |     delete thread_state;
134 |     delete[] threads;
135 | }
136 | 
137 | 
138 | int main(int argc, char** argv) { // Runs the mutex example, then the condition-variable example; arguments are unused.
139 |    mutex_example();
140 |    condition_variable_example();
141 | }
142 | 


--------------------------------------------------------------------------------
/asst3/cloud_readme.md:
--------------------------------------------------------------------------------
  1 | # AWS Setup Instructions #
  2 | 
  3 | For performance testing, you will need to run your code on a VM instance on Amazon Web Services (AWS). We've already sent you student coupons that you can use for billing purposes. Here are the steps for how to get set up for running on AWS.
  4 | 
  5 | NOTE: __Please don't forget to SHUT DOWN your instances when you're done for the day to avoid burning through credits overnight!__
  6 | 
  7 | ### Creating a VM with a GPU ###
  8 |       
  9 | 1. Now you're ready to create a VM instance. Click on the button that says `Launch Instances`. Choose the `Ubuntu Server 20.04 LTS (HVM), SSD Volume Type` AMI:
 10 | ![AMI Selection](handout/choose_ami.png?raw=true)
 11 | 
 12 | 2. Choose the `g4dn.xlarge` GPU Instance Type and then click `4. Add Storage` on the top bar: 
 13 | ![GPU instance](handout/choose_instance.png?raw=true)
 14 | 
 15 | 3. Change the size of the `Root` volume to 64 GB to accommodate the packages we will need to install to make the instance functional for the assignment:
 16 | ![Storage](handout/choose_storage.png?raw=true)
 17 | 
 18 | 5. AWS will ask you to select a key pair. Click the first dropdown and choose `Create a new key pair` and give it whatever name you'd like. This will download a keyfile to your computer called `<key_name>.pem` which you will use to login to the VM instance you are about to create. Finally, click `Launch Instances`.
 19 | ![Key Pair](handout/new_key_pair.png?raw=true)
 20 | 
 21 | __Note: `g4dn.xlarge` instances cost $0.526 / hour, so leaving one running for a whole day will consume $12.624 worth of your AWS coupon.__
 22 | 
 23 | 4. Now that you've created your VM, you should be able to __SSH__ into it. You need the public IPv4 DNS name to SSH into it, which you can find on the instance page by clicking the `View Instances` button on the current page and then the instance ID for your created instance (note, it may take a moment for the instance to startup and be assigned an IP address):
 24 | ![Public DNS Name](handout/public_dns.png?raw=true)
 25 | Once you have the IP address, you can login to the instance by running this command:
 26 | ~~~~
 27 | ssh -i path/to/key_name.pem ubuntu@<public_dns_name>
 28 | ~~~~
 29 | 
 30 | ### Setting up the VM environment ###
 31 | 
 32 | We have included a convenience script, __install.sh__, which performs steps 5,6 and 7 for you. To run it, do:
 33 | 
 34 | ~~~~
 35 | chmod +x install.sh
 36 | sudo ./install.sh
 37 | source ~/.bashrc
 38 | ~~~~
 39 | 
 40 | If for some reason the script does not work, the manual instructions follow:
 41 | 
 42 | ### Manually setting up the VM environment ###
 43 | 
 44 | 5. Once you SSH into your VM instance, you'll want to install whatever software you need to make the machine a useful development environment for you.  For example we recommend:
 45 | ~~~~
 46 | sudo apt update
 47 | sudo apt install make g++ freeglut3-dev # Required
 48 | sudo apt install vim 
 49 | ~~~~
 50 | 
 51 | ### Installing CUDA ###    
 52 | 
 53 | 6. Now you need to download the CUDA 11 runtime from NVIDIA. SSH into your AWS instance and run the following:
 54 | 
 55 | ~~~~
 56 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
 57 | sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
 58 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub
 59 | sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
 60 | sudo apt-get update
 61 | sudo apt-get -y install cuda
 62 | ~~~~
 63 |  
 64 | 7. `nvcc` is the NVIDIA CUDA compiler. The default install locates CUDA binaries in `/usr/local/cuda/bin/`, so you'll want to add this directory to your path.  For example, from a bash shell that would be:
 65 | 
 66 | ~~~~
 67 | export PATH=$PATH:/usr/local/cuda/bin
 68 | ~~~~
 69 | 
 70 | In general we recommend that you perform this `$PATH` update on login, so you can add this line to the end of your `.bashrc` file.  Don't forget to `source .bashrc` if you want to have this modification take effect without logging out and back in to the instance.
 71 | 
 72 | ### Confirming that CUDA has been installed ###
 73 | 
 74 | Suppose you have carried out steps 5-7 or run the __install.sh__ script. At this point CUDA should be installed and you should be able to run the `nvidia-smi` command to make sure everything is set up correctly.  The result of the command should indicate that your VM has one NVIDIA Tesla T4 GPU.
 75 | 
 76 | ~~~~
 77 | ubuntu@ip-172-31-20-116:~/asst3$ nvidia-smi 
 78 | Fri Oct 22 18:08:14 2021       
 79 | +-----------------------------------------------------------------------------+
 80 | | NVIDIA-SMI 495.29.05    Driver Version: 495.29.05    CUDA Version: 11.5     |
 81 | |-------------------------------+----------------------+----------------------+
 82 | | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
 83 | | Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
 84 | |                               |                      |               MIG M. |
 85 | |===============================+======================+======================|
 86 | |   0  Tesla T4            Off  | 00000000:00:1E.0 Off |                    0 |
 87 | | N/A   50C    P0    27W /  70W |      0MiB / 15109MiB |      0%      Default |
 88 | |                               |                      |                  N/A |
 89 | +-------------------------------+----------------------+----------------------+
 90 |                                                                                
 91 | +-----------------------------------------------------------------------------+
 92 | | Processes:                                                                  |
 93 | |  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
 94 | |        ID   ID                                                   Usage      |
 95 | |=============================================================================|
 96 | |  No running processes found                                                 |
 97 | +-----------------------------------------------------------------------------+
 98 | ~~~~
 99 | 
100 | If you're confused about any of the steps, having problems with setting up your account or have any additional questions, reach out to us on Piazza!
101 |   
102 | __Again, please don't forget to STOP your instances when you're done with your work for the day!__
103 | 
104 | ### AWS Setup Trouble Shooting
105 | 1. If you received an error message stating that you are not able to launch additional resources in this region, AWS will validate your request. The validation process should take around 20 minutes. If that is not the case, please email AWS at aws-verification@amazon.com.
106 | ![Unavailable Region](handout/location_limit.png?raw=true)
107 | 
108 | 2. If you received an error message stating that you have requested more vCPU capacity than your current limit, please check your quota.
109 | ![Quota Navigation Bar](handout/vCPU_trouble.png?raw=true)
110 | ![Quota Dashboard](handout/vCPU_dashboard.png?raw=true)
111 | ![Quota Dashboard Search](handout/vCPU_dashboard_2.png?raw=true)
112 | If your Applied quota value is less than 4, please submit a request for quota increase and put 4 as your requested number of vCPUs.
113 | ![Quota Request](handout/quota_request.png?raw=true)
114 | 


--------------------------------------------------------------------------------
/asst3/handout/bug_example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/bug_example.jpg


--------------------------------------------------------------------------------
/asst3/handout/choose_ami.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_ami.png


--------------------------------------------------------------------------------
/asst3/handout/choose_instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_instance.png


--------------------------------------------------------------------------------
/asst3/handout/choose_storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_storage.png


--------------------------------------------------------------------------------
/asst3/handout/dependencies.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/dependencies.jpg


--------------------------------------------------------------------------------
/asst3/handout/gpu_instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/gpu_instance.png


--------------------------------------------------------------------------------
/asst3/handout/gpu_instance.png_original:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/gpu_instance.png_original


--------------------------------------------------------------------------------
/asst3/handout/ip_address.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/ip_address.png


--------------------------------------------------------------------------------
/asst3/handout/location_limit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/location_limit.png


--------------------------------------------------------------------------------
/asst3/handout/navigation_quota.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/navigation_quota.png


--------------------------------------------------------------------------------
/asst3/handout/new_key_pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/new_key_pair.png


--------------------------------------------------------------------------------
/asst3/handout/order.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/order.jpg


--------------------------------------------------------------------------------
/asst3/handout/point_in_circle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/point_in_circle.jpg


--------------------------------------------------------------------------------
/asst3/handout/public_dns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/public_dns.png


--------------------------------------------------------------------------------
/asst3/handout/quota_request.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/quota_request.png


--------------------------------------------------------------------------------
/asst3/handout/teaser.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/teaser.jpg


--------------------------------------------------------------------------------
/asst3/handout/vCPU_dashboard.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_dashboard.png


--------------------------------------------------------------------------------
/asst3/handout/vCPU_dashboard_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_dashboard_2.png


--------------------------------------------------------------------------------
/asst3/handout/vCPU_trouble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_trouble.png


--------------------------------------------------------------------------------
/asst3/install.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # A helper script to install CUDA for Ubuntu 20.04
 4 | # NOTE(review): writes under /etc/apt and calls dpkg/apt-get, so it presumably has to run as root -- confirm.
 5 | set -e  # stop at the first command that fails
 6 | # Fetch the repository pin file and the CUDA 11.5.0 local-installer package, then register both with apt.
 7 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
 8 | mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
 9 | wget https://developer.download.nvidia.com/compute/cuda/11.5.0/local_installers/cuda-repo-ubuntu2004-11-5-local_11.5.0-495.29.05-1_amd64.deb
10 | dpkg -i cuda-repo-ubuntu2004-11-5-local_11.5.0-495.29.05-1_amd64.deb
11 | apt-key add /var/cuda-repo-ubuntu2004-11-5-local/7fa2af80.pub
12 | # Refresh the package index, then install the compiler, make, CUDA and the freeglut development package.
13 | apt-get update
14 | apt-get install -y g++ make cuda freeglut3-dev
15 | 
16 | # Append nvcc's bin directory to PATH in the "ubuntu" user's .bashrc (the home path is hard-coded)
17 | echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /home/ubuntu/.bashrc
18 | 


--------------------------------------------------------------------------------
/asst3/render/Makefile:
--------------------------------------------------------------------------------
 1 | # Builds the `render` executable from the C++ sources (g++) and cudaRenderer.cu (nvcc).
 2 | EXECUTABLE := render
 3 | 
 4 | CU_FILES   := cudaRenderer.cu
 5 | 
 6 | CU_DEPS    :=
 7 | 
 8 | CC_FILES   := main.cpp display.cpp benchmark.cpp refRenderer.cpp \
 9 |               noise.cpp ppm.cpp sceneLoader.cpp
10 | 
11 | LOGS	   := logs
12 | 
13 | ###########################################################
14 | 
15 | ARCH=$(shell uname | sed -e 's/-.*//g')
16 | OBJDIR=objs
17 | CXX=g++ -m64
18 | CXXFLAGS=-O3 -Wall -g -std=c++11
19 | HOSTNAME=$(shell hostname)
20 | 
21 | LIBS       :=
22 | FRAMEWORKS :=
23 | 
24 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61 -std=c++11
25 | LIBS += GL glut cudart
26 | # Pick the CUDA runtime library directory depending on which installation is present.
27 | ifneq ($(wildcard /opt/cuda-8.0/.*),)
28 | # Latedays
29 | LDFLAGS=-L/opt/cuda-8.0/lib64/ -lcudart
30 | else
31 | # GHC
32 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart
33 | endif
34 | 
35 | LDLIBS  := $(addprefix -l, $(LIBS))
36 | LDFRAMEWORKS := $(addprefix -framework , $(FRAMEWORKS))
37 | 
38 | NVCC=nvcc
39 | 
40 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/display.o $(OBJDIR)/benchmark.o $(OBJDIR)/refRenderer.o \
41 |      $(OBJDIR)/cudaRenderer.o $(OBJDIR)/noise.o $(OBJDIR)/ppm.o $(OBJDIR)/sceneLoader.o
42 | # None of these targets names a file, so all of them are declared phony; otherwise
43 | # a stray file called `default` or `check` would silently disable the target.
44 | .PHONY: default dirs clean check
45 | 
46 | default: $(EXECUTABLE)
47 | 
48 | dirs:
49 | 		mkdir -p $(OBJDIR)/
50 | 
51 | clean:
52 | 		rm -rf $(OBJDIR) *~ $(EXECUTABLE) $(LOGS)
53 | 
54 | check:	default
55 | 		./checker.pl
56 | # `dirs` is an order-only prerequisite (after `|`) of each object: made first, but it never forces a rebuild or relink.
57 | $(EXECUTABLE): $(OBJS)
58 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) $(LDFRAMEWORKS)
59 | 
60 | $(OBJDIR)/%.o: %.cpp | dirs
61 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
62 | 
63 | $(OBJDIR)/%.o: %.cu | dirs
64 | 		$(NVCC) $< $(NVCCFLAGS) -c -o $@
65 | 


--------------------------------------------------------------------------------
/asst3/render/checker.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | # Grades ./render: checks every scene for correctness and times the scored scenes against ./render_ref.
  3 | use POSIX;
  4 | 
  5 | my @scene_names = ("rgb", "rgby", "rand10k", "rand100k", "biglittle", "littlebig", "pattern", "bouncingballs", "hypnosis", "fireworks", "snow", "snowsingle");
  6 | my @score_scene_names = ("rgb", "rand10k", "rand100k", "pattern", "snowsingle", "biglittle");
  7 | 
  8 | my %fast_times;
  9 | 
 10 | my $perf_points = 10;
 11 | my $correctness_points = 2;
 12 | 
 13 | my %correct;
 14 | 
 15 | my %your_times;
 16 | # Start every run from an empty ./logs directory.
 17 | `mkdir -p logs`;
 18 | `rm -rf logs/*`;
 19 | 
 20 | print "\n";
 21 | print ("--------------\n");
 22 | my $hostname = `hostname`;
 23 | print ("Running tests on $hostname\n");
 24 | print ("--------------\n");
 25 | 
 26 | my $render_ref = "render_ref";
 27 | 
 28 | foreach my $scene (@scene_names) {
 29 |     print ("\nScene : $scene\n");
 30 |     my @sys_stdout = system ("./render -c $scene -s 1024 > ./logs/correctness_${scene}.log");
 31 |     my $return_value  = $?;
 32 |     if ($return_value == 0) {
 33 |         print ("Correctness passed!\n");
 34 |         $correct{$scene} = 1;
 35 |     }
 36 |     else {
 37 |         print ("Correctness failed ... Check ./logs/correctness_${scene}.log\n");
 38 |         $correct{$scene} = 0;
 39 |     }
 40 |     # Time only the scored scenes.  grep replaces the smartmatch operator (~~), which current perls deprecate.
 41 |     if (grep { $_ eq $scene } @score_scene_names) {
 42 |         my $your_time = `./render -r cuda -b 0:4 $scene -s 1024 | tee ./logs/time_${scene}.log | grep Total:`;
 43 |         chomp($your_time);
 44 |         $your_time =~ s/^[^0-9]*//;
 45 |         $your_time =~ s/ ms.*//;
 46 | 
 47 |         print ("Your time : $your_time\n");
 48 |         $your_times{$scene} = $your_time;
 49 |         # NOTE: the reference run tees into the same time_<scene>.log and so overwrites the log written above.
 50 |         my $fast_time = `./$render_ref -r cuda -b 0:4 $scene -s 1024 | tee ./logs/time_${scene}.log | grep Total:`;
 51 |         chomp($fast_time);
 52 |         $fast_time =~ s/^[^0-9]*//;
 53 |         $fast_time =~ s/ ms.*//;
 54 | 
 55 |         print ("Ref Time: $fast_time\n");
 56 |         $fast_times{$scene} = $fast_time;
 57 |     }
 58 | }
 59 | 
 60 | print "\n";
 61 | print ("------------\n");
 62 | print ("Score table:\n");
 63 | print ("------------\n");
 64 | 
 65 | my $header = sprintf ("| %-15s | %-16s | %-15s | %-15s |\n", "Scene Name", "Ref Time (T_ref)", "Your Time (T)", "Score");
 66 | my $dashes = $header;
 67 | $dashes =~ s/./-/g;
 68 | print $dashes;
 69 | print $header;
 70 | print $dashes;
 71 | 
 72 | my $total_score = 0;
 73 | # Scoring: full marks within 1.2x of the reference, correctness points only beyond 10x, ceil-scaled in between.
 74 | foreach my $scene (@score_scene_names){
 75 |     my $score;
 76 |     my $your_time = $your_times{$scene};
 77 |     my $fast_time = $fast_times{$scene};
 78 | 
 79 |     if ($correct{$scene}) {
 80 |         if ($your_time <= 1.20 * $fast_time) {
 81 |             $score = $perf_points + $correctness_points;
 82 |         }
 83 |         elsif ($your_time > 10 * $fast_time) {
 84 |             $score = $correctness_points;
 85 |         }
 86 |         else {
 87 |             $score = $correctness_points + ceil ($perf_points * ($fast_time /$your_time));
 88 |         }
 89 |     }
 90 |     else {
 91 |         $your_time .= " (F)";
 92 |         $score = 0;
 93 |     }
 94 | 
 95 |     printf ("| %-15s | %-16s | %-15s | %-15s |\n", "$scene", "$fast_time", "$your_time", "$score");
 96 |     $total_score += $score;
 97 | }
 98 | print $dashes;
 99 | printf ("| %-15s   %-16s | %-15s | %-15s |\n", "", "", "Total score:",
100 |     $total_score . "/" . ($perf_points+$correctness_points) * ($#score_scene_names + 1));
101 | print $dashes;
102 | 


--------------------------------------------------------------------------------
/asst3/render/checker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | import subprocess
  4 | import os
  5 | import shutil
  6 | import re
  7 | import math
  8 | 
  9 | perf_pts = 10  # performance points available per scored scene
 10 | correctness_pts = 2  # correctness points available per scored scene
 11 | # Every scene in scene_names is checked for correctness; only the score_scene_names subset is timed and scored.
 12 | # scene_names = ["rand100k"]
 13 | # score_scene_names = {"rand100k"}
 14 | scene_names = ["rgb", "rgby", "rand10k", "rand100k", "biglittle", "littlebig", "pattern", "bouncingballs", "hypnosis", "fireworks", "snow", "snowsingle"]
 15 | score_scene_names_list = ["rgb", "rand10k", "rand100k", "pattern", "snowsingle", "biglittle"]
 16 | score_scene_names = set(score_scene_names_list)
 17 | # (the list fixes the row order of the score table; the set is used for membership tests)
 18 | #### LOGS MANAGEMENT ####
 19 | # Set up a new logs dir (remove old logs dir, create new logs dir)
 20 | if os.path.isdir('logs'):
 21 |     shutil.rmtree('logs')
 22 | os.mkdir('logs')
 23 | 
 24 | # Helper functions to convert scene names to appropriate log file names
 25 | def correctness_log_file(scene):
 26 |     return "./logs/correctness_%s.log" % scene
 27 | 
 28 | def time_log_file(scene):
 29 |     return "./logs/time_%s.log" % scene
 30 | #### END OF LOGS MANAGEMENT ####
 31 | 
 32 | #### RUNNING THE RENDERERS ####
 33 | def check_correctness(render_cmd, scene):
 34 |     cmd_string = "./%s -c %s -s 1024 > %s" % (render_cmd, scene, correctness_log_file(scene))
 35 |     # print("Checking correctness: %s" % cmd_string)
 36 | 
 37 |     # Actually run it
 38 |     result = subprocess.run([cmd_string], shell=True)
 39 | 
 40 |     return result.returncode == 0
 41 | 
 42 | # Run a renderer one time and get the time taken
 43 | def get_time(render_cmd, scene):
 44 |     # print("get_time %s %s" % (render_cmd, scene))
 45 |     cmd_string = "./%s -r cuda -b 0:4 %s -s 1024 | tee %s | grep Total:" % (render_cmd, scene, time_log_file(scene))
 46 | 
 47 |     # Actually run the renderer
 48 |     result = subprocess.run([cmd_string], shell=True, capture_output=True)
 49 | 
 50 |     # Extract the time taken
 51 |     time = float(re.search(r'\d+\.\d+', str(result.stdout)).group())
 52 |     return time
 53 | #### END OF RUNNING THE RENDERERS ####
 54 | 
 55 | 
 56 | # Run all scenes. Some of them are for performance.
 57 | def run_scenes(n_runs):
 58 |     correct = {}
 59 |     stu_times = {}
 60 |     ref_times = {}
 61 |     for scene in scene_names:
 62 |         print("\nRunning scene: %s..." % (scene))
 63 | 
 64 |         # Check for correctness
 65 |         correct[scene] = check_correctness("render", scene)
 66 |         if not correct[scene]:
 67 |             print("[%s] Correctness failed ... Check %s" % (scene, correctness_log_file(scene)))
 68 |         else:
 69 |             print("[%s] Correctness passed!" % scene)
 70 | 
 71 |         # Check for performance
 72 |         if scene in score_scene_names:
 73 | 
 74 |             # Do multiple perf runs
 75 |             stu_times[scene] = [get_time("render", scene) for _ in range(n_runs)]
 76 |             ref_times[scene] = [get_time("render_ref", scene) for _ in range(n_runs)]
 77 | 
 78 |             print("[%s] Student times: " % (scene), stu_times[scene])
 79 |             print("[%s] Reference times: " % (scene), ref_times[scene])
 80 | 
 81 |     return correct, stu_times, ref_times
 82 | 
 83 | # Compute scores
 84 | def score_table(correct, stu_times, ref_times):
 85 |     print("------------")
 86 |     print("Score table:")
 87 |     print("------------")
 88 |     header = "| %-15s | %-16s | %-15s | %-15s |" % ("Scene Name", "Ref Time (T_ref)", "Your Time (T)", "Score")
 89 |     dashes = "-"*len(header)
 90 |     print(dashes)
 91 |     print(header)
 92 |     print(dashes)
 93 | 
 94 |     total_score = 0
 95 | 
 96 |     for scene in score_scene_names_list:
 97 |         stu_time = min(stu_times[scene])
 98 |         ref_time = min(ref_times[scene])
 99 |         if correct[scene]:
100 |             if stu_time <= 1.2 * ref_time:
101 |                 score = perf_pts + correctness_pts
102 |             elif stu_time > 10 * ref_time:
103 |                 score = correctness_pts
104 |             else:
105 |                 score = correctness_pts + math.ceil(perf_pts * (ref_time / stu_time))
106 |         else:
107 |             score = 0
108 | 
109 |         print("| %-15s | %-16s | %-15s | %-15s |" % (scene, ref_time, stu_time if correct[scene] else "(F)", score));
110 |         total_score += score
111 | 
112 |     print(dashes)
113 | 
114 |     max_total_score = (perf_pts + correctness_pts) * len(score_scene_names)
115 |     score_string = "%s/%s" % (total_score, max_total_score)
116 |     print("| %-15s   %-16s | %-15s | %-15s |" % ("", "", "Total score:", score_string))
117 | 
118 |     print(dashes)
119 | 
120 | correct, stu_times, ref_times = run_scenes(3)  # three timing runs per scored scene, per renderer
121 | score_table(correct, stu_times, ref_times)  # the table uses the minimum of those runs
122 | 


--------------------------------------------------------------------------------
/asst3/render/circleBoxTest.cu_inl:
--------------------------------------------------------------------------------
 1 | 
 2 | // circleInBoxConservative --
 3 | //
 4 | // Cheap, conservative overlap test between a circle and an
 5 | // axis-aligned box.  The circle is given by its center (circleX,
 6 | // circleY) and `circleRadius`; the box by its left and right sides
 7 | // (boxL, boxR) and its top and bottom edges (boxT, boxB).
 8 | //
 9 | // Returns 0 only when the circle definitely does not intersect the
10 | // box.  A result of 1 means the circle *may* intersect it: follow up
11 | // with exact point-in-circle tests, or with a call to circleInBox(),
12 | // when a definitive answer is required.
13 | __device__ __inline__ int
14 | circleInBoxConservative(
15 |     float circleX, float circleY, float circleRadius,
16 |     float boxL, float boxR, float boxT, float boxB)
17 | {
18 |     // Grow the box by the radius on every side; the circle can only
19 |     // touch the original box if its center lies inside the grown box.
20 |     const float minX = boxL - circleRadius;
21 |     const float maxX = boxR + circleRadius;
22 |     const float minY = boxB - circleRadius;
23 |     const float maxY = boxT + circleRadius;
24 | 
25 |     // Same >= / <= comparisons as a single chained condition.
26 |     const bool insideX = (circleX >= minX) && (circleX <= maxX);
27 |     const bool insideY = (circleY >= minY) && (circleY <= maxY);
28 | 
29 |     return (insideX && insideY) ? 1 : 0;
30 | }
31 | 
32 | 
33 | // circleInBox --
34 | //
35 | // Exact circle/box overlap test.  It costs more than
36 | // circleInBoxConservative above, but its 1/0 result is definitive:
37 | // 1 if the circle touches or overlaps the box, 0 otherwise.
38 | __device__ __inline__ int
39 | circleInBox(
40 |     float circleX, float circleY, float circleRadius,
41 |     float boxL, float boxR, float boxT, float boxB)
42 | {
43 |     // Clamp the center into [boxL, boxR] x [boxB, boxT]; the result
44 |     // is the point of the box closest to the circle center.
45 |     float nearestX = boxL;
46 |     if (circleX > boxL)
47 |         nearestX = (circleX < boxR) ? circleX : boxR;
48 |     float nearestY = boxB;
49 |     if (circleY > boxB)
50 |         nearestY = (circleY < boxT) ? circleY : boxT;
51 | 
52 |     // Compare squared distances so that no square root is needed.
53 |     const float dx = nearestX - circleX;
54 |     const float dy = nearestY - circleY;
55 |     const float radiusSq = circleRadius * circleRadius;
56 | 
57 |     return (((dx*dx) + (dy*dy)) <= radiusSq) ? 1 : 0;
58 | }
59 | 
60 | 


--------------------------------------------------------------------------------
/asst3/render/circleRenderer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CIRCLE_RENDERER_H__
 2 | #define __CIRCLE_RENDERER_H__
 3 | 
 4 | struct Image;
 5 | 
 6 | // fireworks constants
 7 | #define NUM_FIREWORKS 15
 8 | #define NUM_SPARKS 20
 9 | 
10 | typedef enum {  // scene identifiers; the type of the argument to CircleRenderer::loadScene()
11 |     CIRCLE_RGB,
12 |     CIRCLE_RGBY,
13 |     CIRCLE_TEST_10K,
14 |     CIRCLE_TEST_100K,
15 |     PATTERN,
16 |     SNOWFLAKES,
17 |     FIREWORKS, 
18 |     HYPNOSIS, 
19 |     BOUNCING_BALLS, 
20 |     SNOWFLAKES_SINGLE_FRAME,
21 |     BIG_LITTLE,
22 |     LITTLE_BIG
23 | } SceneName;
24 | 
25 | 
26 | class CircleRenderer {
27 |     // Abstract renderer interface: every operation below is pure virtual.
28 | public:
29 | 
30 |     virtual ~CircleRenderer() { };
31 |     // Image handed to the display code, which reads its width, height and data.
32 |     virtual const Image* getImage() = 0;
33 | 
34 |     virtual void setup() = 0;
35 | 
36 |     virtual void loadScene(SceneName name) = 0;
37 | 
38 |     virtual void allocOutputImage(int width, int height) = 0;
39 |     // Per frame, the display loop calls clearImage(), optionally advanceAnimation(), then render().
40 |     virtual void clearImage() = 0;
41 | 
42 |     virtual void advanceAnimation() = 0;
43 | 
44 |     virtual void render() = 0;
45 | 
46 |     //virtual void dumpParticles(const char* filename) {}
47 | 
48 | };
49 | 
50 | 
51 | #endif
52 | 


--------------------------------------------------------------------------------
/asst3/render/cudaRenderer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __CUDA_RENDERER_H__
 2 | #define __CUDA_RENDERER_H__
 3 | 
 4 | #ifndef uint
 5 | #define uint unsigned int
 6 | #endif
 7 | 
 8 | #include "circleRenderer.h"
 9 | 
10 | 
11 | class CudaRenderer : public CircleRenderer {
12 |     // CircleRenderer implementation; only declarations here, the method bodies are defined elsewhere.
13 | private:
14 | 
15 |     Image* image;
16 |     SceneName sceneName;
17 |     // NOTE(review): presumably host-side scene arrays holding data for numCircles circles -- confirm layout in the .cu file.
18 |     int numCircles;
19 |     float* position;
20 |     float* velocity;
21 |     float* color;
22 |     float* radius;
23 |     // NOTE(review): the cudaDevice* prefix suggests these point to device (GPU) allocations -- confirm in the .cu file.
24 |     float* cudaDevicePosition;
25 |     float* cudaDeviceVelocity;
26 |     float* cudaDeviceColor;
27 |     float* cudaDeviceRadius;
28 |     float* cudaDeviceImageData;
29 | 
30 | public:
31 | 
32 |     CudaRenderer();
33 |     virtual ~CudaRenderer();
34 |     // The seven methods below match the pure virtual functions declared in CircleRenderer.
35 |     const Image* getImage();
36 | 
37 |     void setup();
38 | 
39 |     void loadScene(SceneName name);
40 | 
41 |     void allocOutputImage(int width, int height);
42 | 
43 |     void clearImage();
44 | 
45 |     void advanceAnimation();
46 | 
47 |     void render();
48 |     // Not part of the CircleRenderer interface; NOTE(review): its role is not visible from this header.
49 |     void shadePixel(
50 |         int circleIndex,
51 |         float pixelCenterX, float pixelCenterY,
52 |         float px, float py, float pz,
53 |         float* pixelData);
54 | };
55 | 
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/asst3/render/cycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // your process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if your processors' speeds change (i.e. processor
 35 |   // scaling) or if you are in a heterogeneous environment, you will
 36 |   // likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL + static_cast<SysClock>(spec.tv_nsec); // exact ns; float casts would round
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 |     // Return the unit of the values produced by currentTicks() on this platform.
 81 |     static const char* tickUnits() {
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__) // _WIN32 matches the tests used elsewhere in this file
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds.
 93 |     static double secondsPerTick() {
 94 |       static bool initialized = false;
 95 |       static double secondsPerTick_val;
 96 |       if (initialized) return secondsPerTick_val; // computed on the first call, then cached
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 | 
111 |       // Scales to nanoseconds without 1e-9f
112 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113 |         static_cast<double>(time_info.denom);
114 |   #endif // x86_64 or not
115 | #elif defined(_WIN32)
116 |       LARGE_INTEGER qwTicksPerSec;
117 |       QueryPerformanceFrequency(&qwTicksPerSec);
118 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119 | #else
120 |       FILE *fp = fopen("/proc/cpuinfo","r");
121 |       char input[1024];
122 |       if (!fp) {
123 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124 |          exit(-1);
125 |       }
126 |       // In case we don't find it, e.g. on the N900
127 |       secondsPerTick_val = 1e-9;
128 |       while (!feof(fp) && fgets(input, 1024, fp)) {
129 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
130 |         // frequency scaling it's better to read the @ sign first
131 |         float GHz, MHz;
132 |         if (strstr(input, "model name")) {
133 |           char* at_sign = strstr(input, "@");
134 |           if (at_sign) {
135 |             char* after_at = at_sign + 1;
136 |             char* GHz_str = strstr(after_at, "GHz");
137 |             char* MHz_str = strstr(after_at, "MHz");
138 |             if (GHz_str) {
139 |               *GHz_str = '\0';
140 |               if (1 == sscanf(after_at, "%f", &GHz)) {
141 |                 //printf("GHz = %f\n", GHz);
142 |                 secondsPerTick_val = 1e-9f / GHz;
143 |                 break;
144 |               }
145 |             } else if (MHz_str) {
146 |               *MHz_str = '\0';
147 |               if (1 == sscanf(after_at, "%f", &MHz)) {
148 |                 //printf("MHz = %f\n", MHz);
149 |                 secondsPerTick_val = 1e-6f / MHz; // divide by the value just parsed (GHz is unset in this branch)
150 |                 break;
151 |               }
152 |             }
153 |           }
154 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155 |           //printf("MHz = %f\n", MHz);
156 |           secondsPerTick_val = 1e-6f / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return secondsPerTick() * 1000.0;
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst3/render/display.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | 
  3 | #include "circleRenderer.h"
  4 | #include "cycleTimer.h"
  5 | #include "image.h"
  6 | #include "platformgl.h"
  7 | 
  8 | 
  9 | void renderPicture();
 10 | 
 11 | 
 12 | static struct {
 13 |     int width;             // window width: initialised from the image, updated in handleReshape()
 14 |     int height;            // window height: initialised from the image, updated in handleReshape()
 15 |     bool updateSim;        // when true, renderPicture() advances the animation before rendering
 16 |     bool printStats;       // when true, per-frame timings are printed
 17 |     bool pauseSim;         // toggled by 'p'/'P'; while set, updateSim is cleared after each frame
 18 |     double lastFrameTime;  // CycleTimer::currentSeconds() taken at the end of the previous handleDisplay()
 19 | 
 20 |     CircleRenderer* renderer;  // renderer driven by the display callbacks
 21 | 
 22 | } gDisplay;
 23 | 
 24 | // handleReshape --
 25 | //
 26 | // Window-resize handler: adopts the new size and requests a redraw
 27 | void
 28 | handleReshape(int w, int h) {
 29 |     glViewport(0, 0, w, h);
 30 |     gDisplay.height = h;
 31 |     gDisplay.width = w;
 32 |     glutPostRedisplay();
 33 | }
 34 | 
 35 | void
 36 | handleDisplay() {
 37 |     // Display callback: renders one frame, draws it, then requests the next redraw.
 38 |     // simulation and rendering work is done in the renderPicture
 39 |     // function below
 40 | 
 41 |     renderPicture();
 42 | 
 43 |     // the subsequent code uses OpenGL to present the state of the
 44 |     // rendered image on the screen.
 45 | 
 46 |     const Image* img = gDisplay.renderer->getImage();
 47 |     // draw at most the part of the image that fits inside the window
 48 |     int width = std::min(img->width, gDisplay.width);
 49 |     int height = std::min(img->height, gDisplay.height);
 50 | 
 51 |     glDisable(GL_DEPTH_TEST);
 52 |     glClearColor(0.f, 0.f, 0.f, 1.f);
 53 |     glClear(GL_COLOR_BUFFER_BIT);
 54 | 
 55 |     glMatrixMode(GL_PROJECTION);
 56 |     glLoadIdentity();
 57 |     glOrtho(0.f, gDisplay.width, 0.f, gDisplay.height, -1.f, 1.f);
 58 | 
 59 |     glMatrixMode(GL_MODELVIEW);
 60 |     glLoadIdentity();
 61 | 
 62 |     // copy image data from the renderer to the OpenGL
 63 |     // frame-buffer.  This is an inefficient solution if the processing
 64 |     // to generate the image is done in CUDA.  An improved solution
 65 |     // would render to a CUDA surface object (stored in GPU memory),
 66 |     // and then bind this surface as a texture enabling its use in
 67 |     // normal openGL rendering
 68 |     glRasterPos2i(0, 0);
 69 |     glDrawPixels(width, height, GL_RGBA, GL_FLOAT, img->data);
 70 | 
 71 |     double currentTime = CycleTimer::currentSeconds();
 72 |     // time elapsed since the previous call reached this point
 73 |     if (gDisplay.printStats)
 74 |         printf("%.2f ms\n", 1000.f * (currentTime - gDisplay.lastFrameTime));
 75 | 
 76 |     gDisplay.lastFrameTime = currentTime;
 77 | 
 78 |     glutSwapBuffers();
 79 |     glutPostRedisplay();
 80 | }
 81 | 
 82 | 
 83 | // handleKeyPress --
 84 | //
 85 | // Keyboard event handler: q/Q quits (exit status 1), =/+ requests a
 86 | // simulation update, p/P toggles pause.  Other keys are ignored.
 87 | void
 88 | handleKeyPress(unsigned char key, int x, int y) {
 89 |     if (key == 'q' || key == 'Q') {
 90 |         exit(1);
 91 |     }
 92 | 
 93 |     if (key == '=' || key == '+') {
 94 |         gDisplay.updateSim = true;
 95 |         return;
 96 |     }
 97 | 
 98 |     if (key == 'p' || key == 'P') {
 99 |         const bool nowPaused = !gDisplay.pauseSim;
100 |         gDisplay.pauseSim = nowPaused;
101 |         // resuming: let the next frame advance the simulation again
102 |         if (!nowPaused)
103 |             gDisplay.updateSim = true;
104 |     }
105 | }
106 | 
107 | // renderPicture --
108 | //
109 | // All the real work is done here, not in the display handler
110 | void
111 | renderPicture() {
112 | 
113 |     double startTime = CycleTimer::currentSeconds();
114 | 
115 |     // clear screen
116 |     gDisplay.renderer->clearImage();
117 | 
118 |     double endClearTime = CycleTimer::currentSeconds();
119 | 
120 |     // update particle positions and state
121 |     if (gDisplay.updateSim) {
122 |         gDisplay.renderer->advanceAnimation();
123 |     }
124 |     if (gDisplay.pauseSim)
125 |         gDisplay.updateSim = false;  // while paused, advance again only after a key press sets updateSim
126 | 
127 |     double endSimTime = CycleTimer::currentSeconds();
128 | 
129 |     // render the particles into the image
130 |     gDisplay.renderer->render();
131 | 
132 |     double endRenderTime = CycleTimer::currentSeconds();
133 |     // report the duration of each of the three phases timed above
134 |     if (gDisplay.printStats) {
135 |         printf("Clear:    %.3f ms\n", 1000.f * (endClearTime - startTime));
136 |         printf("Advance:  %.3f ms\n", 1000.f * (endSimTime - endClearTime));
137 |         printf("Render:   %.3f ms\n", 1000.f * (endRenderTime - endSimTime));
138 |     }
139 | }
140 | 
141 | void
142 | startRendererWithDisplay(CircleRenderer* renderer) {
143 |     // Opens a GLUT window sized to the renderer's image, installs the
144 |     // display, reshape and keyboard callbacks and enters the GLUT main loop.
145 | 
146 |     const Image* img = renderer->getImage();
147 | 
148 |     gDisplay.renderer = renderer;
149 |     gDisplay.updateSim = true;
150 |     gDisplay.pauseSim = false;
151 |     gDisplay.printStats = true;
152 |     gDisplay.lastFrameTime = CycleTimer::currentSeconds();
153 |     gDisplay.width = img->width;
154 |     gDisplay.height = img->height;
155 | 
156 |     // configure GLUT; handleReshape keeps gDisplay.width/height in sync with the window
157 |     glutInitWindowSize(gDisplay.width, gDisplay.height);
158 |     glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
159 |     glutCreateWindow("CMU 15-418 Assignment 2 - Circle Renderer");
160 |     glutDisplayFunc(handleDisplay);
161 |     glutReshapeFunc(handleReshape);
162 |     glutKeyboardFunc(handleKeyPress);
163 |     glutMainLoop();
164 | }
165 | 


--------------------------------------------------------------------------------
/asst3/render/exclusiveScan.cu_inl:
--------------------------------------------------------------------------------
  1 | 
  2 | // exclusiveScan.cu_inl
  3 | 
  4 | // This is a shared-memory implementation of exclusive scan. Note that the
  5 | // exclusive scan you implemented in Part 1 uses slower *global* memory, and has
  6 | // overhead from performing multiple kernel launches.
  7 | // Because it uses shared memory, it must be run within a single thread block.
  8 | 
  9 | 
 10 | // REQUIREMENTS:
 11 | //  - Input array must have power-of-two length.
 12 | //  - Number of threads in the thread block must be the size of the array!
 13 | //  - SCAN_BLOCK_DIM is both the number of threads in the block (must be power of 2) 
 14 | //         and the number of elements that will be scanned. 
 15 | //          You should define this in your cudaRenderer.cu file, 
 16 | //          based on your implementation.
 17 | //  - The parameter sScratch should be a pointer to an array with 2*SCAN_BLOCK_DIM elements
 18 | //  - The 3 arrays should be in shared memory. 
 19 | 
 20 | // ================= USAGE (in cudaRenderer.cu) =====================
 21 | 
 22 | // at the top of the file:
 23 | 
 24 | // #define SCAN_BLOCK_DIM   BLOCKSIZE  // needed by sharedMemExclusiveScan implementation
 25 | // #include "exclusiveScan.cu_inl"
 26 | 
 27 | // ...
 28 | 
 29 | // in a kernel:
 30 | 
 31 | // If you're using 2D indices, compute a linear thread index as follows.
 32 | // NOTE: scan assumes that every 32 adjacent linear thread indices 
 33 | // (0-31, 32-63, ...) form a warp, which means they execute in lockstep.
 34 | 
 35 | // If you do linearThreadIndex = threadIdx.x * blockDim.x + threadIdx.y;
 36 | // you will get a linear thread index, but it won't be sorted into warps,
 37 | // which will break scan!
 38 | 
 39 | // int linearThreadIndex =  threadIdx.y * blockDim.x + threadIdx.x;
 40 | 
 41 | // __shared__ uint prefixSumInput[BLOCKSIZE];
 42 | // __shared__ uint prefixSumOutput[BLOCKSIZE];
 43 | // __shared__ uint prefixSumScratch[2 * BLOCKSIZE];
 44 | // sharedMemExclusiveScan(linearThreadIndex, prefixSumInput, prefixSumOutput, prefixSumScratch, BLOCKSIZE);
 45 | 
 46 | 
 47 | #define LOG2_WARP_SIZE 5U
 48 | #define WARP_SIZE (1U << LOG2_WARP_SIZE)
 49 | 
 50 | // Almost the same as naive scan1Inclusive, but doesn't need __syncthreads(),
 51 | // assuming size <= WARP_SIZE. Returns this thread's inclusive prefix sum within its group of `size` threads.
 52 | inline __device__ uint
 53 | warpScanInclusive(int threadIndex, uint idata, volatile uint *s_Data, uint size){
 54 |     // Note some of the calculations are obscure because they are optimized.
 55 |     // For example, (threadIndex & (size - 1)) computes threadIndex % size,
 56 |     // which works, assuming size is a power of 2.
 57 |     // Each group of `size` threads owns 2*size scratch slots: `size` zeros, then the `size` inputs.
 58 |     uint pos = 2 * threadIndex - (threadIndex & (size - 1));
 59 |     s_Data[pos] = 0;
 60 |     pos += size;
 61 |     s_Data[pos] = idata;
 62 |     // The zero padding lets s_Data[pos - offset] be read without a bounds check.
 63 |     for(uint offset = 1; offset < size; offset <<= 1)
 64 |         s_Data[pos] += s_Data[pos - offset];
 65 |     // NOTE(review): the loop has no barrier; presumably it relies on warp-lockstep execution -- confirm for the target GPU.
 66 |     return s_Data[pos];
 67 | }
 68 | // Exclusive scan: the inclusive scan result minus this thread's own input.
 69 | inline __device__ uint warpScanExclusive(int threadIndex, uint idata, volatile uint *sScratch, uint size){
 70 |     return warpScanInclusive(threadIndex, idata, sScratch, size) - idata;
 71 | }
 72 | // Exclusive scan of sInput[0..size) into sOutput, one element per thread, using sScratch as workspace.
 73 | __inline__ __device__ void
 74 | sharedMemExclusiveScan(int threadIndex, uint* sInput, uint* sOutput, volatile uint* sScratch, uint size)
 75 | {
 76 |     if (size > WARP_SIZE) {
 77 |         // Two-level scan: scan inside each warp, scan the per-warp totals, then combine.
 78 |         uint idata = sInput[threadIndex];
 79 | 
 80 |         //Bottom-level inclusive warp scan
 81 |         uint warpResult = warpScanInclusive(threadIndex, idata, sScratch, WARP_SIZE);
 82 | 
 83 |         // Barrier before saving the top element of each warp: the stores
 84 |         // below overwrite scratch slots that the warp scans above may still
 85 |         // be using, so every warp scan must have finished first.
 86 |         __syncthreads();
 87 | 
 88 |         if ( (threadIndex & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
 89 |             sScratch[threadIndex >> LOG2_WARP_SIZE] = warpResult;
 90 | 
 91 |         // wait until every warp total has been written
 92 |         __syncthreads();
 93 | 
 94 |         if ( threadIndex < (SCAN_BLOCK_DIM / WARP_SIZE)) {  // NOTE(review): uses SCAN_BLOCK_DIM rather than `size`; presumably callers pass size == SCAN_BLOCK_DIM -- confirm
 95 |             // grab top warp elements
 96 |             uint val = sScratch[threadIndex];
 97 |             // calculate exclusive scan and write back to shared memory
 98 |             sScratch[threadIndex] = warpScanExclusive(threadIndex, val, sScratch, size >> LOG2_WARP_SIZE);
 99 |         }
100 | 
101 |         // barrier so the scanned warp totals are visible, then add each warp's offset to its per-warp result
102 |         __syncthreads();
103 | 
104 |         sOutput[threadIndex] = warpResult + sScratch[threadIndex >> LOG2_WARP_SIZE] - idata;
105 | 
106 |     } else if (threadIndex < WARP_SIZE) {
107 |         uint idata = sInput[threadIndex];
108 |         sOutput[threadIndex] = warpScanExclusive(threadIndex, idata, sScratch, size);
109 |     }
110 | }
111 | 


--------------------------------------------------------------------------------
/asst3/render/image.h:
--------------------------------------------------------------------------------
 1 | #ifndef  __IMAGE_H__
 2 | #define  __IMAGE_H__
 3 | 
 4 | // RGBA float image: 4 floats per pixel, pixels stored back to back in `data`.
 5 | struct Image {
 6 | 
 7 |     // Allocates storage for w*h pixels; the contents are left uninitialized.
 8 |     Image(int w, int h)
 9 |         : width(w), height(h), data(new float[4 * w * h]) {
10 |     }
11 | 
12 |     // Sets every pixel to the color (r, g, b, a).
13 |     void clear(float r, float g, float b, float a) {
14 | 
15 |         const float rgba[4] = { r, g, b, a };
16 |         const int numFloats = 4 * width * height;
17 | 
18 |         // channel (k & 3) cycles r, g, b, a as k walks the flat buffer
19 |         for (int k = 0; k < numFloats; k++) {
20 |             data[k] = rgba[k & 3];
21 |         }
22 |     }
23 | 
24 |     int width;      // in pixels
25 |     int height;     // in pixels
26 |     float* data;    // 4*width*height floats, allocated in the constructor
27 | 
28 |     // NOTE(review): no destructor is declared, so `data` is not released here.
29 | };
30 | 
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/asst3/render/index.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <!--STATUS OK--><html> <head><meta http-equiv=content-type content=text/html;charset=utf-8><meta http-equiv=X-UA-Compatible content=IE=Edge><meta content=always name=referrer><link rel=stylesheet type=text/css href=http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css><title>百度一下，你就知道</title></head> <body link=#0000cc> <div id=wrapper> <div id=head> <div class=head_wrapper> <div class=s_form> <div class=s_form_wrapper> <div id=lg> <img hidefocus=true src=//www.baidu.com/img/bd_logo1.png width=270 height=129> </div> <form id=form name=f action=//www.baidu.com/s class=fm> <input type=hidden name=bdorz_come value=1> <input type=hidden name=ie value=utf-8> <input type=hidden name=f value=8> <input type=hidden name=rsv_bp value=1> <input type=hidden name=rsv_idx value=1> <input type=hidden name=tn value=baidu><span class="bg s_ipt_wr"><input id=kw name=wd class=s_ipt value maxlength=255 autocomplete=off autofocus></span><span class="bg s_btn_wr"><input type=submit id=su value=百度一下 class="bg s_btn"></span> </form> </div> </div> <div id=u1> <a href=http://news.baidu.com name=tj_trnews class=mnav>新闻</a> <a href=http://www.hao123.com name=tj_trhao123 class=mnav>hao123</a> <a href=http://map.baidu.com name=tj_trmap class=mnav>地图</a> <a href=http://v.baidu.com name=tj_trvideo class=mnav>视频</a> <a href=http://tieba.baidu.com name=tj_trtieba class=mnav>贴吧</a> <noscript> <a href=http://www.baidu.com/bdorz/login.gif?login&amp;tpl=mn&amp;u=http%3A%2F%2Fwww.baidu.com%2f%3fbdorz_come%3d1 name=tj_login class=lb>登录</a> </noscript> <script>document.write('<a href="http://www.baidu.com/bdorz/login.gif?login&tpl=mn&u='+ encodeURIComponent(window.location.href+ (window.location.search === "" ? "?" 
: "&")+ "bdorz_come=1")+ '" name="tj_login" class="lb">登录</a>');</script> <a href=//www.baidu.com/more/ name=tj_briicon class=bri style="display: block;">更多产品</a> </div> </div> </div> <div id=ftCon> <div id=ftConw> <p id=lh> <a href=http://home.baidu.com>关于百度</a> <a href=http://ir.baidu.com>About Baidu</a> </p> <p id=cp>&copy;2017&nbsp;Baidu&nbsp;<a href=http://www.baidu.com/duty/>使用百度前必读</a>&nbsp; <a href=http://jianyi.baidu.com/ class=cp-feedback>意见反馈</a>&nbsp;京ICP证030173号&nbsp; <img src=//www.baidu.com/img/gs.gif> </p> </div> </div> </div> </body> </html>
3 | 


--------------------------------------------------------------------------------
/asst3/render/lookupColor.cu_inl:
--------------------------------------------------------------------------------
 1 | // Map coord to an RGB color by linear interpolation between two adjacent
 2 | // rows of cuConstColorRamp (assumed to hold COLOR_MAP_SIZE rows -- confirm).
 3 | __device__ __inline__ float3
 4 | lookupColor(float coord) {
 5 | 
 6 |     float scaledCoord = coord * (COLOR_MAP_SIZE-1);
 7 | 
 8 |     // using short type rather than int type since 16-bit integer math
 9 |     // is faster than 32-bit integer math on NVIDIA GPUs (original authors' note)
10 |     // Clamp base to [0, COLOR_MAP_SIZE-2] so that row base+1 always exists;
11 |     // coord == 1 then resolves to the last row with weight == 1.
12 |     short maxBase = COLOR_MAP_SIZE-2;
13 |     short intCoord = static_cast<short>(scaledCoord);
14 |     short base = (intCoord < maxBase) ? intCoord : maxBase;  // min
15 |     if (base < 0) base = 0;
16 |     // linearly interpolate between values in the table based on the value of coord
17 |     float weight = scaledCoord - static_cast<float>(base);
18 |     float oneMinusWeight = 1.f - weight;
19 | 
20 |     float r = (oneMinusWeight * cuConstColorRamp[base][0]) + (weight * cuConstColorRamp[base+1][0]);
21 |     float g = (oneMinusWeight * cuConstColorRamp[base][1]) + (weight * cuConstColorRamp[base+1][1]);
22 |     float b = (oneMinusWeight * cuConstColorRamp[base][2]) + (weight * cuConstColorRamp[base+1][2]);
23 |     return make_float3(r, g, b);
24 | }


--------------------------------------------------------------------------------
/asst3/render/main.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <stdio.h>
  3 | #include <getopt.h>
  4 | #include <string>
  5 | 
  6 | #include "refRenderer.h"
  7 | #include "cudaRenderer.h"
  8 | #include "platformgl.h"
  9 | 
 10 | #define DEFAULT_IMAGE_SIZE 1024
 11 | 
 12 | 
 13 | void startRendererWithDisplay(CircleRenderer* renderer);
 14 | void startBenchmark(CircleRenderer* renderer, int startFrame, int totalFrames, const std::string& frameFilename);
 15 | void CheckBenchmark(CircleRenderer* ref_renderer, CircleRenderer* cuda_renderer,
 16 |                         int benchmarkFrameStart, int totalFrames, const std::string& frameFilename);
 17 | 
 18 | // Print the command-line synopsis, the valid scene names and every option.
 19 | void usage(const char* progname) {
 20 |     printf("Usage: %s [options] scenename\n", progname);
 21 |     printf("Valid scenenames are: rgb, rgby, rand10k, rand100k, biglittle, littlebig, pattern,\n"
 22 |            "                      bouncingballs, fireworks, hypnosis, snow, snowsingle\n");
 23 |     printf("Program Options:\n");
 24 |     printf("  -r  --renderer <cpuref/cuda>  Select renderer: cpuref or cuda (default=cuda)\n");
 25 |     printf("  -s  --size  <INT>             Rendered image size: <INT>x<INT> pixels (default=%d)\n", DEFAULT_IMAGE_SIZE);
 26 |     printf("  -b  --bench <START:END>       Run for frames [START,END) (default=[0,1))\n");
 27 |     printf("  -c  --check                   Check correctness of CUDA output against CPU reference\n");
 28 |     printf("  -i  --interactive             Render output to interactive display\n");
 29 |     printf("  -f  --file  <FILENAME>        Output file name (FILENAME_xxxx.ppm) (default=output)\n");
 30 |     printf("  -?  --help                    This message\n");
 31 | }
 32 | 
 33 | // Program entry point: parses the command line, resolves the scene name and
 34 | // runs the correctness check, the benchmark or the interactive display.
 35 | // Returns 0 on success and 1 on any command-line error.
 36 | int main(int argc, char** argv)
 37 | {
 38 | 
 39 |     int benchmarkFrameStart = 0;
 40 |     int benchmarkFrameEnd = 1;
 41 |     int imageSize = DEFAULT_IMAGE_SIZE;
 42 | 
 43 |     std::string sceneNameStr;
 44 |     std::string frameFilename("output");
 45 |     SceneName sceneName;
 46 |     bool useRefRenderer = false;
 47 |     bool checkCorrectness = false;
 48 |     bool interactiveMode = false;
 49 | 
 50 |     // parse commandline options ////////////////////////////////////////////
 51 |     int opt;
 52 |     static struct option long_options[] = {
 53 |         {"help",        0, 0,  '?'},
 54 |         {"check",       0, 0,  'c'},
 55 |         {"bench",       1, 0,  'b'},
 56 |         {"interactive", 0, 0,  'i'},
 57 |         {"file",        1, 0,  'f'},
 58 |         {"renderer",    1, 0,  'r'},
 59 |         {"size",        1, 0,  's'},
 60 |         {0 ,0, 0, 0}
 61 |     };
 62 | 
 63 |     // getopt_long() signals the end of the options with -1
 64 |     while ((opt = getopt_long(argc, argv, "b:f:r:s:ci?", long_options, NULL)) != -1) {
 65 | 
 66 |         switch (opt) {
 67 |         case 'b':
 68 |             // require both bounds and a non-negative, non-inverted range
 69 |             if (sscanf(optarg, "%d:%d", &benchmarkFrameStart, &benchmarkFrameEnd) != 2 ||
 70 |                 benchmarkFrameStart < 0 || benchmarkFrameEnd < benchmarkFrameStart) {
 71 |                 fprintf(stderr, "Invalid argument to -b option\n");
 72 |                 usage(argv[0]);
 73 |                 return 1;
 74 |             }
 75 |             break;
 76 |         case 'i':
 77 |             interactiveMode = true;
 78 |             break;
 79 |         case 'c':
 80 |             checkCorrectness = true;
 81 |             break;
 82 |         case 'f':
 83 |             frameFilename = optarg;
 84 |             break;
 85 |         case 'r':
 86 |             if (std::string(optarg).compare("cuda") == 0) {
 87 |                 useRefRenderer = false;
 88 |             } else if (std::string(optarg).compare("cpuref") == 0) {
 89 |                 useRefRenderer = true;
 90 |             } else {
 91 |                 fprintf(stderr, "ERROR: Unknown renderer type: %s\n", optarg);
 92 |                 usage(argv[0]);
 93 |                 return 1;
 94 |             }
 95 |             break;
 96 |         case 's':
 97 |             imageSize = atoi(optarg);
 98 |             // atoi() yields 0 for non-numeric text; a size <= 0 cannot be allocated
 99 |             if (imageSize <= 0) {
100 |                 fprintf(stderr, "Invalid argument to -s option\n");
101 |                 usage(argv[0]);
102 |                 return 1;
103 |             }
104 |             break;
105 |         case '?':
106 |         default:
107 |             usage(argv[0]);
108 |             return 1;
109 |         }
110 |     }
111 |     // end parsing of commandline options //////////////////////////////////////
112 | 
113 |     if (optind + 1 > argc) {
114 |         fprintf(stderr, "Error: missing scene name\n");
115 |         usage(argv[0]);
116 |         return 1;
117 |     }
118 | 
119 |     sceneNameStr = argv[optind];
120 | 
121 |     // scene names accepted on the command line and the scene each one selects
122 |     static const struct { const char* name; SceneName scene; } knownScenes[] = {
123 |         {"snow",          SNOWFLAKES},
124 |         {"snowsingle",    SNOWFLAKES_SINGLE_FRAME},
125 |         {"rgb",           CIRCLE_RGB},
126 |         {"rgby",          CIRCLE_RGBY},
127 |         {"rand10k",       CIRCLE_TEST_10K},
128 |         {"rand100k",      CIRCLE_TEST_100K},
129 |         {"pattern",       PATTERN},
130 |         {"biglittle",     BIG_LITTLE},
131 |         {"littlebig",     LITTLE_BIG},
132 |         {"bouncingballs", BOUNCING_BALLS},
133 |         {"hypnosis",      HYPNOSIS},
134 |         {"fireworks",     FIREWORKS}
135 |     };
136 |     const int numKnownScenes = sizeof(knownScenes) / sizeof(knownScenes[0]);
137 | 
138 |     int sceneIdx = 0;
139 |     while (sceneIdx < numKnownScenes && sceneNameStr != knownScenes[sceneIdx].name)
140 |         sceneIdx++;
141 | 
142 |     if (sceneIdx == numKnownScenes) {
143 |         fprintf(stderr, "Unknown scene name (%s)\n", sceneNameStr.c_str());
144 |         usage(argv[0]);
145 |         return 1;
146 |     }
147 |     sceneName = knownScenes[sceneIdx].scene;
148 | 
149 |     printf("Rendering to %dx%d image\n", imageSize, imageSize);
150 | 
151 |     if (checkCorrectness) {
152 |         // Need both the renderers
153 | 
154 |         CircleRenderer* ref_renderer = new RefRenderer();
155 |         CircleRenderer* cuda_renderer = new CudaRenderer();
156 | 
157 |         ref_renderer->allocOutputImage(imageSize, imageSize);
158 |         ref_renderer->loadScene(sceneName);
159 |         ref_renderer->setup();
160 |         cuda_renderer->allocOutputImage(imageSize, imageSize);
161 |         cuda_renderer->loadScene(sceneName);
162 |         cuda_renderer->setup();
163 | 
164 |         // Check the correctness (always frame 0 only; -b is not consulted here)
165 |         CheckBenchmark(ref_renderer, cuda_renderer, 0, 1, frameFilename);
166 |     }
167 |     else {
168 | 
169 |         CircleRenderer* renderer;
170 |         if (useRefRenderer)
171 |             renderer = new RefRenderer();
172 |         else
173 |             renderer = new CudaRenderer();
174 | 
175 |         renderer->allocOutputImage(imageSize, imageSize);
176 |         renderer->loadScene(sceneName);
177 |         renderer->setup();
178 | 
179 |         if (!interactiveMode)
180 |             startBenchmark(renderer, benchmarkFrameStart, benchmarkFrameEnd - benchmarkFrameStart, frameFilename);
181 |         else {
182 |             glutInit(&argc, argv);
183 |             startRendererWithDisplay(renderer);
184 |         }
185 |     }
186 | 
187 |     // NOTE(review): the renderers are intentionally left for process exit to reclaim, as in the original.
188 |     return 0;
189 | }
185 | 


--------------------------------------------------------------------------------
/asst3/render/noise.h:
--------------------------------------------------------------------------------
 1 | #ifndef __NOISE_H__
 2 | #define __NOISE_H__
 3 | 
 4 | 
 5 | void vec2CellNoise(float location[3], float result[2], int index);
 6 | 
 7 | void getNoiseTables(int** permX, int** permY, float** value1D);
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/asst3/render/noiseCuda.cu_inl:
--------------------------------------------------------------------------------
 1 | 
 2 | // included by fastRenderer.cu
 3 | 
 4 | __device__ __inline__ float2
 5 | cudaVec2CellNoise(float3 location, int index)
 6 | {
 7 |     // integer cell coordinates of `location` (float -> int conversion truncates)
 8 |     const int cellX = static_cast<int>( location.x );
 9 |     const int cellY = static_cast<int>( location.y );
10 |     const int cellZ = static_cast<int>( location.z );
11 |     // chain the X table over (x*index, y, z); every lookup index is masked to 0..255
12 |     int hashX = cuConstNoiseXPermutationTable[ (cellX*index) & 0xFF ];
13 |     hashX = cuConstNoiseXPermutationTable[ ( hashX + cellY ) & 0xFF ];
14 |     hashX = cuConstNoiseXPermutationTable[ ( hashX + cellZ ) & 0xFF ];
15 |     // same chain over the Y table; note that x is not multiplied by index here
16 |     int hashY = cuConstNoiseYPermutationTable[ cellX & 0xFF ];
17 |     hashY = cuConstNoiseYPermutationTable[ ( hashY + cellY ) & 0xFF ];
18 |     hashY = cuConstNoiseYPermutationTable[ ( hashY + cellZ ) & 0xFF ];
19 |     return make_float2(cuConstNoise1DValueTable[ hashX ], cuConstNoise1DValueTable[ hashY ]);
20 | }
21 | 


--------------------------------------------------------------------------------
/asst3/render/platformgl.h:
--------------------------------------------------------------------------------
 1 | #ifndef __PLATFORM_GL_H__
 2 | #define __PLATFORM_GL_H__
 3 | 
 4 | #ifdef __APPLE__
 5 | #include <GLUT/glut.h>
 6 | #else
 7 | #include <GL/glut.h>
 8 | #endif
 9 | 
10 | #endif
11 | 
12 | 


--------------------------------------------------------------------------------
/asst3/render/ppm.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <math.h>
 4 | #include <algorithm>
 5 | 
 6 | #include "image.h"
 7 | #include "util.h"
 8 | 
 9 | 
10 | 
11 | // writePPMImage --
12 | //
13 | // assumes input pixels are float4
14 | // write 3-channel (8 bit --> 24 bits per pixel) binary (P6) ppm; exits with status 1 on I/O failure
15 | void
16 | writePPMImage(const Image* image, const char *filename)
17 | {
18 |     FILE *fp = fopen(filename, "wb");
19 | 
20 |     if (!fp) {
21 |         fprintf(stderr, "Error: could not open %s for write\n", filename);
22 |         exit(1);
23 |     }
24 |     // write ppm header
25 |     fprintf(fp, "P6\n");
26 |     fprintf(fp, "%d %d\n", image->width, image->height);
27 |     fprintf(fp, "255\n");
28 | 
29 |     // rows are emitted from the last image row down to row 0
30 |     for (int j=image->height-1; j>=0; j--) {
31 |         for (int i=0; i<image->width; i++) {
32 | 
33 |             const float* ptr = &image->data[4 * (j*image->width + i)];
34 |             // unsigned char: values up to 255 do not fit a signed char, and converting
35 |             // an unrepresentable float to an integer type is undefined behavior
36 |             unsigned char val[3];
37 |             val[0] = static_cast<unsigned char>(255.f * CLAMP(ptr[0], 0.f, 1.f));
38 |             val[1] = static_cast<unsigned char>(255.f * CLAMP(ptr[1], 0.f, 1.f));
39 |             val[2] = static_cast<unsigned char>(255.f * CLAMP(ptr[2], 0.f, 1.f));
40 |             fwrite(val, 1, 3, fp);
41 |         }
42 |     }
43 |     // surface write errors (e.g. disk full) instead of reporting success
44 |     const bool writeFailed = ferror(fp) != 0;
45 |     if (fclose(fp) != 0 || writeFailed) {
46 |         fprintf(stderr, "Error: could not write %s\n", filename); exit(1);
47 |     }
48 |     printf("Wrote image file %s\n", filename);
49 | }
49 | 


--------------------------------------------------------------------------------
/asst3/render/ppm.h:
--------------------------------------------------------------------------------
1 | #ifndef __PPM_H__
2 | #define __PPM_H__
3 | 
4 | struct Image;
5 | 
6 | void writePPMImage(const Image* image, const char *filename);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/asst3/render/refRenderer.h:
--------------------------------------------------------------------------------
 1 | #ifndef __REF_RENDERER_H__
 2 | #define __REF_RENDERER_H__
 3 | 
 4 | #include "circleRenderer.h"
 5 | 
 6 | 
 7 | class RefRenderer : public CircleRenderer {
 8 |     // CircleRenderer subclass that main.cpp instantiates for `--renderer cpuref` and for `--check`.
 9 | private:
10 |     // presumably set by allocOutputImage() / loadScene(); the definitions are in another file -- confirm
11 |     Image* image;
12 |     SceneName sceneName;
13 |     // scene data: the same five items loadCircleScene() (sceneLoader.h) hands back by reference
14 |     int numCircles;
15 |     float* position;
16 |     float* velocity;
17 |     float* color;
18 |     float* radius;
19 | 
20 | public:
21 | 
22 |     RefRenderer();
23 |     virtual ~RefRenderer();
24 | 
25 |     const Image* getImage();
26 |     // main.cpp calls allocOutputImage(), loadScene() and setup() in that order before rendering
27 |     void setup();
28 | 
29 |     void loadScene(SceneName name);
30 | 
31 |     void allocOutputImage(int width, int height);
32 | 
33 |     void clearImage();
34 | 
35 |     void advanceAnimation();
36 | 
37 |     void render();
38 | 
39 |     void dumpParticles(const char* filename);
40 |     // NOTE(review): presumably shades one pixel for one circle, writing into pixelData -- confirm in the .cpp
41 |     void shadePixel(
42 |         int circleIndex,
43 |         float pixelCenterX, float pixelCenterY,
44 |         float px, float py, float pz,
45 |         float* pixelData);
46 | };
47 | 
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/asst3/render/refTimings.txt:
--------------------------------------------------------------------------------
 1 | Performance of reference implementation on all scenes:
 2 | (All timings are in milliseconds)
 3 | 
 4 | Tests run using benchmark mode flag --bench 0:4
 5 | (Reported times are per-frame time for just the call to render())
 6 | 
 7 |                  image size: 512x512           image size: 1024x1024
 8 |                      ref    cuda (speedup)          ref      cuda (speedup)
 9 | --------------------------------------------------------------------------
10 | rgb                 1.94    0.13 (14.9x)           8.02      0.49 (16.4x)
11 | rgby                1.05    0.12  (8.8x)           4.31      0.46  (9.4x)    
12 | pattern             4.32    0.49  (8.8x)          18.86      1.76 (10.7x)
13 | rand10k           208.40    5.86 (35.6x)         882.75     21.26 (41.5x)
14 | rand100k         2084.03   60.47 (41.3x)        8860.17    217.72 (40.7x)
15 | snowsingle        255.55   29.72  (8.6x)        1006.35    113.96  (8.8x)
16 | 
17 | 


--------------------------------------------------------------------------------
/asst3/render/render_ref:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/render/render_ref


--------------------------------------------------------------------------------
/asst3/render/sceneLoader.h:
--------------------------------------------------------------------------------
 1 | #ifndef __SCENE_LOADER_H__
 2 | #define __SCENE_LOADER_H__
 3 | 
 4 | #include "circleRenderer.h"
 5 | 
 6 | void
 7 | loadCircleScene(
 8 |     SceneName sceneName,
 9 |     int& numCircles,
10 |     float*& position,
11 |     float*& velocity,
12 |     float*& color,
13 |     float*& radius);
14 | 
15 | #endif
16 | 


--------------------------------------------------------------------------------
/asst3/render/util.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef __UTIL_H__
 3 | #define __UTIL_H__
 4 | 
 5 | #include <algorithm>
 6 | 
 7 | #define CLAMP(x, minimum, maximum) std::max(minimum, std::min(x, maximum))
 8 | 
 9 | #endif
10 | 


--------------------------------------------------------------------------------
/asst3/saxpy/CycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // your process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine-
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if your processors' speeds change (i.e. frequency
 35 |   // scaling) or if you are in a heterogeneous environment, you will
 36 |   // likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;  // summed below in integer nanoseconds; float casts would round away the low digits
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return CycleTimer::SysClock(spec.tv_sec) * 1000000000ULL + CycleTimer::SysClock(spec.tv_nsec);
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 | 
 81 |     static const char* tickUnits() {  // unit name of one currentTicks() tick
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__)
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds (computed once, then cached).
 93 |     static double secondsPerTick() {
 94 |       static bool initialized = false;
 95 |       static double secondsPerTick_val;
 96 |       if (initialized) return secondsPerTick_val;
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 |       // Scales to nanoseconds without 1e-9f
111 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
112 |         static_cast<double>(time_info.denom);
113 |   #endif // x86_64 or not
114 | #elif defined(_WIN32)
115 |       LARGE_INTEGER qwTicksPerSec;
116 |       QueryPerformanceFrequency(&qwTicksPerSec);
117 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
118 | #elif !defined(__x86_64__)
119 |       // currentTicks() falls back to clock_gettime() here, so one tick is 1 ns
120 |       secondsPerTick_val = 1e-9;
121 | #else
122 |       FILE *fp = fopen("/proc/cpuinfo","r");
123 |       char input[1024];
124 |       if (!fp) {
125 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
126 |          exit(-1);
127 |       }
128 |       // In case we don't find it, e.g. on the N900
129 |       secondsPerTick_val = 1e-9;
130 |       while (!feof(fp) && fgets(input, 1024, fp)) {
131 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
132 |         // frequency scaling it's better to read the @ sign first
133 |         float GHz = 0.f, MHz = 0.f;
134 |         if (strstr(input, "model name")) {
135 |           char* at_sign = strstr(input, "@");
136 |           if (at_sign) {
137 |             char* after_at = at_sign + 1;
138 |             char* GHz_str = strstr(after_at, "GHz");
139 |             char* MHz_str = strstr(after_at, "MHz");
140 |             if (GHz_str) {
141 |               *GHz_str = '\0';
142 |               if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.f) {
143 |                 secondsPerTick_val = 1e-9 / GHz;
144 |                 break;
145 |               }
146 |             } else if (MHz_str) {
147 |               *MHz_str = '\0';
148 |               if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.f) {
149 |                 // divide by the value just parsed; GHz is never assigned on this path
150 |                 secondsPerTick_val = 1e-6 / MHz;
151 |                 break;
152 |               }
153 |             }
154 |           }
155 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz) && MHz > 0.f) {
156 |           secondsPerTick_val = 1e-6 / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return secondsPerTick() * 1000.0;
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst3/saxpy/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | EXECUTABLE := cudaSaxpy
 3 | 
 4 | CU_FILES   := saxpy.cu
 5 | CU_DEPS    :=
 6 | CC_FILES   := main.cpp
 7 | 
 8 | ###########################################################
 9 | 
10 | ARCH=$(shell uname | sed -e 's/-.*//g')
11 | 
12 | OBJDIR=objs
13 | CXX=g++ -m64
14 | CXXFLAGS=-O3 -Wall
15 | ifeq ($(ARCH), Darwin)
16 | # Building on mac
17 | LDFLAGS=-L/usr/local/depot/cuda/lib/ -lcudart
18 | else
19 | # Building on Linux
20 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart
21 | endif
22 | NVCC=nvcc
23 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61
24 | 
25 | OBJS=$(OBJDIR)/main.o  $(OBJDIR)/saxpy.o
26 | 
27 | .PHONY: default dirs clean
28 | 
29 | default: $(EXECUTABLE)
30 | 
31 | dirs:
32 | 		mkdir -p $(OBJDIR)/
33 | 
34 | clean:
35 | 		rm -rf $(OBJDIR) *.ppm *~ $(EXECUTABLE)
36 | 
37 | # Order-only prerequisite: objs/ is created before any object is compiled, without forcing rebuilds.
38 | $(OBJS): | dirs
39 | 
40 | # Link step depends only on the objects, so an up-to-date build does not relink.
41 | $(EXECUTABLE): $(OBJS)
42 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
43 | 
44 | $(OBJDIR)/%.o: %.cpp
45 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
46 | 
47 | $(OBJDIR)/%.o: %.cu
48 | 		$(NVCC) $< $(NVCCFLAGS) -c -o $@
49 | 


--------------------------------------------------------------------------------
/asst3/saxpy/main.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdlib.h>
 2 | #include <stdio.h>
 3 | #include <getopt.h>
 4 | #include <cmath>
 5 | #include <string>
 6 | 
 7 | void saxpyCuda(int N, float alpha, float* x, float* y, float* result);
 8 | void printCudaInfo();
 9 | 
10 | 
11 | void usage(const char* progname) {  // print the command-line help to stdout
12 |     printf("Usage: %s [options]\n", progname);
13 |     fputs("Program Options:\n", stdout);
14 |     fputs("  -n  --arraysize <INT>  Number of elements in arrays\n", stdout);
15 |     fputs("  -?  --help             This message\n", stdout);
16 | }
17 | 
18 | 
19 | bool check(int N, float alpha, float* x, float* y, float* result) {
20 |     // Returns true iff result[i] is within 1e-5 of alpha * x[i] + y[i] for every i in [0, N).
21 |     for (int i = 0; i < N; ++i) {
22 |         // std::fabs keeps the error in floating point (plain abs may pick int abs(int));
23 |         // the negated <= form also rejects NaN results.
24 |         if (!(std::fabs(alpha * x[i] + y[i] - result[i]) <= 1e-5)) {
25 |             return false;
26 |         }
27 |     }
28 |     return true; }
29 | 
30 | 
31 | int main(int argc, char** argv)
32 | {
33 |     // default: arrays of 100M numbers
34 |     int N = 100 * 1000 * 1000;
35 | 
36 |     // parse commandline options ////////////////////////////////////////////
37 |     int opt;
38 |     static struct option long_options[] = {
39 |         {"arraysize",  1, 0, 'n'},
40 |         {"help",       0, 0, '?'},
41 |         {0 ,0, 0, 0}
42 |     };
43 | 
44 |     // getopt_long() returns -1 (not necessarily EOF) once all options are consumed
45 |     while ((opt = getopt_long(argc, argv, "?n:", long_options, NULL)) != -1) {
46 |         switch (opt) {
47 |         case 'n':
48 |             N = atoi(optarg);
49 |             if (N <= 0) { usage(argv[0]); return 1; }  // atoi gives 0 on garbage; new float[N] needs N > 0
50 |             break;
51 |         case '?':
52 |         default:
53 |             usage(argv[0]);
54 |             return 1;
55 |         }
56 |     }
57 |     // end parsing of commandline options //////////////////////////////////////
58 | 
59 |     const float alpha = 2.0f;
60 |     float* xarray = new float[N];
61 |     float* yarray = new float[N];
62 |     float* resultarray = new float[N];
63 | 
64 |     for (int i=0; i<N; i++) {
65 |         xarray[i] = yarray[i] = i % 10;
66 |         resultarray[i] = 0.f;
67 |     }
68 | 
69 |     printCudaInfo();
70 | 
71 |     printf("Running 3 timing tests:\n");
72 |     for (int i=0; i<3; i++) {
73 |         saxpyCuda(N, alpha, xarray, yarray, resultarray);
74 |         if (!check(N, alpha, xarray, yarray, resultarray)) {
75 |             printf("Error!\n");
76 |         } else {
77 |             printf("Pass!\n");
78 |         }
79 |     }
80 | 
81 |     delete [] xarray;
82 |     delete [] yarray;
83 |     delete [] resultarray;
84 |     return 0;
85 | }
86 | 


--------------------------------------------------------------------------------
/asst3/saxpy/saxpy.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | 
  3 | #include <cuda.h>
  4 | #include <cuda_runtime.h>
  5 | #include <driver_functions.h>
  6 | 
  7 | #include "CycleTimer.h"
  8 | 
  9 | 
 10 | float GBPerSec(int bytes, float sec) {  // returns GB/sec, with 1 GB = 1024^3 bytes
 11 |   const double gigabytes = static_cast<float>(bytes) / (1024. * 1024. * 1024.);
 12 |   return gigabytes / sec;
 13 | }
 14 | 
 15 | 
 16 | // CUDA kernel that runs on the GPU (it is marked __global__).  Each
 17 | // thread writes at most one element: result[i] = alpha * x[i] + y[i].
 18 | __global__ void
 19 | saxpy_kernel(int N, float alpha, float* x, float* y, float* result) {
 20 | 
 21 |     // Global element index of this thread: the offset of its block
 22 |     // (blockIdx.x * blockDim.x) plus its position inside the block
 23 |     // (threadIdx.x).  This kernel only looks at the .x terms, i.e. it
 24 |     // treats the launch as one-dimensional.
 25 |     const int i = blockIdx.x * blockDim.x + threadIdx.x;
 26 | 
 27 |     // Threads whose index falls past the end of the arrays do nothing;
 28 |     // this happens when N is not a multiple of the block size (blockDim.x).
 29 |     if (i >= N)
 30 |         return;
 31 |     result[i] = alpha * x[i] + y[i];
 32 | }
 33 | 
 34 | 
 35 | // saxpyCuda --
 36 | // Computes resultarray = alpha * xarray + yarray on the GPU and prints timings.
 37 | // This function is regular C code running on the CPU.  It allocates
 38 | // memory on the GPU using CUDA API functions, uses CUDA API functions
 39 | // to transfer data from the CPU's memory address space to GPU memory
 40 | // address space, and launches the CUDA kernel function on the GPU.
 41 | void saxpyCuda(int N, float alpha, float* xarray, float* yarray, float* resultarray) {
 42 | 
 43 |     // must read both input arrays (xarray and yarray) and write to
 44 |     // output array (resultarray)
 45 |     int totalBytes = sizeof(float) * 3 * N;  // NOTE(review): stored in an int; overflows for very large N
 46 | 
 47 |     // compute number of blocks and threads per block.  In this
 48 |     // application we've hardcoded thread blocks to contain 512 CUDA
 49 |     // threads.
 50 |     const int threadsPerBlock = 512;
 51 | 
 52 |     // Notice the round up here.  The code needs to compute the number
 53 |     // of thread blocks needed such that there is one thread per
 54 |     // element of the arrays.  This code is written to work for values
 55 |     // of N that are not multiples of threadsPerBlock.
 56 |     const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
 57 | 
 58 |     // These are pointers that will be pointers to memory allocated
 59 |     // *on the GPU*.  You should allocate these pointers via
 60 |     // cudaMalloc.  You can access the resulting buffers from CUDA
 61 |     // device kernel code (see the kernel function saxpy_kernel()
 62 |     // above) but you cannot access the contents of these buffers from
 63 |     // this thread. CPU threads cannot issue loads and stores from GPU
 64 |     // memory!
 65 |     float* device_x = NULL;
 66 |     float* device_y = NULL;
 67 |     float* device_result = NULL;
 68 | 
 69 |     //
 70 |     // CS149 TODO: allocate device memory buffers on the GPU using cudaMalloc.
 71 |     //
 72 |     // We highly recommend taking a look at NVIDIA's
 73 |     // tutorial, which clearly walks you through the few lines of code
 74 |     // you need to write for this part of the assignment:
 75 |     //
 76 |     // https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/
 77 |     //
 78 | 
 79 |     cudaMalloc(&device_x, N * sizeof(float));
 80 |     cudaMalloc(&device_y, N * sizeof(float));
 81 |     cudaMalloc(&device_result, N * sizeof(float));
 82 | 
 83 |     // start timing after allocation of device memory
 84 |     double startTime = CycleTimer::currentSeconds();
 85 | 
 86 |     //
 87 |     // CS149 TODO: copy input arrays to the GPU using cudaMemcpy
 88 |     //
 89 | 
 90 |     cudaMemcpy(device_x, xarray, N * sizeof(float), cudaMemcpyHostToDevice);
 91 |     cudaMemcpy(device_y, yarray, N * sizeof(float), cudaMemcpyHostToDevice);
 92 | 
 93 | 
 94 |     double startExecTime = CycleTimer::currentSeconds();
 95 |     // run CUDA kernel. (notice the <<< >>> brackets indicating a CUDA
 96 |     // kernel launch) Execution on the GPU occurs here.
 97 |     saxpy_kernel<<<blocks, threadsPerBlock>>>(N, alpha, device_x, device_y, device_result);
 98 | 
 99 |     // the launch can return before all work is done, so wait for the GPU before reading the timer.
100 |     cudaDeviceSynchronize();
101 | 
102 |     double endExecTime = CycleTimer::currentSeconds();
103 |     //
104 |     // CS149 TODO: copy result from GPU back to CPU using cudaMemcpy
105 |     //
106 |     cudaMemcpy(resultarray, device_result, N * sizeof(float), cudaMemcpyDeviceToHost);
107 | 
108 |     // end timing after result has been copied back into host memory
109 |     double endTime = CycleTimer::currentSeconds();
110 | 
111 |     double ExecTime = endExecTime - startExecTime;
112 |     printf("Execute Time: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * ExecTime, GBPerSec(totalBytes, ExecTime));
113 | 
114 |     cudaError_t errCode = cudaPeekAtLastError();
115 |     if (errCode != cudaSuccess) {
116 |         fprintf(stderr, "WARNING: A CUDA error occured: code=%d, %s\n",
117 | 		errCode, cudaGetErrorString(errCode));
118 |     }
119 | 
120 |     double overallDuration = endTime - startTime;
121 |     printf("Effective BW by CUDA saxpy: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * overallDuration, GBPerSec(totalBytes, overallDuration));
122 | 
123 |     //
124 |     // CS149 TODO: free memory buffers on the GPU using cudaFree
125 |     //
126 |     cudaFree(device_x);
127 |     cudaFree(device_y);
128 |     cudaFree(device_result);
129 | 
130 | }
131 | 
132 | void printCudaInfo() {
133 |     // Print stats about every CUDA device in the machine.  Useful if
134 |     // students want to know what GPU they are running on.
135 |     int deviceCount = 0;
136 |     cudaError_t err = cudaGetDeviceCount(&deviceCount);
137 |     if (err != cudaSuccess) {
138 |         fprintf(stderr, "WARNING: cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
139 |         deviceCount = 0;  // do not trust the count after a failed query
140 |     }
141 |     printf("---------------------------------------------------------\n");
142 |     printf("Found %d CUDA devices\n", deviceCount);
143 | 
144 |     for (int i=0; i<deviceCount; i++) {
145 |         cudaDeviceProp deviceProps;
146 |         cudaGetDeviceProperties(&deviceProps, i);
147 |         printf("Device %d: %s\n", i, deviceProps.name);
148 |         printf("   SMs:        %d\n", deviceProps.multiProcessorCount);
149 |         printf("   Global mem: %.0f MB\n",
150 |                static_cast<float>(deviceProps.totalGlobalMem) / (1024 * 1024));
151 |         printf("   CUDA Cap:   %d.%d\n", deviceProps.major, deviceProps.minor);
152 |     }
153 |     printf("---------------------------------------------------------\n"); }
154 | 


--------------------------------------------------------------------------------
/asst3/scan/CycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // your process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if your processors' speeds change (i.e. processor
 35 |   // frequency scaling) or if you are in a heterogeneous environment, you
 36 |   // will likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return SysClock(spec.tv_sec) * 1000000000ULL + SysClock(spec.tv_nsec); // integer ns; float would round
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 | 
 81 |     static const char* tickUnits() {  // name of the unit counted by currentTicks()
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(_WIN32) || defined(__x86_64__)
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds.
 93 |     static double secondsPerTick() {
 94 |       static bool initialized = false;   // result is computed once and cached (no locking)
 95 |       static double secondsPerTick_val;
 96 |       if (initialized) return secondsPerTick_val;
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 | 
111 |       // Scales to nanoseconds without 1e-9f
112 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113 |         static_cast<double>(time_info.denom);
114 |   #endif // x86_64 or not
115 | #elif defined(_WIN32)
116 |       LARGE_INTEGER qwTicksPerSec;
117 |       QueryPerformanceFrequency(&qwTicksPerSec);
118 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119 | #else
120 |       FILE *fp = fopen("/proc/cpuinfo","r");
121 |       char input[1024];
122 |       if (!fp) {
123 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124 |          exit(-1);
125 |       }
126 |       // In case we don't find it, e.g. on the N900
127 |       secondsPerTick_val = 1e-9;
128 |       while (fgets(input, sizeof(input), fp)) {
129 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
130 |         // frequency scaling it's better to read the @ sign first
131 |         float GHz, MHz;
132 |         if (strstr(input, "model name")) {
133 |           char* at_sign = strstr(input, "@");
134 |           if (at_sign) {
135 |             char* after_at = at_sign + 1;
136 |             char* GHz_str = strstr(after_at, "GHz");
137 |             char* MHz_str = strstr(after_at, "MHz");
138 |             if (GHz_str) {
139 |               *GHz_str = '\0';
140 |               if (1 == sscanf(after_at, "%f", &GHz)) {
141 |                 //printf("GHz = %f\n", GHz);
142 |                 secondsPerTick_val = 1e-9f / GHz;
143 |                 break;
144 |               }
145 |             } else if (MHz_str) {
146 |               *MHz_str = '\0';
147 |               if (1 == sscanf(after_at, "%f", &MHz)) {
148 |                 //printf("MHz = %f\n", MHz);
149 |                 secondsPerTick_val = 1e-6f / MHz;  // MHz, not GHz: GHz is never assigned on this branch
150 |                 break;
151 |               }
152 |             }
153 |           }
154 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155 |           //printf("MHz = %f\n", MHz);
156 |           secondsPerTick_val = 1e-6f / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return secondsPerTick() * 1000.0;
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst3/scan/Makefile:
--------------------------------------------------------------------------------
 1 | EXECUTABLE := cudaScan
 2 | 
 3 | CU_FILES   := scan.cu
 4 | 
 5 | CU_DEPS    :=
 6 | 
 7 | CC_FILES   := main.cpp
 8 | 
 9 | all: $(EXECUTABLE) $(REFERENCE)
10 | 
11 | LOGS	   := logs
12 | 
13 | ###########################################################
14 | 
15 | OBJDIR=objs
16 | CXX=g++ -m64 
17 | CXXFLAGS=-O3 -Wall -std=c++11
18 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart
19 | NVCC=nvcc
20 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61 -std=c++11
21 | 
22 | 
23 | OBJS=$(OBJDIR)/main.o  $(OBJDIR)/scan.o
24 | 
25 | 
26 | .PHONY: all default dirs clean check_scan check_find_repeats
27 | 
28 | default: $(EXECUTABLE)
29 | 
30 | dirs:
31 | 		mkdir -p $(OBJDIR)/
32 | 
33 | clean:
34 | 		rm -rf $(OBJDIR) *.ppm *~ $(EXECUTABLE) $(LOGS)
35 | 
36 | check_scan: default
37 | 				./checker.pl scan
38 | 
39 | check_find_repeats: default
40 | 				./checker.pl find_repeats
41 | 
42 | $(EXECUTABLE): $(OBJS)
43 | 		$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
44 | # Order-only prerequisite (after '|'): create objs/ first, but never rebuild because of it.
45 | $(OBJDIR)/%.o: %.cpp | dirs
46 | 		$(CXX) $< $(CXXFLAGS) -c -o $@
47 | 
48 | $(OBJDIR)/%.o: %.cu | dirs
49 | 		$(NVCC) $< $(NVCCFLAGS) -c -o $@
50 | 


--------------------------------------------------------------------------------
/asst3/scan/checker.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl
  2 | 
  3 | use POSIX;
  4 | 
  5 | my @element_counts = ("1000000", "10000000", "20000000", "40000000");
  6 | 
  7 | my %fast_times; 
  8 | my %your_times; 
  9 | 
 10 | my $perf_points = 1.25;
 11 | my %correct;
 12 | my $test;
 13 | 
 14 | # Validate the arguments first so that a bad invocation cannot wipe old logs.
 15 | if (scalar(@ARGV) != 1 || ($ARGV[0] ne "find_repeats" && $ARGV[0] ne "scan")) {
 16 |   print ("Usage: ./checker.pl <test>: test = scan, find_repeats\n");
 17 |   exit(1);
 18 | }
 19 | $test = $ARGV[0];
 20 | print("Test: $test" );
 21 | 
 22 | # Start from an empty logs/ tree with one subdirectory per binary.
 23 | `mkdir -p logs`;
 24 | `rm -rf logs/*`;
 25 | `mkdir logs/test`;
 26 | `mkdir logs/ref`;
 27 | 
 28 | print "\n";
 29 | print ("--------------\n");
 30 | print ("Running tests:\n");
 31 | print ("--------------\n");
 32 | 
 33 | foreach my $element_count (@element_counts) {
 34 |     print ("\nElement Count: $element_count\n");
 35 |     system ("./cudaScan -m ${test} -i random -n $element_count > ./logs/test/${test}_correctness_${element_count}.log");
 36 |     my $return_value  = $?;  # exit status of the correctness run
 37 |     if ($return_value == 0) {
 38 |         print ("Correctness passed!\n");
 39 |         $correct{$element_count} = 1;
 40 |     }
 41 |     else {
 42 |         print ("Correctness failed\n");
 43 |         $correct{$element_count} = 0;  # keyed by element count, like the success branch
 44 |     }
 45 | 
 46 |     my $your_time = `./cudaScan -m ${test} -i random -n $element_count | tee ./logs/test/${test}_time_${element_count}.log | grep \'Student GPU time:\'`;
 47 |     chomp($your_time);
 48 |     $your_time =~ s/^[^0-9]*//;  # strip the label in front of the number
 49 |     $your_time =~ s/ ms.*//;     # strip the unit after the number
 50 |     print ("Student Time: $your_time\n");
 51 | 
 52 |     my $fast_time = `./cudaScan_ref -m ${test} -i random -n $element_count | tee ./logs/ref/${test}_time_${element_count}.log | grep \'Student GPU time:\'`;
 53 |     chomp($fast_time);
 54 |     $fast_time =~ s/^[^0-9]*//;
 55 |     $fast_time =~ s/ ms.*//;
 56 |     print ("Ref Time: $fast_time\n");
 57 | 
 58 |     $your_times{$element_count} = $your_time;
 59 |     $fast_times{$element_count} = $fast_time;
 60 | }
 61 | 
 62 | print "\n";
 63 | print ("-------------------------\n");
 64 | print (ucfirst($test). " Score Table:\n");
 65 | print ("-------------------------\n");
 66 | 
 67 | my $header = sprintf ("| %-15s | %-15s | %-15s | %-15s |\n", "Element Count", "Ref Time", "Student Time", "Score");
 68 | my $dashes = $header;
 69 | $dashes =~ s/./-/g;
 70 | print $dashes;
 71 | print $header;
 72 | print $dashes;
 73 | 
 74 | my $total_score = 0;
 75 | 
 76 | foreach my $element_count (@element_counts){
 77 |     my $score;
 78 |     my $fast_time = $fast_times{$element_count};
 79 |     my $time = $your_times{$element_count};
 80 |     # Print one score-table row per element count and accumulate the total.
 81 |     if ($correct{$element_count}) {
 82 |         if ($time <= 1.20 * $fast_time) {  # within 20% of the reference time: full points
 83 |             $score = $perf_points;
 84 |         }
 85 |         else {
 86 |             $score = $perf_points * ($fast_time /$time);  # otherwise scale by ref/student time
 87 |         }
 88 |     }
 89 |     else {
 90 |         $time .= " (F)";  # mark the failed correctness run in the table
 91 |         $score = 0;
 92 |     }
 93 | 
 94 |     printf ("| %-15s | %-15s | %-15s | %-15s |\n", "$element_count", "$fast_time", "$time", "$score");
 95 |     $total_score += $score;
 96 | }
 97 | print $dashes;
 98 | printf ("| %-33s | %-15s | %-15s |\n", "", "Total score:", 
 99 |     $total_score . "/" . ($perf_points * keys %fast_times));
100 | print $dashes;
101 | 


--------------------------------------------------------------------------------
/asst3/scan/cudaScan_ref:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/scan/cudaScan_ref


--------------------------------------------------------------------------------
/asst3/scan/log.txt:
--------------------------------------------------------------------------------
 1 | ---------------------------------------------------------
 2 | Found 4 CUDA devices
 3 | Device 0: NVIDIA GeForce GTX 1080
 4 |    SMs:        20
 5 |    Global mem: 8120 MB
 6 |    CUDA Cap:   6.1
 7 | Device 1: NVIDIA GeForce GTX 1080
 8 |    SMs:        20
 9 |    Global mem: 8120 MB
10 |    CUDA Cap:   6.1
11 | Device 2: NVIDIA GeForce GTX 1080
12 |    SMs:        20
13 |    Global mem: 8120 MB
14 |    CUDA Cap:   6.1
15 | Device 3: NVIDIA GeForce GTX 1080
16 |    SMs:        20
17 |    Global mem: 8120 MB
18 |    CUDA Cap:   6.1
19 | ---------------------------------------------------------
20 | Array size: 64
21 | block: 1, thread: 64
22 | cmp: idx: 41, value: 9
23 | 1 blocks, 32 thread !
24 | 1 blocks, 16 thread !
25 | 1 blocks, 8 thread !
26 | 1 blocks, 4 thread !
27 | 1 blocks, 2 thread !
28 | 1 blocks, 1 thread !
29 | 1 blocks, 1 thread !
30 | 1 blocks, 2 thread !
31 | 1 blocks, 4 thread !
32 | 1 blocks, 8 thread !
33 | 1 blocks, 16 thread !
34 | 1 blocks, 32 thread !
35 | fill: idx: 41, prefix_idx: 0, value: 9
36 | block: 1, thread: 64
37 | cmp: idx: 41, value: 9
38 | 1 blocks, 32 thread !
39 | 1 blocks, 16 thread !
40 | 1 blocks, 8 thread !
41 | 1 blocks, 4 thread !
42 | 1 blocks, 2 thread !
43 | 1 blocks, 1 thread !
44 | 1 blocks, 1 thread !
45 | 1 blocks, 2 thread !
46 | 1 blocks, 4 thread !
47 | 1 blocks, 8 thread !
48 | 1 blocks, 16 thread !
49 | 1 blocks, 32 thread !
50 | fill: idx: 41, prefix_idx: 0, value: 9
51 | block: 1, thread: 64
52 | cmp: idx: 41, value: 9
53 | 1 blocks, 32 thread !
54 | 1 blocks, 16 thread !
55 | 1 blocks, 8 thread !
56 | 1 blocks, 4 thread !
57 | 1 blocks, 2 thread !
58 | 1 blocks, 1 thread !
59 | 1 blocks, 1 thread !
60 | 1 blocks, 2 thread !
61 | 1 blocks, 4 thread !
62 | 1 blocks, 8 thread !
63 | 1 blocks, 16 thread !
64 | 1 blocks, 32 thread !
65 | fill: idx: 41, prefix_idx: 0, value: 9
66 | Student GPU time: 0.216 ms
67 | Find_repeats outputs are correct!
68 | 


--------------------------------------------------------------------------------
/asst4/bfs/Makefile:
--------------------------------------------------------------------------------
1 | all: default grade
2 | .PHONY: all default grade clean
3 | default: main.cpp bfs.cpp
4 | 	g++ -I../ -std=c++11 -fopenmp -O3 -g -o bfs main.cpp bfs.cpp ../common/graph.cpp ref_bfs.o
5 | grade: grade.cpp bfs.cpp
6 | 	g++ -I../ -std=c++11 -fopenmp -O3 -g -o bfs_grader grade.cpp bfs.cpp ../common/graph.cpp ref_bfs.o
7 | clean:
8 | 	rm -rf bfs_grader bfs  *~ *.*~
9 | 


--------------------------------------------------------------------------------
/asst4/bfs/bfs.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BFS_H__
 2 | #define __BFS_H__
 3 | 
 4 | //#define DEBUG
 5 | 
 6 | #include "common/graph.h"
 7 | #include <stdlib.h>
 8 | 
 9 | struct solution  // result buffer handed to the bfs_* functions declared below
10 | {
11 |   int *distances;  // NOTE(review): presumably one distance per graph vertex — confirm in bfs.cpp
12 | };
13 | 
14 | struct vertex_set {
15 |   // number of vertex ids currently in the set
16 |   int count;
17 |   // capacity of the `vertices` buffer
18 |   int max_vertices;
19 |   // buffer holding the ids of the vertices in the set
20 |   int *vertices;
21 | };
22 | 
23 | 
24 | void bfs_top_down(Graph graph, solution* sol);
25 | void bfs_bottom_up(Graph graph, solution* sol);
26 | void bfs_hybrid(Graph graph, solution* sol);
27 | 
28 | #endif
29 | 


--------------------------------------------------------------------------------
/asst4/bfs/ref_bfs.o:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/bfs/ref_bfs.o


--------------------------------------------------------------------------------
/asst4/cloud_readme.md:
--------------------------------------------------------------------------------
 1 | # AWS Setup Instructions #
 2 | 
 3 | For performance testing, you will need to run your code on a VM instance on Amazon Web Services (AWS). We've already sent you student coupons that you can use for billing purposes. Here are the steps to get set up for running on AWS.
 4 | 
 5 | NOTE: __Please don't forget to SHUT DOWN your instances when you're done for the day to avoid burning through credits overnight!__
 6 | 
 7 | ### Creating a VM with 32 vCPU ###
 8 |       
 9 | 1. Now you're ready to create a VM instance. Click on the button that says `Launch Instances`. Choose the `Ubuntu Server 20.04 LTS (HVM), SSD Volume Type` AMI:
10 | ![AMI Selection](handout/AMI.png?raw=true)
11 | 
12 | 2. Choose the `m5.8xlarge` Instance Type and then click `4. Add Storage` on the top bar: 
13 | ![instance](handout/instance_type_big.png?raw=true)
14 | 
15 | 3. Change the size of the `Root` volume to 100 GB to accommodate the packages we will need to install to make the instance functional for the assignment:
16 | ![Storage](handout/storage_big.png?raw=true)
17 | 
18 | 4. AWS will ask you to select a key pair. You can use the same key pair from assignment 3. Alternatively, you can create a new one. To create a new one, click the first dropdown and choose `Create a new key pair` and give it whatever name you'd like. This will download a keyfile to your computer called `<key_name>.pem` which you will use to log in to the VM instance you are about to create. Finally, click `Launch Instances`.
19 | ![Key Pair](handout/new_key_pair.png?raw=true)
20 | 
21 | __Note: `m5.8xlarge` instances cost $1.536 / hour, so leaving one running for a whole day will consume $36.86 worth of your AWS coupon.__
22 | 
23 | 5. Now that you've created your VM, you should be able to __SSH__ into it. You need the public IP address to SSH into it, which you can find on the instance page by clicking the `View Instances` button on the current page and then the instance ID for your created instance (note, it may take a moment for the instance to start up and be assigned an IP address):
24 | ![IP Address](handout/ip_address.png?raw=true)
25 | Once you have the IP address, you can log in to the instance by running this command:
26 | ~~~~
27 | ssh -i path/to/key_name.pem ubuntu@<public_ip_address>
28 | ~~~~
29 | 
30 | 6. Once you SSH into your VM instance, you'll want to install whatever software you need to make the machine a useful development environment for you.  For example we recommend:
31 | ~~~~
32 | sudo apt update
33 | sudo apt install emacs25
34 | sudo apt install make
35 | sudo apt install g++
36 | ~~~~
37 | 
38 | If you're confused about any of the steps, are having problems setting up your account, or have any additional questions, reach out to us on Piazza!
39 |   
40 | __Again, please don't forget to SHUT DOWN your instances when you're done with your work for the day!__
41 | 


--------------------------------------------------------------------------------
/asst4/common/CycleTimer.h:
--------------------------------------------------------------------------------
  1 | #ifndef _SYRAH_CYCLE_TIMER_H_
  2 | #define _SYRAH_CYCLE_TIMER_H_
  3 | 
  4 | #if defined(__APPLE__)
  5 |   #if defined(__x86_64__)
  6 |     #include <sys/sysctl.h>
  7 |   #else
  8 |     #include <mach/mach.h>
  9 |     #include <mach/mach_time.h>
 10 |   #endif // __x86_64__ or not
 11 | 
 12 |   #include <stdio.h>  // fprintf
 13 |   #include <stdlib.h> // exit
 14 | 
 15 | #elif _WIN32
 16 | #  include <windows.h>
 17 | #  include <time.h>
 18 | #else
 19 | #  include <stdio.h>
 20 | #  include <stdlib.h>
 21 | #  include <string.h>
 22 | #  include <sys/time.h>
 23 | #endif
 24 | 
 25 | 
 26 |   // This uses the cycle counter of the processor.  Different
 27 |   // processors in the system will have different values for this.  If
 28 |   // your process moves across processors, then the delta time you
 29 |   // measure will likely be incorrect.  This is mostly for fine
 30 |   // grained measurements where the process is likely to be on the
 31 |   // same processor.  For more global things you should use the
 32 |   // Time interface.
 33 | 
 34 |   // Also note that if your processors' speeds change (i.e. processor
 35 |   // frequency scaling) or if you are in a heterogeneous environment, you
 36 |   // will likely get spurious results.
 37 |   class CycleTimer {
 38 |   public:
 39 |     typedef unsigned long long SysClock;
 40 | 
 41 |     //////////
 42 |     // Return the current CPU time, in terms of clock ticks.
 43 |     // Time zero is at some arbitrary point in the past.
 44 |     static SysClock currentTicks() {
 45 | #if defined(__APPLE__) && !defined(__x86_64__)
 46 |       return mach_absolute_time();
 47 | #elif defined(_WIN32)
 48 |       LARGE_INTEGER qwTime;
 49 |       QueryPerformanceCounter(&qwTime);
 50 |       return qwTime.QuadPart;
 51 | #elif defined(__x86_64__)
 52 |       unsigned int a, d;
 53 |       asm volatile("rdtsc" : "=a" (a), "=d" (d));
 54 |       return static_cast<unsigned long long>(a) |
 55 |         (static_cast<unsigned long long>(d) << 32);
 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
 57 |       unsigned int val;
 58 |       asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
 59 |       return val;
 60 | #else
 61 |       timespec spec;
 62 |       clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
 63 |       return CycleTimer::SysClock(static_cast<float>(spec.tv_sec) * 1e9 + static_cast<float>(spec.tv_nsec));
 64 | #endif
 65 |     }
 66 | 
 67 |     //////////
 68 |     // Return the current CPU time, in terms of seconds.
 69 |     // This is slower than currentTicks().  Time zero is at
 70 |     // some arbitrary point in the past.
 71 |     static double currentSeconds() {
 72 |       return currentTicks() * secondsPerTick();
 73 |     }
 74 | 
 75 |     //////////
 76 |     // Return the conversion from seconds to ticks.
 77 |     static double ticksPerSecond() {
 78 |       return 1.0/secondsPerTick();
 79 |     }
 80 | 
 81 |     static const char* tickUnits() {
 82 | #if defined(__APPLE__) && !defined(__x86_64__)
 83 |       return "ns";
 84 | #elif defined(__WIN32__) || defined(__x86_64__)
 85 |       return "cycles";
 86 | #else
 87 |       return "ns"; // clock_gettime
 88 | #endif
 89 |     }
 90 | 
 91 |     //////////
 92 |     // Return the conversion from ticks to seconds.
 93 |     static double secondsPerTick() {  // computed on the first call, then cached
 94 |       static bool initialized = false;     // set once the factor has been computed
 95 |       static double secondsPerTick_val;    // cached result returned by later calls
 96 |       if (initialized) return secondsPerTick_val;
 97 | #if defined(__APPLE__)
 98 |   #ifdef __x86_64__
 99 |       int args[] = {CTL_HW, HW_CPU_FREQ};
100 |       unsigned int Hz;
101 |       size_t len = sizeof(Hz);
102 |       if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
103 |          fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
104 |          exit(-1);
105 |       }
106 |       secondsPerTick_val = 1.0 / (double) Hz;
107 |   #else
108 |       mach_timebase_info_data_t time_info;
109 |       mach_timebase_info(&time_info);
110 | 
111 |       // Scales to nanoseconds without 1e-9f
112 |       secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
113 |         static_cast<double>(time_info.denom);
114 |   #endif // x86_64 or not
115 | #elif defined(_WIN32)
116 |       LARGE_INTEGER qwTicksPerSec;
117 |       QueryPerformanceFrequency(&qwTicksPerSec);
118 |       secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
119 | #else
120 |       FILE *fp = fopen("/proc/cpuinfo","r");
121 |       char input[1024];
122 |       if (!fp) {
123 |          fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
124 |          exit(-1);
125 |       }
126 |       // In case we don't find it, e.g. on the N900
127 |       secondsPerTick_val = 1e-9;
128 |       while (!feof(fp) && fgets(input, 1024, fp)) {
129 |         // NOTE(boulos): Because reading cpuinfo depends on dynamic
130 |         // frequency scaling it's better to read the @ sign first
131 |         float GHz, MHz;
132 |         if (strstr(input, "model name")) {
133 |           char* at_sign = strstr(input, "@");
134 |           if (at_sign) {
135 |             char* after_at = at_sign + 1;
136 |             char* GHz_str = strstr(after_at, "GHz");
137 |             char* MHz_str = strstr(after_at, "MHz");
138 |             if (GHz_str) {
139 |               *GHz_str = '\0';
140 |               if (1 == sscanf(after_at, "%f", &GHz)) {
141 |                 // GHz rating: one tick is 1e-9 / GHz seconds (double constant, no float rounding).
142 |                 secondsPerTick_val = 1e-9 / GHz;
143 |                 break;
144 |               }
145 |             } else if (MHz_str) {
146 |               *MHz_str = '\0';
147 |               if (1 == sscanf(after_at, "%f", &MHz)) {
148 |                 // Must divide by the MHz value just parsed; GHz is unset on this path.
149 |                 secondsPerTick_val = 1e-6 / MHz;
150 |                 break;
151 |               }
152 |             }
153 |           }
154 |         } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
155 |           // Reached only when no earlier "model name" line yielded a frequency.
156 |           secondsPerTick_val = 1e-6 / MHz;
157 |           break;
158 |         }
159 |       }
160 |       fclose(fp);
161 | #endif
162 | 
163 |       initialized = true;
164 |       return secondsPerTick_val;
165 |     }
166 | 
167 |     //////////
168 |     // Return the conversion from ticks to milliseconds.
169 |     static double msPerTick() {
170 |       return 1000.0 * secondsPerTick();  // 1000 milliseconds per second
171 |     }
172 | 
173 |   private:
174 |     CycleTimer();
175 |   };
176 | 
177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_
178 | 


--------------------------------------------------------------------------------
/asst4/common/contracts.h:
--------------------------------------------------------------------------------
 1 | /* Debugging with contracts; simulating cc0 -d
 2 |  * Enable with gcc -DDEBUG ...
 3 |  *
 4 |  * 15-122 Principles of Imperative Computation
 5 |  * Frank Pfenning
 6 |  */
 7 | 
 8 | #include <assert.h>
 9 | 
10 | /* Unlike typical header files, "contracts.h" may be
11 |  * included multiple times, with and without DEBUG defined.
12 |  * For this to succeed we first undefine the macros in
13 |  * question in order to avoid a redefinition warning.
14 |  */
15 | 
16 | #undef ASSERT
17 | #undef REQUIRES
18 | #undef ENSURES
19 | 
20 | #ifdef DEBUG
21 | /* DEBUG build: each contract macro forwards its condition to assert(). */
22 | #define ASSERT(COND) assert(COND)
23 | #define REQUIRES(COND) assert(COND)
24 | #define ENSURES(COND) assert(COND)
25 | 
26 | #else
27 | /* Otherwise each macro expands to a no-op; COND is not evaluated at all. */
28 | #define ASSERT(COND) ((void)0)
29 | #define REQUIRES(COND) ((void)0)
30 | #define ENSURES(COND) ((void)0)
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/asst4/common/grade.h:
--------------------------------------------------------------------------------
  1 | #ifndef __GRADE_H__
  2 | #define __GRADE_H__
  3 | 
  4 | #include <stdio.h>
  5 | #include <sstream>
  6 | #include <iomanip>
  7 | #include <chrono>
  8 | 
  9 | #include <type_traits>
 10 | #include <utility>
 11 | 
 12 | #include <float.h>
 13 | #include <cmath>
 14 | 
 15 | #include <omp.h>
 16 | 
 17 | #include "graph.h"
 18 | #include "graph_internal.h"
 19 | #include "contracts.h"
 20 | 
 21 | // Epsilon for approximate float comparisons
 22 | #define EPSILON 0.00000000001
 23 | 
 24 | // Output column size
 25 | #define COL_SIZE 15
 26 | 
 27 | // Point value for apps that are not run.
 28 | #define POINTS_NA -1
 29 | 
 30 | // Point value for apps that yielded incorrect results.
 31 | #define POINTS_INCORRECT -2
 32 | 
 33 | /*
 34 |  * Printing functions
 35 |  */
 36 | 
 37 | static void sep(std::ostream& out, char separator = '-', int length = 78)
 38 | {
 39 |     // Emit `length` copies of `separator`, then end the line with std::endl.
 40 |     for (int remaining = length; remaining > 0; --remaining) out << separator;
 41 |     out << std::endl;
 42 | }
 43 | 
 44 | static void printTimingApp(std::ostream& timing, const char* appName)
 45 | {
 46 |   // Write the same banner first to std::cout, then to the caller's timing stream.
 47 |   std::ostream* sinks[] = { &std::cout, &timing };
 48 |   for (std::ostream* sink : sinks) {
 49 |     *sink << std::endl;
 50 |     *sink << "Timing results for " << appName << ":" << std::endl;
 51 |     sep(*sink, '=', 75);
 52 |   }
 53 | }
 54 | 
 55 | /*
 56 |  * Correctness checkers
 57 |  */
 58 | 
 59 | template <class T>
 60 | bool compareArrays(Graph graph, T* ref, T* stu)
 61 | {
 62 |   // Exact element-wise comparison over num_nodes entries; the first mismatch is
 63 |   // reported on stderr and ends the scan.
 64 |   int idx = 0;
 65 |   while (idx < graph->num_nodes && !(ref[idx] != stu[idx])) ++idx;
 66 |   if (idx >= graph->num_nodes) return true;
 67 |   std::cerr << "*** Results disagree at " << idx << " expected " 
 68 |     << ref[idx] << " found " << stu[idx] << std::endl;
 69 |   return false;
 70 | }
 71 | 
 72 | template <class T>
 73 | bool compareApprox(Graph graph, T* ref, T* stu)
 74 | {  // true iff every stu[i] lies within EPSILON of ref[i]; the first failure is reported
 75 |   for (int i = 0; i < graph->num_nodes; i++) {
 76 |     // !(diff <= EPSILON), not (diff > EPSILON): a NaN difference must count as a mismatch.
 77 |     if (!(std::fabs(ref[i] - stu[i]) <= EPSILON)) {
 78 |       std::cerr << "*** Results disagree at " << i << " expected " << ref[i] << " found " << stu[i] << std::endl;
 79 |       return false;
 80 |     }
 81 |   }
 82 |   return true;
 83 | }
 84 | 
 85 | template <class T>
 86 | bool compareArraysAndDisplay(Graph graph, T* ref, T*stu) 
 87 | {
 88 |   // Prints the student array and then the reference array as square grids of
 89 |   // side (int)sqrt(num_nodes), each under a banner, then defers to compareArrays.
 90 |   T* const grids[2] = { stu, ref };
 91 |   const char* const titles[2] = { "Visualization of student results",
 92 |                                   "Visualization of reference results" };
 93 |   const char* const rule = "\n----------------------------------\n";
 94 | 
 95 |   for (int which = 0; which < 2; ++which) {
 96 |     fputs(rule, stdout);
 97 |     fputs(titles[which], stdout);
 98 |     fputs(rule, stdout);
 99 |     fputs("\n", stdout);
100 | 
101 |     // Entries beyond side*side (when num_nodes is not a perfect square) are not shown.
102 |     const int side = (int)sqrt(graph->num_nodes);
103 |     for (int row = 0; row < side; ++row) {
104 |       const T* line = grids[which] + row * side;
105 |       for (int col = 0; col < side; ++col)
106 |         printf("%02d ", line[col]);
107 |       printf("\n");
108 |     }
109 |   }
110 | 
111 |   return compareArrays<T>(graph, ref, stu);
112 | }
113 | 
114 | template <class T>
115 | bool compareArraysAndRadiiEst(Graph graph, T* ref, T* stu) 
116 | {
117 |   // Reports every index where ref and stu differ (no early exit), then also
118 |   // requires the maximum entry of each array (seeded with -1) to agree.
119 |   const int count = graph->num_nodes;
120 |   bool allMatch = true;
121 |   for (int k = 0; k < count; k++) {
122 |     if (ref[k] != stu[k]) {
123 |       allMatch = false;
124 |       std::cerr << "*** Results disagree at " << k << " expected "
125 |         << ref[k] << " found " << stu[k] << std::endl;
126 |     }
127 |   }
128 | 
129 |   int refPeak = -1;
130 |   int stuPeak = -1;
131 |   #pragma omp parallel for schedule(dynamic, 512) reduction(max: stuPeak)
132 |   for (int k = 0; k < count; k++)
133 |     if (stu[k] > stuPeak) stuPeak = stu[k];
134 |   #pragma omp parallel for schedule(dynamic, 512) reduction(max: refPeak)
135 |   for (int k = 0; k < count; k++)
136 |     if (ref[k] > refPeak) refPeak = ref[k];
137 | 
138 |   if (stuPeak != refPeak) {
139 |     allMatch = false;
140 |     std::cerr << "*** Radius estimates differ. Expected: " << refPeak << " Got: " << stuPeak << std::endl;
141 |   }
142 |   return allMatch;
143 | }
144 | 
145 | #endif /* __GRADE_H__ */
146 | 


--------------------------------------------------------------------------------
/asst4/common/graph.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GRAPH_H__
 2 | #define __GRAPH_H__
 3 | 
 4 | using Vertex = int;
 5 | 
 6 | struct graph
 7 | {
 8 |     // Number of edges in the graph
 9 |     int num_edges;
10 |     // Number of vertices in the graph
11 |     int num_nodes;
12 |     // Outgoing adjacency, indexed through per-vertex start offsets:
13 |     // The node reached by vertex i's first outgoing edge is given by
14 |     // outgoing_edges[outgoing_starts[i]].  To iterate over all
15 |     // outgoing edges, please see the top-down bfs implementation.
16 |     int* outgoing_starts;
17 |     Vertex* outgoing_edges;
18 |     // Incoming adjacency, accessed the same way (see incoming_begin/incoming_end).
19 |     int* incoming_starts;
20 |     Vertex* incoming_edges;
21 | };
22 | 
23 | using Graph = graph*;
24 | 
25 | /* Getters */
26 | static inline int num_nodes(const Graph);
27 | static inline int num_edges(const Graph);
28 | 
29 | static inline const Vertex* outgoing_begin(const Graph, Vertex);
30 | static inline const Vertex* outgoing_end(const Graph, Vertex);
31 | static inline int outgoing_size(const Graph, Vertex);
32 | 
33 | static inline const Vertex* incoming_begin(const Graph, Vertex);
34 | static inline const Vertex* incoming_end(const Graph, Vertex);
35 | static inline int incoming_size(const Graph, Vertex);
36 | 
37 | 
38 | /* IO */
39 | Graph load_graph(const char* filename);
40 | Graph load_graph_binary(const char* filename);
41 | void store_graph_binary(const char* filename, Graph);
42 | 
43 | void print_graph(const graph*);
44 | 
45 | 
46 | /* Deallocation */
47 | void free_graph(Graph);
48 | 
49 | 
50 | /* Included here to enable inlining. Don't look. */
51 | #include "graph_internal.h"
52 | 
53 | #endif
54 | 


--------------------------------------------------------------------------------
/asst4/common/graph_internal.h:
--------------------------------------------------------------------------------
 1 | #ifndef __GRAPH_INTERNAL_H__
 2 | #define __GRAPH_INTERNAL_H__
 3 | 
 4 | #include <stdlib.h>
 5 | #include "contracts.h"
 6 | 
 7 | static inline int num_nodes(const Graph graph)
 8 | {
 9 |   REQUIRES(NULL != graph);  // contract check; a no-op unless built with DEBUG
10 |   return (*graph).num_nodes;
11 | }
12 | 
13 | static inline int num_edges(const Graph graph)
14 | {
15 |   REQUIRES(NULL != graph);  // contract check; a no-op unless built with DEBUG
16 |   return (*graph).num_edges;
17 | }
18 | 
19 | static inline const Vertex* outgoing_begin(const Graph g, Vertex v)
20 | {
21 |   REQUIRES(NULL != g);
22 |   REQUIRES(v >= 0 && v < num_nodes(g));
23 |   return g->outgoing_starts[v] + g->outgoing_edges;  // start of v's outgoing edge range
24 | }
25 | 
26 | static inline const Vertex* outgoing_end(const Graph g, Vertex v)
27 | {
28 |   REQUIRES(NULL != g);
29 |   REQUIRES(v >= 0 && v < num_nodes(g));
30 |   // The last vertex has no following start offset, so its range ends at num_edges.
31 |   return g->outgoing_edges + ((v != g->num_nodes - 1) ? g->outgoing_starts[v + 1] : g->num_edges);
32 | }
33 | 
34 | static inline int outgoing_size(const Graph g, Vertex v)
35 | {
36 |   REQUIRES(NULL != g);
37 |   REQUIRES(v >= 0 && v < num_nodes(g));
38 |   // A vertex's edge range runs up to the next vertex's start offset, or up to
39 |   // num_edges for the last vertex, which has no following offset.
40 |   const bool is_last = (v == g->num_nodes - 1);
41 |   const int range_end = is_last ? g->num_edges : g->outgoing_starts[v + 1];
42 |   return range_end - g->outgoing_starts[v];
43 | }
44 | 
45 | static inline const Vertex* incoming_begin(const Graph g, Vertex v)
46 | {
47 |   REQUIRES(NULL != g);
48 |   REQUIRES(v >= 0 && v < num_nodes(g));
49 |   return g->incoming_starts[v] + g->incoming_edges;  // start of v's incoming edge range
50 | }
51 | 
52 | static inline const Vertex* incoming_end(const Graph g, Vertex v)
53 | {
54 |   REQUIRES(NULL != g);
55 |   REQUIRES(v >= 0 && v < num_nodes(g));
56 |   // The last vertex has no following start offset, so its range ends at num_edges.
57 |   return g->incoming_edges + ((v != g->num_nodes - 1) ? g->incoming_starts[v + 1] : g->num_edges);
58 | }
59 | 
60 | static inline int incoming_size(const Graph g, Vertex v)
61 | {
62 |   REQUIRES(NULL != g);
63 |   REQUIRES(v >= 0 && v < num_nodes(g));
64 |   // A vertex's edge range runs up to the next vertex's start offset, or up to
65 |   // num_edges for the last vertex, which has no following offset.
66 |   const bool is_last = (v == g->num_nodes - 1);
67 |   const int range_end = is_last ? g->num_edges : g->incoming_starts[v + 1];
68 |   return range_end - g->incoming_starts[v];
69 | }
70 | 
71 | #endif // __GRAPH_INTERNAL_H__
72 | 


--------------------------------------------------------------------------------
/asst4/handout/AMI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/AMI.png


--------------------------------------------------------------------------------
/asst4/handout/instance_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/instance_type.png


--------------------------------------------------------------------------------
/asst4/handout/instance_type_big.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/instance_type_big.png


--------------------------------------------------------------------------------
/asst4/handout/ip_address.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/ip_address.png


--------------------------------------------------------------------------------
/asst4/handout/new_key_pair.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/new_key_pair.png


--------------------------------------------------------------------------------
/asst4/handout/storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/storage.png


--------------------------------------------------------------------------------
/asst4/handout/storage_big.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/storage_big.png


--------------------------------------------------------------------------------
/asst4/imgs/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/imgs/1.png


--------------------------------------------------------------------------------
/asst4/imgs/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/imgs/2.png


--------------------------------------------------------------------------------
/asst4/pagerank/Makefile:
--------------------------------------------------------------------------------
1 | all: default grade
2 | # 'default' builds the pr driver and 'grade' builds pr_grader; both link ref_pr.a.
3 | default: page_rank.cpp main.cpp
4 | 	g++ -I../ -std=c++11 -fopenmp -g -O3 -o pr main.cpp page_rank.cpp ../common/graph.cpp ref_pr.a
5 | grade: page_rank.cpp grade.cpp
6 | 	g++ -I../ -std=c++11 -fopenmp -g -O3 -o pr_grader grade.cpp page_rank.cpp ../common/graph.cpp ref_pr.a
7 | clean:
8 | 	rm -rf pr pr_grader *~ *.*~
9 | 


--------------------------------------------------------------------------------
/asst4/pagerank/grade.cpp:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <omp.h>
  4 | #include <string>
  5 | #include <unistd.h>
  6 | #include <limits>
  7 | 
  8 | #include <iostream>
  9 | #include <sstream>
 10 | #include <vector>
 11 | 
 12 | #include "../common/CycleTimer.h"
 13 | #include "../common/graph.h"
 14 | #include "../common/grade.h"
 15 | #include "page_rank.h"
 16 | 
 17 | #define USE_BINARY_GRAPH 1  // 1: load pre-converted binary graphs; 0: convert a text graph and exit
 18 | // Damping factor and convergence threshold passed to both pageRank implementations.
 19 | #define PageRankDampening 0.3f
 20 | #define PageRankConvergence 1e-7  // plain double literal; the 'd' suffix is a non-standard GNU extension
 21 | 
 22 | void reference_pageRank(Graph g, double* solution, double damping,
 23 |                         double convergence);
 24 | 
 25 | void usage(const char* binary_name) {
 26 |     // Print the command-line synopsis and the supported options to std::cout.
 27 |     std::cout << "Usage: " << binary_name << " [options] graphdir" << std::endl << std::endl
 28 |               << "Options:" << std::endl
 29 |               << "  -n  INT number of threads" << std::endl
 30 |               << "  -r  INT number of runs" << std::endl
 31 |               << "  -h      this commandline help message" << std::endl;
 32 | }
 33 | 
 34 | graph* load_graph(std::string graph_filename) {
 35 |     // Binary mode loads and returns the graph. Text mode converts the text graph to
 36 |     // "<name>.bin" and then exits with status 1, so it never returns to the caller.
 37 |     if (USE_BINARY_GRAPH)
 38 |         return load_graph_binary(graph_filename.c_str());
 39 |     // Pass a C string so the const char* loader declared in graph.h is chosen; passing
 40 |     // the std::string would resolve to this very overload and recurse without end.
 41 |     graph* g = load_graph(graph_filename.c_str());
 42 |     printf("storing binary form of graph!\n");
 43 |     store_graph_binary(graph_filename.append(".bin").c_str(), g);
 44 |     free_graph(g);  // graph.h's deallocator, rather than a bare delete of the struct
 45 |     exit(1);
 46 | }
 47 | 
 48 | double run_on_graph(graph* g, int num_threads, int num_runs, std::string graph_name) {
 49 |     // Times pageRank and reference_pageRank on g (fastest of num_runs runs each), checks
 50 |     // that the results agree, and returns a score from 0 to 4. graph_name is unused.
 51 |     double* sol_stu = new double[g->num_nodes];
 52 |     double* sol_ref = new double[g->num_nodes];
 53 |     omp_set_num_threads(num_threads);
 54 | 
 55 |     double start, time;
 56 |     
 57 |     //Run implementation
 58 |     double stu_time = std::numeric_limits<int>::max();
 59 |     for (int r = 0; r < num_runs; r++) {
 60 |         start = CycleTimer::currentSeconds();
 61 |         pageRank(g, sol_stu, PageRankDampening, PageRankConvergence);
 62 |         //reference_pageRank(g, sol_stu, PageRankDampening, PageRankConvergence);
 63 |         time = CycleTimer::currentSeconds() - start;
 64 |         stu_time = std::min(stu_time, time);
 65 |     }
 66 | 
 67 |     //Run reference implementation
 68 |     double ref_time = std::numeric_limits<int>::max();
 69 |     for (int r = 0; r < num_runs; r++) {
 70 |         start = CycleTimer::currentSeconds();
 71 |         reference_pageRank(g, sol_ref, PageRankDampening, PageRankConvergence);
 72 |         time = CycleTimer::currentSeconds() - start;
 73 |         ref_time = std::min(ref_time, time);
 74 |     }
 75 | 
 76 |     bool correct = compareApprox(g, sol_ref, sol_stu);
 77 |     // Both buffers come from new[], so they must be released with delete[].
 78 |     delete[] sol_stu;
 79 |     delete[] sol_ref;
 80 | 
 81 |     if (!correct) {
 82 |         std::cout << "Page rank incorrect" << std::endl; 
 83 |     } else {
 84 |         std::cout << "ref_time: " <<  ref_time << "s" << std::endl;
 85 |         std::cout << "stu_time: " <<  stu_time << "s" << std::endl;
 86 |     }
 87 | 
 88 |     double max_score = 4;
 89 |     double max_perf_score = 0.8 * max_score;
 90 |     double correctness_score = 0.2 * max_score;
 91 |     correctness_score = (correct) ? correctness_score : 0;
 92 | 
 93 |     double ratio = (ref_time/stu_time);
 94 |     // Linear ramp: a ratio of 0.3 earns no performance credit, 0.7 or more earns it all.
 95 |     double slope = max_perf_score/(0.7 - 0.3);
 96 |     double offset = 0.3 * slope;
 97 | 
 98 |     double perf_score = (correct) ? ratio*slope - offset : 0;
 99 | 
100 |     if (perf_score < 0) perf_score = 0;
101 |     if (perf_score > max_perf_score) perf_score = max_perf_score;
102 | 
103 |     return (correctness_score + perf_score);
104 | }
105 | 
106 | void print_separator_line() {
107 |     // A rule of 43 dashes, terminated with std::endl.
108 |     const std::string rule(43, '-');
109 |     std::cout << rule;
110 |     std::cout << std::endl;
111 | }
112 | 
113 | void print_scores(std::vector<std::string> grade_graphs, std::vector<double> scores) {
114 |     // Prints a "<graph> | <score> / 4 |" row for each graph, where scores[k] belongs
115 |     // to grade_graphs[k], followed by a TOTAL row reported out of 16.
116 |     const std::size_t name_width = 28;  // width of the padded name column
117 | 
118 |     std::cout.precision(5);
119 |     std::cout.setf(std::ios::fixed, std:: ios::floatfield);
120 |     std::cout<<std::endl<<std::endl;
121 | 
122 |     print_separator_line();
123 | 
124 |     std::cout<<"SCORES :"<<std::endl;
125 | 
126 |     print_separator_line();
127 | 
128 |     double total_score = 0.0;
129 |     const std::string max_score = "4";
130 | 
131 |     for (std::size_t g = 0; g < grade_graphs.size(); g++) {
132 |         const std::string& graph_name = grade_graphs[g];
133 | 
134 |         total_score += scores[g];
135 | 
136 |         // Pad explicitly: written as (28 - length()) the unsigned subtraction would
137 |         // wrap around for names longer than the column. Such names get no padding.
138 |         const std::size_t used = graph_name.length();
139 |         const std::string padding(used < name_width ? name_width - used : 0, ' ');
140 | 
141 |         std::cout<<graph_name<<padding;
142 |         std::cout<<"| ";
143 |         std::cout<<"  "<<scores[g]<<" / "<<max_score<<" |"<<std::endl;
144 | 
145 |         print_separator_line();
146 |     }
147 | 
148 |     std::cout<<"TOTAL"<<std::string(name_width - 5, ' ');
149 |     std::cout<<"| ";
150 |     std::cout<<"  "<<total_score<<" / "<<"16"<<" |"<<std::endl;
151 | 
152 |     print_separator_line();
153 | }
154 | 
155 | int main(int argc, char** argv) {
156 |     // Grades pageRank on the four benchmark graphs found in the directory given
157 |     // on the command line and prints a per-graph and total score table.
158 |     int num_threads = omp_get_max_threads();
159 |     int num_runs = 1;
160 | 
161 |     int opt;
162 |     while ((opt = getopt(argc,argv,"n:r:h")) != EOF) {
163 |         switch(opt) {
164 |             case 'n':
165 |                 num_threads = atoi(optarg);
166 |                 break;
167 |             case 'r':
168 |                 num_runs = atoi(optarg);
169 |                 break;
170 |             case 'h':
171 |             case '?':
172 |             default:
173 |                 usage(argv[0]);
174 |                 exit(1);
175 |         }
176 |     }
177 | 
178 |     // atoi() yields 0 for non-numeric text; a thread or run count below 1 would
179 |     // make the timing loops meaningless, so treat it like a missing graphdir.
180 |     if (argc <= optind || num_threads < 1 || num_runs < 1) {
181 |         usage(argv[0]);
182 |         exit(1);
183 |     }
184 | 
185 |     const std::string graph_dir = argv[optind];
186 | 
187 |     printf("Max system threads = %d\n", omp_get_max_threads());
188 |     printf("Running with %d threads\n", num_threads);
189 | 
190 |     std::vector<std::string> grade_graphs = { "soc-livejournal1_68m.graph",
191 |                                               "com-orkut_117m.graph",
192 |                                               "rmat_200m.graph",
193 |                                               "random_500m.graph"};
194 | 
195 |     std::vector<double> scores(grade_graphs.size());
196 | 
197 |     for (std::size_t i = 0; i < grade_graphs.size(); i++) {
198 |         const std::string& graph_name = grade_graphs[i];
199 |         graph* g = load_graph(graph_dir + '/' + graph_name);
200 |         std::cout << "\nGraph: " << graph_name << std::endl;
201 |         scores[i] = run_on_graph(g, num_threads, num_runs, graph_name);
202 |         // Release through graph.h's free_graph rather than a bare delete of the struct.
203 |         free_graph(g);
204 |     }
205 | 
206 |     print_scores(grade_graphs, scores);
207 | 
208 |     return 0;
209 | }
210 | 


--------------------------------------------------------------------------------
/asst4/pagerank/page_rank.cpp:
--------------------------------------------------------------------------------
  1 | #include "page_rank.h"
  2 | 
  3 | #include <stdlib.h>
  4 | #include <cmath>
  5 | #include <omp.h>
  6 | #include <utility>
  7 | #include <vector>
  8 | 
  9 | #include "../common/CycleTimer.h"
 10 | #include "../common/graph.h"
 11 | 
 12 | // #define DEBUG
 13 | 
 14 | // pageRank --
 15 | //
 16 | // g:           graph to process (see common/graph.h)
 17 | // solution:    array of per-vertex vertex scores (length of array is num_nodes(g))
 18 | // damping:     page-rank algorithm's damping parameter
 19 | // convergence: page-rank algorithm's convergence threshold
 20 | //
 21 | void pageRank(Graph g, double* solution, double damping, double convergence)
 22 | {
 23 |   // Repeatedly recomputes every vertex score from the previous pass until the
 24 |   // summed absolute change falls below `convergence`. Double precision scores
 25 |   // are used to avoid underflow for large graphs.
 26 | 
 27 |   const int nodeCount = num_nodes(g);
 28 |   const double uniform = 1.0 / nodeCount;
 29 | 
 30 |   std::vector<double> prev(nodeCount, uniform);  // scores from the previous pass
 31 |   std::vector<double> next(nodeCount);           // scores being computed this pass
 32 | 
 33 |   for (bool done = false; !done; ) {
 34 | 
 35 |     // Share that vertices without outgoing edges hand to every vertex.
 36 |     double sinkShare = 0;
 37 | 
 38 |     #ifndef DEBUG
 39 |     #pragma omp parallel for reduction(+:sinkShare)
 40 |     #endif
 41 |     for (int n = 0; n < nodeCount; ++n) {
 42 |       if (outgoing_size(g, n) != 0) continue;
 43 |       sinkShare += damping * prev[n] / nodeCount;
 44 |     }
 45 | 
 46 |     #ifndef DEBUG
 47 |     #pragma omp parallel for
 48 |     #endif
 49 |     for (int n = 0; n < nodeCount; ++n) {
 50 |       // Gather prev[src] / outdegree(src) over every edge src -> n.
 51 |       double gathered = 0;
 52 |       const Vertex* const last = incoming_end(g, n);
 53 |       for (const Vertex* src = incoming_begin(g, n); src != last; ++src) {
 54 |         gathered += prev[*src] / outgoing_size(g, *src);
 55 |       }
 56 |       gathered = gathered * damping + (1.0 - damping) / nodeCount;
 57 |       next[n] = gathered + sinkShare;
 58 |     }
 59 | 
 60 |     // Total absolute change between the two passes.
 61 |     double delta = 0;
 62 |     #ifndef DEBUG
 63 |     #pragma omp parallel for reduction(+:delta)
 64 |     #endif
 65 |     for (int n = 0; n < nodeCount; ++n) {
 66 |       delta += std::fabs(prev[n] - next[n]);
 67 |     }
 68 | 
 69 |     #ifdef DEBUG
 70 |     printf("DIFF: %lf | CONVER: %lf\n", delta, convergence);
 71 |     #endif
 72 |     prev.swap(next);  // the newest scores feed the next pass
 73 |     done = delta < convergence;
 74 |   }
 75 | 
 76 |   // After the last swap `prev` holds the final scores.
 77 |   memcpy(solution, prev.data(), sizeof(double) * nodeCount);
 78 | 
 79 |   /*
 80 |      CS149 students: Implement the page rank algorithm here.  You
 81 |      are expected to parallelize the algorithm using openMP.  Your
 82 |      solution may need to allocate (and free) temporary arrays.
 83 | 
 84 |      Basic page rank pseudocode is provided below to get you started:
 85 | 
 86 |      // initialization: see example code above
 87 |      score_old[vi] = 1/numNodes;
 88 | 
 89 |      while (!converged) {
 90 | 
 91 |        // compute score_new[vi] for all nodes vi:
 92 |        score_new[vi] = sum over all nodes vj reachable from incoming edges
 93 |                           { score_old[vj] / number of edges leaving vj  }
 94 |        score_new[vi] = (damping * score_new[vi]) + (1.0-damping) / numNodes;
 95 | 
 96 |        score_new[vi] += sum over all nodes v in graph with no outgoing edges
 97 |                           { damping * score_old[v] / numNodes }
 98 | 
 99 |        // compute how much per-node scores have changed
100 |        // quit once algorithm has converged
101 | 
102 |        global_diff = sum over all nodes vi { abs(score_new[vi] - score_old[vi]) };
103 |        converged = (global_diff < convergence)
104 |      }
105 | 
106 |    */
107 | }
108 | 


--------------------------------------------------------------------------------
/asst4/pagerank/page_rank.h:
--------------------------------------------------------------------------------
1 | #ifndef __PAGE_RANK_H__
2 | #define __PAGE_RANK_H__
3 | 
4 | #include "common/graph.h"
5 | // Runs page rank on g and writes one score per vertex into solution (length num_nodes(g)).
6 | void pageRank(Graph g, double* solution, double damping, double convergence);
7 | 
8 | #endif /* __PAGE_RANK_H__ */
9 | 


--------------------------------------------------------------------------------
/asst4/pagerank/ref_pr.a:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/pagerank/ref_pr.a


--------------------------------------------------------------------------------
/asst4/tools/Makefile:
--------------------------------------------------------------------------------
1 | BINARYNAME=graphTools
2 | # 'main' builds ${BINARYNAME} from graphTools.cpp and ../common/graph.cpp.
3 | main:
4 | 	g++ -std=c++11 -g -O3 -o ${BINARYNAME} graphTools.cpp ../common/graph.cpp
5 | clean:
6 | 	rm -rf pr *~ *.*~ ${BINARYNAME}
7 | 


--------------------------------------------------------------------------------
/asst4/tools/plaintext.graph:
--------------------------------------------------------------------------------
 1 | AdjacencyGraph
 2 | # num vertices
 3 | 5
 4 | # num edges
 5 | 8
 6 | # edge starts
 7 | 0 4 6 7 8
 8 | # all the outgoing edges (target vertex)
 9 | 1 2 3 4
10 | 2 3
11 | 0
12 | 0
13 | 


--------------------------------------------------------------------------------