├── README.md ├── asst1 ├── README.md ├── common │ ├── CycleTimer.h │ ├── ppm.cpp │ └── tasksys.cpp ├── imgs │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ ├── Snipaste_2022-02-28_23-37-11.png │ ├── Snipaste_2022-02-28_23-49-32.png │ └── Snipaste_2022-02-28_23-49-52.png ├── prog1_mandelbrot_threads │ ├── Makefile │ ├── main.cpp │ ├── mandelbrot │ ├── mandelbrot-serial.ppm │ ├── mandelbrot-thread.ppm │ ├── mandelbrotSerial.cpp │ ├── mandelbrotThread.cpp │ └── objs │ │ ├── main.o │ │ ├── mandelbrotSerial.o │ │ ├── mandelbrotThread.o │ │ └── ppm.o ├── prog2_vecintrin │ ├── CS149intrin.cpp │ ├── CS149intrin.h │ ├── Makefile │ ├── logger.cpp │ ├── logger.h │ └── main.cpp ├── prog3_mandelbrot_ispc │ ├── Makefile │ ├── main.cpp │ ├── mandelbrot.ispc │ └── mandelbrotSerial.cpp ├── prog4_sqrt │ ├── Makefile │ ├── main.cpp │ ├── sqrt.ispc │ └── sqrtSerial.cpp └── prog5_saxpy │ ├── Makefile │ ├── main.cpp │ ├── saxpy.ispc │ └── saxpySerial.cpp ├── asst2 ├── README.md ├── common │ ├── CycleTimer.h │ └── ppm.cpp ├── figs │ └── task_graph.png ├── part_a │ ├── .gitignore │ ├── Makefile │ ├── itasksys.h │ ├── runtasks_ref_linux │ ├── runtasks_ref_osx_arm │ ├── runtasks_ref_osx_x86 │ ├── tasksys.cpp │ └── tasksys.h ├── part_b │ ├── .gitignore │ ├── Makefile │ ├── itasksys.h │ ├── runtasks_ref_linux │ ├── runtasks_ref_osx_arm │ ├── runtasks_ref_osx_x86 │ ├── tasksys.cpp │ └── tasksys.h ├── tests │ ├── main.cpp │ ├── main_ref.cpp │ ├── run_test_harness.py │ └── tests.h └── tutorial │ ├── Makefile │ ├── README.md │ └── tutorial.cpp ├── asst3 ├── README.md ├── cloud_readme.md ├── handout │ ├── bug_example.jpg │ ├── choose_ami.png │ ├── choose_instance.png │ ├── choose_storage.png │ ├── dependencies.jpg │ ├── gpu_instance.png │ ├── gpu_instance.png_original │ ├── ip_address.png │ ├── location_limit.png │ ├── navigation_quota.png │ ├── new_key_pair.png │ ├── order.jpg │ ├── point_in_circle.jpg │ ├── public_dns.png │ ├── quota_request.png │ ├── teaser.jpg │ ├── vCPU_dashboard.png │ ├── 
vCPU_dashboard_2.png │ └── vCPU_trouble.png ├── install.sh ├── render │ ├── Makefile │ ├── benchmark.cpp │ ├── checker.pl │ ├── checker.py │ ├── circleBoxTest.cu_inl │ ├── circleRenderer.h │ ├── cudaRenderer.cu │ ├── cudaRenderer.h │ ├── cycleTimer.h │ ├── display.cpp │ ├── exclusiveScan.cu_inl │ ├── image.h │ ├── index.html │ ├── lookupColor.cu_inl │ ├── main.cpp │ ├── noise.cpp │ ├── noise.h │ ├── noiseCuda.cu_inl │ ├── platformgl.h │ ├── ppm.cpp │ ├── ppm.h │ ├── refRenderer.cpp │ ├── refRenderer.h │ ├── refTimings.txt │ ├── render_ref │ ├── sceneLoader.cpp │ ├── sceneLoader.h │ ├── snow.par │ └── util.h ├── saxpy │ ├── CycleTimer.h │ ├── Makefile │ ├── main.cpp │ ├── saxpy.cu │ └── tt.asm └── scan │ ├── CycleTimer.h │ ├── Makefile │ ├── checker.pl │ ├── cudaScan_ref │ ├── log.txt │ ├── main.cpp │ └── scan.cu └── asst4 ├── README.md ├── bfs ├── Makefile ├── bfs.cpp ├── bfs.h ├── grade.cpp ├── main.cpp └── ref_bfs.o ├── cloud_readme.md ├── common ├── CycleTimer.h ├── contracts.h ├── grade.h ├── graph.cpp ├── graph.h └── graph_internal.h ├── handout ├── AMI.png ├── instance_type.png ├── instance_type_big.png ├── ip_address.png ├── new_key_pair.png ├── storage.png └── storage_big.png ├── imgs ├── 1.png └── 2.png ├── pagerank ├── Makefile ├── grade.cpp ├── main.cpp ├── page_rank.cpp ├── page_rank.h └── ref_pr.a └── tools ├── Makefile ├── graphTools.cpp └── plaintext.graph /README.md: -------------------------------------------------------------------------------- 1 | # cs149 2 | 3 | 本项目是自学CS149 PARALLEL COMPUTING 完成的所有的课程lab。 4 | 5 | 该门课程对应的CMU15-418课程,斯坦福的内容较CMU少,老师是同一个人。 6 | 7 | 8 | -------------------------------------------------------------------------------- /asst1/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // 
// This uses the cycle counter of the processor.  Different
// processors in the system will have different values for this.  If
// your process moves across processors, then the delta time you
// measure will likely be incorrect.  This is mostly for fine
// grained measurements where the process is likely to be on the
// same processor.  For more global things you should use the
// Time interface.

// Also note that if your processors' speeds change (i.e. processor
// scaling) or if you are in a heterogeneous environment, you will
// likely get spurious results.
class CycleTimer {
public:
    typedef unsigned long long SysClock;

    //////////
    // Return the current CPU time, in terms of clock ticks.
    // Time zero is at some arbitrary point in the past.
    static SysClock currentTicks() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return mach_absolute_time();
#elif defined(_WIN32)
        LARGE_INTEGER qwTime;
        QueryPerformanceCounter(&qwTime);
        return qwTime.QuadPart;
#elif defined(__x86_64__)
        unsigned int a, d;
        asm volatile("rdtsc" : "=a" (a), "=d" (d));
        // Widen to 64 bits before shifting: `d` carries the high half.
        return static_cast<SysClock>(a) |
               (static_cast<SysClock>(d) << 32);
#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
        unsigned int val;
        asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
        return val;
#else
        // Fallback: per-thread CPU time expressed in nanoseconds.  Integer
        // arithmetic keeps the full nanosecond resolution.
        timespec spec;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
        return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL +
               static_cast<SysClock>(spec.tv_nsec);
#endif
    }

    //////////
    // Return the current CPU time, in terms of seconds.
    // This is slower than currentTicks().  Time zero is at
    // some arbitrary point in the past.
    static double currentSeconds() {
        return currentTicks() * secondsPerTick();
    }

    //////////
    // Return the conversion from seconds to ticks.
    static double ticksPerSecond() {
        return 1.0/secondsPerTick();
    }

    //////////
    // Return the name of the unit that currentTicks() counts in.
    static const char* tickUnits() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return "ns";
#elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__)
        // _WIN32 is the macro the rest of this class tests; __WIN32__ is
        // kept so toolchains that only define the old spelling still match.
        return "cycles";
#else
        return "ns"; // clock_gettime
#endif
    }

    //////////
    // Return the conversion from ticks to seconds.
    // The value is computed on the first call and cached in function-local
    // statics; later calls return the cached value.
    static double secondsPerTick() {
        static bool initialized = false;
        static double secondsPerTick_val;
        if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
        int args[] = {CTL_HW, HW_CPU_FREQ};
        unsigned int Hz;
        size_t len = sizeof(Hz);
        if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
            fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
            exit(-1);
        }
        secondsPerTick_val = 1.0 / (double) Hz;
#else
        mach_timebase_info_data_t time_info;
        mach_timebase_info(&time_info);

        // Scales to nanoseconds without 1e-9f
        secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
            static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
        LARGE_INTEGER qwTicksPerSec;
        QueryPerformanceFrequency(&qwTicksPerSec);
        secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#elif defined(__x86_64__)
        // Ticks come from rdtsc, so scale by the CPU clock rate.
        secondsPerTick_val = cpuinfoSecondsPerTick();
#else
        // currentTicks() uses clock_gettime() here, so one tick is already
        // one nanosecond regardless of the CPU clock rate.
        secondsPerTick_val = 1e-9;
#endif

        initialized = true;
        return secondsPerTick_val;
    }

    //////////
    // Return the conversion from ticks to milliseconds.
    static double msPerTick() {
        return secondsPerTick() * 1000.0;
    }

private:
    CycleTimer();

#if !defined(__APPLE__) && !defined(_WIN32) && defined(__x86_64__)
    // Read the clock rate from /proc/cpuinfo and return seconds per cycle.
    // Exits the process if /proc/cpuinfo cannot be opened; returns 1e-9 if
    // no usable frequency line is found.
    static double cpuinfoSecondsPerTick() {
        FILE *fp = fopen("/proc/cpuinfo","r");
        if (!fp) {
            fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
            exit(-1);
        }
        // In case we don't find it, e.g. on the N900
        double result = 1e-9;
        char input[1024];
        while (fgets(input, sizeof(input), fp)) {
            // NOTE(boulos): Because reading cpuinfo depends on dynamic
            // frequency scaling it's better to read the @ sign first
            float GHz = 0.f, MHz = 0.f;
            if (strstr(input, "model name")) {
                char* at_sign = strchr(input, '@');
                if (!at_sign) continue;
                char* after_at = at_sign + 1;
                if (char* GHz_str = strstr(after_at, "GHz")) {
                    *GHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.f) {
                        result = 1e-9 / GHz;
                        break;
                    }
                } else if (char* MHz_str = strstr(after_at, "MHz")) {
                    *MHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.f) {
                        // Divide by the value just parsed (MHz); dividing by
                        // GHz here would use a variable that was never set.
                        result = 1e-6 / MHz;
                        break;
                    }
                }
            } else if (1 == sscanf(input, "cpu MHz : %f", &MHz) && MHz > 0.f) {
                // A reported rate of 0 is skipped so the scale never
                // becomes infinite.
                result = 1e-6 / MHz;
                break;
            }
        }
        fclose(fp);
        return result;
    }
#endif
};

// Write `data` (width*height iteration counts, row-major) to `filename` as a
// binary PPM (P6) grayscale image.  Counts are clamped to `maxIterations`,
// scaled by 1/256 and raised to the power 0.5 to brighten pixels with low
// iteration counts.  If the file cannot be opened an error is printed to
// stderr and nothing is written.
void
writePPMImage(int* data, int width, int height, const char *filename, int maxIterations)
{
    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        fprintf(stderr, "Error: could not open %s for writing\n", filename);
        return;
    }

    // write ppm header
    fprintf(fp, "P6\n");
    fprintf(fp, "%d %d\n", width, height);
    fprintf(fp, "255\n");

    const int numPixels = width * height;
    for (int i = 0; i < numPixels; ++i) {

        // Clamp iteration count for this pixel, then scale the value
        // to 0-1 range.  Raise resulting value to a power (<1) to
        // increase brightness of low iteration count pixels.
        float mapped = std::pow(std::min(static_cast<float>(maxIterations),
                                         static_cast<float>(data[i])) / 256.f, .5f);

        // convert back into 0-255 range, 8-bit channels
        unsigned char result = static_cast<unsigned char>(255.f * mapped);
        for (int j = 0; j < 3; ++j)
            fputc(result, fp);
    }
    fclose(fp);
    printf("Wrote image file %s\n", filename);
}
-------------------------------------------------------------------------------- /asst1/imgs/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/3.png -------------------------------------------------------------------------------- /asst1/imgs/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/4.png -------------------------------------------------------------------------------- /asst1/imgs/Snipaste_2022-02-28_23-37-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-37-11.png -------------------------------------------------------------------------------- /asst1/imgs/Snipaste_2022-02-28_23-49-32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-49-32.png -------------------------------------------------------------------------------- /asst1/imgs/Snipaste_2022-02-28_23-49-52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/imgs/Snipaste_2022-02-28_23-49-52.png -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CXX=g++ -m64 3 | CXXFLAGS=-I../common -Iobjs/ -O3 -std=c++11 -Wall -fPIC 4 | 5 | APP_NAME=mandelbrot 6 | OBJDIR=objs 7 | COMMONDIR=../common 8 | 9 | PPM_CXX=$(COMMONDIR)/ppm.cpp 10 | 
PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 11 | 12 | 13 | default: $(APP_NAME) 14 | 15 | .PHONY: dirs clean 16 | 17 | dirs: 18 | /bin/mkdir -p $(OBJDIR)/ 19 | 20 | clean: 21 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 22 | 23 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrotThread.o $(PPM_OBJ) 24 | 25 | $(APP_NAME): dirs $(OBJS) 26 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm -lpthread 27 | 28 | $(OBJDIR)/%.o: %.cpp 29 | $(CXX) $< $(CXXFLAGS) -c -o $@ 30 | 31 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 32 | $(CXX) $< $(CXXFLAGS) -c -o $@ 33 | 34 | $(OBJDIR)/main.o: $(COMMONDIR)/CycleTimer.h 35 | 36 | -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CycleTimer.h" 6 | 7 | extern void mandelbrotSerial( 8 | float x0, float y0, float x1, float y1, 9 | int width, int height, 10 | int startRow, int numRows, 11 | int maxIterations, 12 | int output[]); 13 | 14 | extern void mandelbrotThread( 15 | int numThreads, 16 | float x0, float y0, float x1, float y1, 17 | int width, int height, 18 | int maxIterations, 19 | int output[]); 20 | 21 | extern void writePPMImage( 22 | int* data, 23 | int width, int height, 24 | const char *filename, 25 | int maxIterations); 26 | 27 | void 28 | scaleAndShift(float& x0, float& x1, float& y0, float& y1, 29 | float scale, 30 | float shiftX, float shiftY) 31 | { 32 | 33 | x0 *= scale; 34 | x1 *= scale; 35 | y0 *= scale; 36 | y1 *= scale; 37 | x0 += shiftX; 38 | x1 += shiftX; 39 | y0 += shiftY; 40 | y1 += shiftY; 41 | 42 | } 43 | 44 | void usage(const char* progname) { 45 | printf("Usage: %s [options]\n", progname); 46 | printf("Program Options:\n"); 47 | printf(" -t --threads Use N threads\n"); 48 | printf(" -v --view Use specified view settings\n"); 49 | printf(" -? 
--help This message\n"); 50 | } 51 | 52 | bool verifyResult (int *gold, int *result, int width, int height) { 53 | 54 | int i, j; 55 | 56 | for (i = 0; i < height; i++) { 57 | for (j = 0; j < width; j++) { 58 | if (gold[i * width + j] != result[i * width + j]) { 59 | printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n", 60 | i, j, gold[i * width + j], result[i * width + j]); 61 | return 0; 62 | } 63 | } 64 | } 65 | 66 | return 1; 67 | } 68 | 69 | int main(int argc, char** argv) { 70 | 71 | const unsigned int width = 1600; 72 | const unsigned int height = 1200; 73 | const int maxIterations = 256; 74 | int numThreads = 2; 75 | 76 | float x0 = -2; 77 | float x1 = 1; 78 | float y0 = -1; 79 | float y1 = 1; 80 | 81 | // parse commandline options //////////////////////////////////////////// 82 | int opt; 83 | static struct option long_options[] = { 84 | {"threads", 1, 0, 't'}, 85 | {"view", 1, 0, 'v'}, 86 | {"help", 0, 0, '?'}, 87 | {0 ,0, 0, 0} 88 | }; 89 | 90 | while ((opt = getopt_long(argc, argv, "t:v:?", long_options, NULL)) != EOF) { 91 | 92 | switch (opt) { 93 | case 't': 94 | { 95 | numThreads = atoi(optarg); 96 | break; 97 | } 98 | case 'v': 99 | { 100 | int viewIndex = atoi(optarg); 101 | // change view settings 102 | if (viewIndex == 2) { 103 | float scaleValue = .015f; 104 | float shiftX = -.986f; 105 | float shiftY = .30f; 106 | scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY); 107 | } else if (viewIndex > 1) { 108 | fprintf(stderr, "Invalid view index\n"); 109 | return 1; 110 | } 111 | break; 112 | } 113 | case '?': 114 | default: 115 | usage(argv[0]); 116 | return 1; 117 | } 118 | } 119 | // end parsing of commandline options 120 | 121 | 122 | int* output_serial = new int[width*height]; 123 | int* output_thread = new int[width*height]; 124 | 125 | // 126 | // Run the serial implementation. Run the code three times and 127 | // take the minimum to get a good estimate. 
128 | // 129 | 130 | double minSerial = 1e30; 131 | for (int i = 0; i < 5; ++i) { 132 | memset(output_serial, 0, width * height * sizeof(int)); 133 | double startTime = CycleTimer::currentSeconds(); 134 | mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial); 135 | double endTime = CycleTimer::currentSeconds(); 136 | minSerial = std::min(minSerial, endTime - startTime); 137 | } 138 | 139 | printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000); 140 | writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations); 141 | 142 | // 143 | // Run the threaded version 144 | // 145 | 146 | double minThread = 1e30; 147 | for (int i = 0; i < 5; ++i) { 148 | memset(output_thread, 0, width * height * sizeof(int)); 149 | double startTime = CycleTimer::currentSeconds(); 150 | mandelbrotThread(numThreads, x0, y0, x1, y1, width, height, maxIterations, output_thread); 151 | double endTime = CycleTimer::currentSeconds(); 152 | minThread = std::min(minThread, endTime - startTime); 153 | } 154 | 155 | printf("[mandelbrot thread]:\t\t[%.3f] ms\n", minThread * 1000); 156 | writePPMImage(output_thread, width, height, "mandelbrot-thread.ppm", maxIterations); 157 | 158 | if (! 
verifyResult (output_serial, output_thread, width, height)) { 159 | printf ("Error : Output from threads does not match serial output\n"); 160 | 161 | delete[] output_serial; 162 | delete[] output_thread; 163 | 164 | return 1; 165 | } 166 | 167 | // compute speedup 168 | printf("\t\t\t\t(%.2fx speedup from %d threads)\n", minSerial/minThread, numThreads); 169 | 170 | delete[] output_serial; 171 | delete[] output_thread; 172 | 173 | return 0; 174 | } 175 | -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/mandelbrot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/mandelbrot-serial.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot-serial.ppm -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/mandelbrot-thread.ppm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst1/prog1_mandelbrot_threads/mandelbrot-thread.ppm -------------------------------------------------------------------------------- /asst1/prog1_mandelbrot_threads/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Note: This code was modified from example code 4 | originally provided by Intel. To comply with Intel's open source 5 | licensing agreement, their copyright is retained below. 
6 | 7 | ----------------------------------------------------------------- 8 | 9 | Copyright (c) 2010-2011, Intel Corporation 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of Intel Corporation nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 28 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 30 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 31 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Iterate z -> z*z + c starting from z = c and return how many iterations
// ran before |z|^2 exceeded 4, or `count` if that never happened.
static inline int mandel(float c_re, float c_im, int count)
{
    float re = c_re;
    float im = c_im;
    int iter = 0;
    while (iter < count) {
        if (re * re + im * im > 4.f)
            break;

        float next_re = re*re - im*im;
        float next_im = 2.f * re * im;
        re = c_re + next_re;
        im = c_im + next_im;
        ++iter;
    }
    return iter;
}

//
// MandelbrotSerial --
//
// Compute an image visualizing the mandelbrot set.  Each output element
// receives the iteration count returned by mandel() for the complex number
// that the pixel maps to.
//
// * x0, y0, x1, y1 describe the complex coordinates mapping
//   into the image viewport.
// * width, height describe the size of the output image
// * startRow, totalRows describe how much of the image to compute
void mandelbrotSerial(
    float x0, float y0, float x1, float y1,
    int width, int height,
    int startRow, int totalRows,
    int maxIterations,
    int output[])
{
    const float dx = (x1 - x0) / width;
    const float dy = (y1 - y0) / height;
    const int lastRow = startRow + totalRows;

    for (int row = startRow; row < lastRow; row++) {
        // The imaginary coordinate is the same for every pixel in a row.
        const float y = y0 + row * dy;
        int* rowOut = output + row * width;
        for (int col = 0; col < width; ++col) {
            const float x = x0 + col * dx;
            rowOut[col] = mandel(x, y, maxIterations);
        }
    }
}

// Per-thread argument bundle used by the threaded implementation.
typedef struct {
    float x0, x1;
    float y0, y1;
    unsigned int width;
    unsigned int height;
    int maxIterations;
    int* output;
    int threadId;
    int numThreads;
} WorkerArgs;
// Iterate z -> z*z + c starting from z = c and return how many iterations
// ran before |z|^2 exceeded 4, or `count` if that never happened.
static inline int mandel(float c_re, float c_im, int count)
{
    float z_re = c_re, z_im = c_im;
    int i;
    for (i = 0; i < count; ++i) {

        if (z_re * z_re + z_im * z_im > 4.f)
            break;

        float new_re = z_re*z_re - z_im*z_im;
        float new_im = 2.f * z_re * z_im;
        z_re = c_re + new_re;
        z_im = c_im + new_im;
    }

    return i;
}

// Strided variant: compute rows startRow, startRow + step, startRow + 2*step,
// ... up to (not including) `height`.  Each thread calls this with its own
// starting row and a common step, so rows are interleaved between threads.
//
// * x0, y0, x1, y1 describe the complex coordinates mapping
//   into the image viewport.
// * width, height describe the size of the output image
// * startRow is the first row to compute; step is the row increment
//
// A non-positive step or a negative startRow does no work: a zero step would
// never advance the row, and a negative one would index before `output`.
static void mandelbrotSerial(
    float x0, float y0, float x1, float y1,
    int width, int height,
    int startRow, int step,
    int maxIterations,
    int output[]) {
    if (step <= 0 || startRow < 0)
        return;

    float dx = (x1 - x0) / width;
    float dy = (y1 - y0) / height;

    for (int j = startRow; j < height; j += step) {
        for (int i = 0; i < width; ++i) {
            float x = x0 + i * dx;
            float y = y0 + j * dy;
            int index = (j * width + i);
            output[index] = mandel(x, y, maxIterations);
        }
    }
}
72 | // unsigned int dh = (args->height + args->numThreads - 1) / args->numThreads; 73 | // mandelbrotSerial(args->x0, args->y0, args->x1, args->y1, args->width, args->height, args->threadId * dh, 74 | // std::min(args->height, (args->threadId + 1) * dh) - args->threadId * dh, args->maxIterations, args->output); 75 | // method2 76 | mandelbrotSerial(args->x0, args->y0, args->x1, args->y1, args->width, args->height, args->threadId, 77 | args->numThreads, args->maxIterations, args->output); 78 | double endTime = CycleTimer::currentSeconds(); 79 | 80 | printf("[mandelbrot threadid %d]: [%.3lf] ms\n", args->threadId, (endTime - startTime) * 1000); 81 | } 82 | 83 | // 84 | // MandelbrotThread -- 85 | // 86 | // Multi-threaded implementation of mandelbrot set image generation. 87 | // Threads of execution are created by spawning std::threads. 88 | void mandelbrotThread( 89 | int numThreads, 90 | float x0, float y0, float x1, float y1, 91 | int width, int height, 92 | int maxIterations, int output[]) 93 | { 94 | static constexpr int MAX_THREADS = 32; 95 | 96 | if (numThreads > MAX_THREADS) 97 | { 98 | fprintf(stderr, "Error: Max allowed threads is %d\n", MAX_THREADS); 99 | exit(1); 100 | } 101 | 102 | // Creates thread objects that do not yet represent a thread. 
103 | std::thread workers[MAX_THREADS]; 104 | WorkerArgs args[MAX_THREADS]; 105 | 106 | for (int i=0; i 8 | #include 9 | #include "logger.h" 10 | 11 | //******************* 12 | //* Type Definition * 13 | //******************* 14 | 15 | extern Logger CS149Logger; 16 | 17 | template 18 | struct __cs149_vec { 19 | T value[VECTOR_WIDTH]; 20 | }; 21 | 22 | // Declare a mask with __cs149_mask 23 | struct __cs149_mask : __cs149_vec {}; 24 | 25 | // Declare a floating point vector register with __cs149_vec_float 26 | #define __cs149_vec_float __cs149_vec 27 | 28 | // Declare an integer vector register with __cs149_vec_int 29 | #define __cs149_vec_int __cs149_vec 30 | 31 | //*********************** 32 | //* Function Definition * 33 | //*********************** 34 | 35 | // Return a mask initialized to 1 in the first N lanes and 0 in the others 36 | __cs149_mask _cs149_init_ones(int first = VECTOR_WIDTH); 37 | 38 | // Return the inverse of maska 39 | __cs149_mask _cs149_mask_not(__cs149_mask &maska); 40 | 41 | // Return (maska | maskb) 42 | __cs149_mask _cs149_mask_or(__cs149_mask &maska, __cs149_mask &maskb); 43 | 44 | // Return (maska & maskb) 45 | __cs149_mask _cs149_mask_and(__cs149_mask &maska, __cs149_mask &maskb); 46 | 47 | // Count the number of 1s in maska 48 | int _cs149_cntbits(__cs149_mask &maska); 49 | 50 | // Set register to value if vector lane is active 51 | // otherwise keep the old value 52 | void _cs149_vset_float(__cs149_vec_float &vecResult, float value, __cs149_mask &mask); 53 | void _cs149_vset_int(__cs149_vec_int &vecResult, int value, __cs149_mask &mask); 54 | // For user's convenience, returns a vector register with all lanes initialized to value 55 | __cs149_vec_float _cs149_vset_float(float value); 56 | __cs149_vec_int _cs149_vset_int(int value); 57 | 58 | // Copy values from vector register src to vector register dest if vector lane active 59 | // otherwise keep the old value 60 | void _cs149_vmove_float(__cs149_vec_float &dest, __cs149_vec_float 
&src, __cs149_mask &mask); 61 | void _cs149_vmove_int(__cs149_vec_int &dest, __cs149_vec_int &src, __cs149_mask &mask); 62 | 63 | // Load values from array src to vector register dest if vector lane active 64 | // otherwise keep the old value 65 | void _cs149_vload_float(__cs149_vec_float &dest, float* src, __cs149_mask &mask); 66 | void _cs149_vload_int(__cs149_vec_int &dest, int* src, __cs149_mask &mask); 67 | 68 | // Store values from vector register src to array dest if vector lane active 69 | // otherwise keep the old value 70 | void _cs149_vstore_float(float* dest, __cs149_vec_float &src, __cs149_mask &mask); 71 | void _cs149_vstore_int(int* dest, __cs149_vec_int &src, __cs149_mask &mask); 72 | 73 | // Return calculation of (veca + vecb) if vector lane active 74 | // otherwise keep the old value 75 | void _cs149_vadd_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 76 | void _cs149_vadd_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 77 | 78 | // Return calculation of (veca - vecb) if vector lane active 79 | // otherwise keep the old value 80 | void _cs149_vsub_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 81 | void _cs149_vsub_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 82 | 83 | // Return calculation of (veca * vecb) if vector lane active 84 | // otherwise keep the old value 85 | void _cs149_vmult_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 86 | void _cs149_vmult_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 87 | 88 | // Return calculation of (veca / vecb) if vector lane active 89 | // otherwise keep the old value 90 | void _cs149_vdiv_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, 
__cs149_mask &mask); 91 | void _cs149_vdiv_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 92 | 93 | 94 | // Return calculation of absolute value abs(veca) if vector lane active 95 | // otherwise keep the old value 96 | void _cs149_vabs_float(__cs149_vec_float &vecResult, __cs149_vec_float &veca, __cs149_mask &mask); 97 | void _cs149_vabs_int(__cs149_vec_int &vecResult, __cs149_vec_int &veca, __cs149_mask &mask); 98 | 99 | // Return a mask of (veca > vecb) if vector lane active 100 | // otherwise keep the old value 101 | void _cs149_vgt_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 102 | void _cs149_vgt_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 103 | 104 | // Return a mask of (veca < vecb) if vector lane active 105 | // otherwise keep the old value 106 | void _cs149_vlt_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 107 | void _cs149_vlt_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 108 | 109 | // Return a mask of (veca == vecb) if vector lane active 110 | // otherwise keep the old value 111 | void _cs149_veq_float(__cs149_mask &vecResult, __cs149_vec_float &veca, __cs149_vec_float &vecb, __cs149_mask &mask); 112 | void _cs149_veq_int(__cs149_mask &vecResult, __cs149_vec_int &veca, __cs149_vec_int &vecb, __cs149_mask &mask); 113 | 114 | // Adds up adjacent pairs of elements, so 115 | // [0 1 2 3] -> [0+1 0+1 2+3 2+3] 116 | void _cs149_hadd_float(__cs149_vec_float &vecResult, __cs149_vec_float &vec); 117 | 118 | // Performs an even-odd interleaving where all even-indexed elements move to front half 119 | // of the array and odd-indexed to the back half, so 120 | // [0 1 2 3 4 5 6 7] -> [0 2 4 6 1 3 5 7] 121 | void _cs149_interleave_float(__cs149_vec_float &vecResult, __cs149_vec_float &vec); 122 | 123 | 
// Add a customized log to help debugging 124 | void addUserLog(const char * logStr); 125 | 126 | #endif 127 | -------------------------------------------------------------------------------- /asst1/prog2_vecintrin/Makefile: -------------------------------------------------------------------------------- 1 | all: myexp 2 | 3 | logger.o: logger.cpp logger.h CS149intrin.h CS149intrin.cpp 4 | g++ -c logger.cpp 5 | 6 | CS149intrin.o: CS149intrin.cpp CS149intrin.h logger.cpp logger.h 7 | g++ -c CS149intrin.cpp 8 | 9 | myexp: CS149intrin.o logger.o main.cpp 10 | g++ -I../common logger.o CS149intrin.o main.cpp -o myexp 11 | 12 | clean: 13 | rm -f *.o myexp *~ 14 | -------------------------------------------------------------------------------- /asst1/prog2_vecintrin/logger.cpp: -------------------------------------------------------------------------------- 1 | #include "logger.h" 2 | #include "CS149intrin.h" 3 | 4 | void Logger::addLog(const char * instruction, __cs149_mask mask, int N) { 5 | Log newLog; 6 | strcpy(newLog.instruction, instruction); 7 | newLog.mask = 0; 8 | for (int i=0; i0); 16 | log.push_back(newLog); 17 | } 18 | 19 | void Logger::printStats() { 20 | printf("****************** Printing Vector Unit Statistics *******************\n"); 21 | printf("Vector Width: %d\n", VECTOR_WIDTH); 22 | printf("Total Vector Instructions: %lld\n", stats.total_instructions); 23 | printf("Vector Utilization: %.1f%%\n", (double)stats.utilized_lane/stats.total_lane*100); 24 | printf("Utilized Vector Lanes: %lld\n", stats.utilized_lane); 25 | printf("Total Vector Lanes: %lld\n", stats.total_lane); 26 | } 27 | 28 | 29 | 30 | void Logger::printLog() { 31 | printf("***************** Printing Vector Unit Execution Log *****************\n"); 32 | printf(" Instruction | Vector Lane Occupancy ('*' for active, '_' for inactive)\n"); 33 | printf("------------- --------------------------------------------------------\n"); 34 | for (int i=0; i 5 | #include 6 | #include 7 | using 
namespace std; 8 | 9 | #define MAX_INST_LEN 32 10 | 11 | struct __cs149_mask; 12 | 13 | struct Log { 14 | char instruction[MAX_INST_LEN]; 15 | unsigned long long mask; // support vector width up to 64 16 | }; 17 | 18 | struct Statistics { 19 | unsigned long long utilized_lane; 20 | unsigned long long total_lane; 21 | unsigned long long total_instructions; 22 | }; 23 | 24 | class Logger { 25 | private: 26 | vector log; 27 | Statistics stats; 28 | 29 | public: 30 | void addLog(const char * instruction, __cs149_mask mask, int N = 0); 31 | void printStats(); 32 | void printLog(); 33 | }; 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /asst1/prog3_mandelbrot_ispc/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ -m64 2 | CXXFLAGS=-I../common -Iobjs/ -O3 -Wall -fPIC 3 | ISPC=ispc 4 | # note: requires AVX2 5 | # disabling AVX2 FMA since it causes a difference in output compared to reference on Mandelbrot 6 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --opt=disable-fma --pic 7 | 8 | APP_NAME=mandelbrot_ispc 9 | OBJDIR=objs 10 | COMMONDIR=../common 11 | 12 | PPM_CXX=$(COMMONDIR)/ppm.cpp 13 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 14 | 15 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp 16 | TASKSYS_LIB=-lpthread 17 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o))) 18 | 19 | default: $(APP_NAME) 20 | 21 | .PHONY: dirs clean 22 | 23 | dirs: 24 | /bin/mkdir -p $(OBJDIR)/ 25 | 26 | clean: 27 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 28 | 29 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/mandelbrotSerial.o $(OBJDIR)/mandelbrot_ispc.o $(PPM_OBJ) $(TASKSYS_OBJ) 30 | 31 | $(APP_NAME): dirs $(OBJS) 32 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB) 33 | 34 | $(OBJDIR)/%.o: %.cpp 35 | $(CXX) $< $(CXXFLAGS) -c -o $@ 36 | 37 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 38 | $(CXX) $< $(CXXFLAGS) -c -o $@ 39 | 40 | 
$(OBJDIR)/main.o: $(OBJDIR)/mandelbrot_ispc.h $(COMMONDIR)/CycleTimer.h 41 | 42 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc 43 | $(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h 44 | 45 | -------------------------------------------------------------------------------- /asst1/prog3_mandelbrot_ispc/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "CycleTimer.h" 6 | #include "mandelbrot_ispc.h" 7 | 8 | extern void mandelbrotSerial( 9 | float x0, float y0, float x1, float y1, 10 | int width, int height, 11 | int startRow, int numRows, 12 | int maxIterations, 13 | int output[]); 14 | 15 | extern void mandelbrotThread( 16 | int numThreads, 17 | float x0, float y0, float x1, float y1, 18 | int width, int height, 19 | int maxIterations, 20 | int output[]); 21 | 22 | extern void writePPMImage( 23 | int* data, 24 | int width, int height, 25 | const char *filename, 26 | int maxIterations); 27 | 28 | bool verifyResult (int *gold, int *result, int width, int height) { 29 | int i, j; 30 | 31 | for (i = 0; i < height; i++) { 32 | for (j = 0; j < width; j++) { 33 | if (gold[i * width + j] != result[i * width + j]) { 34 | printf ("Mismatch : [%d][%d], Expected : %d, Actual : %d\n", 35 | i, j, gold[i * width + j], result[i * width + j]); 36 | return 0; 37 | } 38 | } 39 | } 40 | 41 | return 1; 42 | } 43 | 44 | void 45 | scaleAndShift(float& x0, float& x1, float& y0, float& y1, 46 | float scale, 47 | float shiftX, float shiftY) 48 | { 49 | 50 | x0 *= scale; 51 | x1 *= scale; 52 | y0 *= scale; 53 | y1 *= scale; 54 | x0 += shiftX; 55 | x1 += shiftX; 56 | y0 += shiftY; 57 | y1 += shiftY; 58 | 59 | } 60 | 61 | using namespace ispc; 62 | 63 | void usage(const char* progname) { 64 | printf("Usage: %s [options]\n", progname); 65 | printf("Program Options:\n"); 66 | printf(" -t --tasks Run ISPC code implementation with tasks\n"); 67 | printf(" -v --view Use specified 
view settings\n"); 68 | printf(" -? --help This message\n"); 69 | } 70 | 71 | 72 | int main(int argc, char** argv) { 73 | 74 | const unsigned int width = 1200; 75 | const unsigned int height = 800; 76 | const int maxIterations = 256; 77 | 78 | float x0 = -2; 79 | float x1 = 1; 80 | float y0 = -1; 81 | float y1 = 1; 82 | 83 | bool useTasks = false; 84 | 85 | // parse commandline options //////////////////////////////////////////// 86 | int opt; 87 | static struct option long_options[] = { 88 | {"tasks", 0, 0, 't'}, 89 | {"view", 1, 0, 'v'}, 90 | {"help", 0, 0, '?'}, 91 | {0 ,0, 0, 0} 92 | }; 93 | 94 | while ((opt = getopt_long(argc, argv, "tv:?", long_options, NULL)) != EOF) { 95 | 96 | switch (opt) { 97 | case 't': 98 | useTasks = true; 99 | break; 100 | case 'v': 101 | { 102 | int viewIndex = atoi(optarg); 103 | // change view settings 104 | if (viewIndex == 2) { 105 | float scaleValue = .015f; 106 | float shiftX = -.986f; 107 | float shiftY = .30f; 108 | scaleAndShift(x0, x1, y0, y1, scaleValue, shiftX, shiftY); 109 | } else if (viewIndex > 1) { 110 | fprintf(stderr, "Invalid view index\n"); 111 | return 1; 112 | } 113 | break; 114 | } 115 | case '?': 116 | default: 117 | usage(argv[0]); 118 | return 1; 119 | } 120 | } 121 | // end parsing of commandline options 122 | 123 | int *output_serial = new int[width*height]; 124 | int *output_ispc = new int[width*height]; 125 | int *output_ispc_tasks = new int[width*height]; 126 | 127 | for (unsigned int i = 0; i < width * height; ++i) 128 | output_serial[i] = 0; 129 | 130 | // 131 | // Run the serial implementation. Report the minimum time of three 132 | // runs for robust timing. 
133 | // 134 | double minSerial = 1e30; 135 | for (int i = 0; i < 3; ++i) { 136 | double startTime = CycleTimer::currentSeconds(); 137 | mandelbrotSerial(x0, y0, x1, y1, width, height, 0, height, maxIterations, output_serial); 138 | double endTime = CycleTimer::currentSeconds(); 139 | minSerial = std::min(minSerial, endTime - startTime); 140 | } 141 | 142 | printf("[mandelbrot serial]:\t\t[%.3f] ms\n", minSerial * 1000); 143 | writePPMImage(output_serial, width, height, "mandelbrot-serial.ppm", maxIterations); 144 | 145 | // Clear out the buffer 146 | for (unsigned int i = 0; i < width * height; ++i) 147 | output_ispc[i] = 0; 148 | 149 | // 150 | // Compute the image using the ispc implementation 151 | // 152 | double minISPC = 1e30; 153 | for (int i = 0; i < 3; ++i) { 154 | double startTime = CycleTimer::currentSeconds(); 155 | mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, output_ispc); 156 | double endTime = CycleTimer::currentSeconds(); 157 | minISPC = std::min(minISPC, endTime - startTime); 158 | } 159 | 160 | printf("[mandelbrot ispc]:\t\t[%.3f] ms\n", minISPC * 1000); 161 | writePPMImage(output_ispc, width, height, "mandelbrot-ispc.ppm", maxIterations); 162 | 163 | 164 | if (! 
verifyResult (output_serial, output_ispc, width, height)) { 165 | printf ("Error : ISPC output differs from sequential output\n"); 166 | 167 | delete[] output_serial; 168 | delete[] output_ispc; 169 | delete[] output_ispc_tasks; 170 | 171 | return 1; 172 | } 173 | 174 | // Clear out the buffer 175 | for (unsigned int i = 0; i < width * height; ++i) { 176 | output_ispc_tasks[i] = 0; 177 | } 178 | 179 | double minTaskISPC = 1e30; 180 | if (useTasks) { 181 | // 182 | // Tasking version of the ISPC code 183 | // 184 | for (int i = 0; i < 3; ++i) { 185 | double startTime = CycleTimer::currentSeconds(); 186 | mandelbrot_ispc_withtasks(x0, y0, x1, y1, width, height, maxIterations, output_ispc_tasks); 187 | double endTime = CycleTimer::currentSeconds(); 188 | minTaskISPC = std::min(minTaskISPC, endTime - startTime); 189 | } 190 | 191 | printf("[mandelbrot multicore ispc]:\t[%.3f] ms\n", minTaskISPC * 1000); 192 | writePPMImage(output_ispc_tasks, width, height, "mandelbrot-task-ispc.ppm", maxIterations); 193 | 194 | if (! 
verifyResult (output_serial, output_ispc_tasks, width, height)) { 195 | printf ("Error : ISPC output differs from sequential output\n"); 196 | return 1; 197 | } 198 | } 199 | 200 | printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); 201 | if (useTasks) { 202 | printf("\t\t\t\t(%.2fx speedup from task ISPC)\n", minSerial/minTaskISPC); 203 | } 204 | 205 | delete[] output_serial; 206 | delete[] output_ispc; 207 | delete[] output_ispc_tasks; 208 | 209 | 210 | return 0; 211 | } 212 | -------------------------------------------------------------------------------- /asst1/prog3_mandelbrot_ispc/mandelbrot.ispc: -------------------------------------------------------------------------------- 1 | 2 | 3 | static inline int mandel(float c_re, float c_im, int count) { 4 | float z_re = c_re, z_im = c_im; 5 | int i; 6 | for (i = 0; i < count; ++i) { 7 | 8 | if (z_re * z_re + z_im * z_im > 4.f) 9 | break; 10 | 11 | float new_re = z_re*z_re - z_im*z_im; 12 | float new_im = 2.f * z_re * z_im; 13 | z_re = c_re + new_re; 14 | z_im = c_im + new_im; 15 | } 16 | 17 | return i; 18 | } 19 | 20 | export void mandelbrot_ispc(uniform float x0, uniform float y0, 21 | uniform float x1, uniform float y1, 22 | uniform int width, uniform int height, 23 | uniform int maxIterations, 24 | uniform int output[]) 25 | { 26 | float dx = (x1 - x0) / width; 27 | float dy = (y1 - y0) / height; 28 | 29 | foreach (j = 0 ... height, i = 0 ... 
width) { 30 | float x = x0 + i * dx; 31 | float y = y0 + j * dy; 32 | 33 | int index = j * width + i; 34 | output[index] = mandel(x, y, maxIterations); 35 | } 36 | } 37 | 38 | // slightly different kernel to support tasking 39 | task void mandelbrot_ispc_task(uniform float x0, uniform float y0, 40 | uniform float x1, uniform float y1, 41 | uniform int width, uniform int height, 42 | uniform int rowsPerTask, 43 | uniform int maxIterations, 44 | uniform int output[]) 45 | { 46 | 47 | // taskIndex is an ISPC built-in 48 | 49 | uniform int ystart = taskIndex * rowsPerTask; 50 | uniform int yend = ystart + rowsPerTask; 51 | 52 | uniform float dx = (x1 - x0) / width; 53 | uniform float dy = (y1 - y0) / height; 54 | 55 | foreach (j = ystart ... yend, i = 0 ... width) { 56 | float x = x0 + i * dx; 57 | float y = y0 + j * dy; 58 | 59 | int index = j * width + i; 60 | output[index] = mandel(x, y, maxIterations); 61 | } 62 | } 63 | 64 | export void mandelbrot_ispc_withtasks(uniform float x0, uniform float y0, 65 | uniform float x1, uniform float y1, 66 | uniform int width, uniform int height, 67 | uniform int maxIterations, 68 | uniform int output[]) 69 | { 70 | 71 | uniform int rowsPerTask = height / 16; 72 | 73 | // create 16 tasks 74 | launch[16] mandelbrot_ispc_task(x0, y0, x1, y1, 75 | width, height, 76 | rowsPerTask, 77 | maxIterations, 78 | output); 79 | } 80 | -------------------------------------------------------------------------------- /asst1/prog3_mandelbrot_ispc/mandelbrotSerial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | 15418 Spring 2012 note: This code was modified from example code 4 | originally provided by Intel. To comply with Intel's open source 5 | licensing agreement, their copyright is retained below. 6 | 7 | ----------------------------------------------------------------- 8 | 9 | Copyright (c) 2010-2011, Intel Corporation 10 | All rights reserved. 
11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are 14 | met: 15 | 16 | * Redistributions of source code must retain the above copyright 17 | notice, this list of conditions and the following disclaimer. 18 | 19 | * Redistributions in binary form must reproduce the above copyright 20 | notice, this list of conditions and the following disclaimer in the 21 | documentation and/or other materials provided with the distribution. 22 | 23 | * Neither the name of Intel Corporation nor the names of its 24 | contributors may be used to endorse or promote products derived from 25 | this software without specific prior written permission. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 28 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 30 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 31 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 32 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 33 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 34 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 35 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 36 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
38 | */ 39 | 40 | 41 | static inline int mandel(float c_re, float c_im, int count) 42 | { 43 | float z_re = c_re, z_im = c_im; 44 | int i; 45 | for (i = 0; i < count; ++i) { 46 | 47 | if (z_re * z_re + z_im * z_im > 4.f) 48 | break; 49 | 50 | float new_re = z_re*z_re - z_im*z_im; 51 | float new_im = 2.f * z_re * z_im; 52 | z_re = c_re + new_re; 53 | z_im = c_im + new_im; 54 | } 55 | 56 | return i; 57 | } 58 | 59 | // 60 | // MandelbrotSerial -- 61 | // 62 | // Compute an image visualizing the mandelbrot set. The resulting 63 | // array contains the number of iterations required before the complex 64 | // number corresponding to a pixel could be rejected from the set. 65 | // 66 | // * x0, y0, x1, y1 describe the complex coordinates mapping 67 | // into the image viewport. 68 | // * width, height describe the size of the output image 69 | // * startRow, totalRows describe how much of the image to compute 70 | void mandelbrotSerial( 71 | float x0, float y0, float x1, float y1, 72 | int width, int height, 73 | int startRow, int totalRows, 74 | int maxIterations, 75 | int output[]) 76 | { 77 | float dx = (x1 - x0) / width; 78 | float dy = (y1 - y0) / height; 79 | 80 | int endRow = startRow + totalRows; 81 | 82 | for (int j = startRow; j < endRow; j++) { 83 | for (int i = 0; i < width; ++i) { 84 | float x = x0 + i * dx; 85 | float y = y0 + j * dy; 86 | 87 | int index = (j * width + i); 88 | output[index] = mandel(x, y, maxIterations); 89 | } 90 | } 91 | } 92 | 93 | -------------------------------------------------------------------------------- /asst1/prog4_sqrt/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ -m64 -march=native 2 | CXXFLAGS=-I../common -Iobjs/ -O3 -Wall 3 | ISPC=ispc 4 | # note: requires AVX2 capable machine 5 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --pic 6 | 7 | 8 | APP_NAME=sqrt 9 | OBJDIR=objs 10 | COMMONDIR=../common 11 | 12 | PPM_CXX=$(COMMONDIR)/ppm.cpp 13 | PPM_OBJ=$(addprefix 
$(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 14 | 15 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp 16 | TASKSYS_LIB=-lpthread 17 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o))) 18 | 19 | default: $(APP_NAME) 20 | 21 | .PHONY: dirs clean 22 | 23 | dirs: 24 | /bin/mkdir -p $(OBJDIR)/ 25 | 26 | clean: 27 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 28 | 29 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/sqrtSerial.o $(OBJDIR)/sqrt_ispc.o $(PPM_OBJ) $(TASKSYS_OBJ) 30 | 31 | $(APP_NAME): dirs $(OBJS) 32 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB) 33 | 34 | $(OBJDIR)/%.o: %.cpp 35 | $(CXX) $< $(CXXFLAGS) -c -o $@ 36 | 37 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 38 | $(CXX) $< $(CXXFLAGS) -c -o $@ 39 | 40 | $(OBJDIR)/main.o: $(OBJDIR)/$(APP_NAME)_ispc.h $(COMMONDIR)/CycleTimer.h 41 | 42 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc 43 | $(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h 44 | 45 | -------------------------------------------------------------------------------- /asst1/prog4_sqrt/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "CycleTimer.h" 7 | #include "sqrt_ispc.h" 8 | 9 | using namespace ispc; 10 | 11 | extern void sqrtSerial(int N, float startGuess, float* values, float* output); 12 | 13 | static void verifyResult(int N, float* result, float* gold) { 14 | for (int i=0; i 1e-4) { 16 | printf("Error: [%d] Got %f expected %f\n", i, result[i], gold[i]); 17 | } 18 | } 19 | } 20 | 21 | int main() { 22 | 23 | const unsigned int N = 20 * 1000 * 1000; 24 | const float initialGuess = 1.0f; 25 | 26 | float* values = new float[N]; 27 | float* output = new float[N]; 28 | float* gold = new float[N]; 29 | 30 | for (unsigned int i=0; i(rand()) / RAND_MAX; 39 | // Q2 40 | // values[i] = 2.998f; 41 | // Q3 42 | if (i % 8) { 43 | values[i] = 1.0f; 44 | } else { 45 | values[i] = 2.998f; 46 | } 47 | } 48 | 
49 | // generate a gold version to check results 50 | for (unsigned int i=0; i kThreshold) { 17 | guess = (3.f * guess - x * guess * guess * guess) * 0.5f; 18 | pred = abs(guess * guess * x - 1.f); 19 | } 20 | 21 | output[i] = x * guess; 22 | 23 | } 24 | } 25 | 26 | task void sqrt_ispc_task(uniform int N, 27 | uniform int span, 28 | uniform float initialGuess, 29 | uniform float values[], 30 | uniform float output[]) 31 | { 32 | 33 | uniform int indexStart = taskIndex * span; 34 | uniform int indexEnd = min(N, indexStart + span); 35 | 36 | foreach (i = indexStart ... indexEnd) { 37 | 38 | float x = values[i]; 39 | float guess = initialGuess; 40 | 41 | float pred = abs(guess * guess * x - 1.f); 42 | 43 | while (pred > kThreshold) { 44 | guess = (3.f * guess - x * guess * guess * guess) * 0.5f; 45 | pred = abs(guess * guess * x - 1.f); 46 | } 47 | 48 | output[i] = x * guess; 49 | 50 | } 51 | } 52 | 53 | export void sqrt_ispc_withtasks(uniform int N, 54 | uniform float initialGuess, 55 | uniform float values[], 56 | uniform float output[]) 57 | { 58 | 59 | uniform int span = N / 64; // 64 tasks 60 | 61 | launch[N/span] sqrt_ispc_task(N, span, initialGuess, values, output); 62 | } 63 | -------------------------------------------------------------------------------- /asst1/prog4_sqrt/sqrtSerial.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void sqrtSerial(int N, 7 | float initialGuess, 8 | float values[], 9 | float output[]) 10 | { 11 | 12 | static const float kThreshold = 0.00001f; 13 | 14 | for (int i=0; i kThreshold) { 22 | guess = (3.f * guess - x * guess * guess * guess) * 0.5f; 23 | error = fabs(guess * guess * x - 1.f); 24 | } 25 | 26 | output[i] = x * guess; 27 | } 28 | } 29 | 30 | -------------------------------------------------------------------------------- /asst1/prog5_saxpy/Makefile: -------------------------------------------------------------------------------- 1 | 
CXX=g++ -m64 2 | CXXFLAGS=-I../common -Iobjs/ -O2 -Wall 3 | ISPC=ispc 4 | # note: requires AVX2 5 | ISPCFLAGS=-O3 --target=avx2-i32x8 --arch=x86-64 --pic 6 | 7 | APP_NAME=saxpy 8 | OBJDIR=objs 9 | COMMONDIR=../common 10 | 11 | TASKSYS_CXX=$(COMMONDIR)/tasksys.cpp 12 | TASKSYS_LIB=-lpthread 13 | TASKSYS_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(TASKSYS_CXX:.cpp=.o))) 14 | 15 | default: $(APP_NAME) 16 | 17 | .PHONY: dirs clean 18 | 19 | dirs: 20 | /bin/mkdir -p $(OBJDIR)/ 21 | 22 | clean: 23 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 24 | 25 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/saxpySerial.o $(OBJDIR)/saxpy_ispc.o $(TASKSYS_OBJ) 26 | 27 | $(APP_NAME): dirs $(OBJS) 28 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) -lm $(TASKSYS_LIB) 29 | 30 | $(OBJDIR)/%.o: %.cpp 31 | $(CXX) $< $(CXXFLAGS) -c -o $@ 32 | 33 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 34 | $(CXX) $< $(CXXFLAGS) -c -o $@ 35 | 36 | $(OBJDIR)/main.o: $(OBJDIR)/$(APP_NAME)_ispc.h $(COMMONDIR)/CycleTimer.h 37 | 38 | $(OBJDIR)/%_ispc.h $(OBJDIR)//%_ispc.o: %.ispc 39 | $(ISPC) $(ISPCFLAGS) $< -o $(OBJDIR)/$*_ispc.o -h $(OBJDIR)/$*_ispc.h 40 | 41 | -------------------------------------------------------------------------------- /asst1/prog5_saxpy/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "CycleTimer.h" 5 | #include "saxpy_ispc.h" 6 | 7 | extern void saxpySerial(int N, float a, float* X, float* Y, float* result); 8 | 9 | 10 | // return GB/s 11 | static float 12 | toBW(int bytes, float sec) { 13 | return static_cast(bytes) / (1024. * 1024. * 1024.) 
/ sec; 14 | } 15 | 16 | static float 17 | toGFLOPS(int ops, float sec) { 18 | return static_cast(ops) / 1e9 / sec; 19 | } 20 | 21 | static void verifyResult(int N, float* result, float* gold) { 22 | for (int i=0; i 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // your process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if your processors' speeds change (i.e. processor 35 | // scaling) or if you are in a heterogeneous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 
57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 
static double secondsPerTick() {
    // Computes the duration of one timer tick in seconds and caches it in
    // function-local statics, so the platform query below runs only on the
    // first call.
    //
    //  * Apple x86_64: reads the CPU frequency through sysctl(HW_CPU_FREQ).
    //  * Apple (other): uses the mach timebase numer/denom ratio.
    //  * Windows:       uses QueryPerformanceFrequency.
    //  * Otherwise:     scans /proc/cpuinfo for a clock frequency and falls
    //                   back to 1e-9 (one tick == 1 ns) if none is found.
    //
    // Exits the process with status -1 if the sysctl query fails or
    // /proc/cpuinfo cannot be opened.
    //
    // NOTE(review): `initialized` is a plain static bool, so concurrent first
    // calls are not synchronized with each other.
    static bool initialized = false;
    static double secondsPerTick_val;
    if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
    int args[] = {CTL_HW, HW_CPU_FREQ};
    unsigned int Hz;
    size_t len = sizeof(Hz);
    if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
        fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
        exit(-1);
    }
    secondsPerTick_val = 1.0 / (double) Hz;
#else
    mach_timebase_info_data_t time_info;
    mach_timebase_info(&time_info);

    // numer/denom converts ticks to nanoseconds; 1e-9 converts those to seconds.
    secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
        static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
    LARGE_INTEGER qwTicksPerSec;
    QueryPerformanceFrequency(&qwTicksPerSec);
    secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#else
    FILE *fp = fopen("/proc/cpuinfo","r");
    char input[1024];
    if (!fp) {
        fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
        exit(-1);
    }
    // In case we don't find it, e.g. on the N900
    secondsPerTick_val = 1e-9;
    while (!feof(fp) && fgets(input, 1024, fp)) {
        // NOTE(boulos): Because reading cpuinfo depends on dynamic
        // frequency scaling it's better to read the @ sign first
        float GHz, MHz;
        if (strstr(input, "model name")) {
            char* at_sign = strstr(input, "@");
            if (at_sign) {
                char* after_at = at_sign + 1;
                char* GHz_str = strstr(after_at, "GHz");
                char* MHz_str = strstr(after_at, "MHz");
                if (GHz_str) {
                    // Cut the line at the unit so sscanf sees only the number.
                    *GHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &GHz)) {
                        //printf("GHz = %f\n", GHz);
                        secondsPerTick_val = 1e-9f / GHz;
                        break;
                    }
                } else if (MHz_str) {
                    *MHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &MHz)) {
                        //printf("MHz = %f\n", MHz);
                        // Divide by the MHz value that was just parsed. The
                        // previous code divided by GHz, which is never
                        // assigned on this path.
                        secondsPerTick_val = 1e-6f / MHz;
                        break;
                    }
                }
            }
        } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) {
            //printf("MHz = %f\n", MHz);
            secondsPerTick_val = 1e-6f / MHz;
            break;
        }
    }
    fclose(fp);
#endif

    initialized = true;
    return secondsPerTick_val;
}

//////////
// Return the conversion from ticks to milliseconds.
169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /asst2/common/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | 8 | void 9 | writePPMImage(int* data, int width, int height, const char *filename, int maxIterations) 10 | { 11 | FILE *fp = fopen(filename, "wb"); 12 | 13 | // write ppm header 14 | fprintf(fp, "P6\n"); 15 | fprintf(fp, "%d %d\n", width, height); 16 | fprintf(fp, "255\n"); 17 | 18 | for (int i = 0; i < width*height; ++i) { 19 | 20 | // Clamp iteration count for this pixel, then scale the value 21 | // to 0-1 range. Raise resulting value to a power (<1) to 22 | // increase brightness of low iteration count 23 | // pixels. a.k.a. Make things look cooler. 24 | 25 | float mapped = pow( std::min(static_cast(maxIterations), 26 | static_cast(data[i])) / 256.f, .5f); 27 | 28 | // convert back into 0-255 range, 8-bit channels 29 | unsigned char result = static_cast(255.f * mapped); 30 | for (int j = 0; j < 3; ++j) 31 | fputc(result, fp); 32 | } 33 | fclose(fp); 34 | printf("Wrote image file %s\n", filename); 35 | } 36 | -------------------------------------------------------------------------------- /asst2/figs/task_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/figs/task_graph.png -------------------------------------------------------------------------------- /asst2/part_a/.gitignore: -------------------------------------------------------------------------------- 1 | objs/ 2 | runtasks 3 | -------------------------------------------------------------------------------- /asst2/part_a/Makefile: 
-------------------------------------------------------------------------------- 1 | CXX=g++ -m64 2 | CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -O3 -std=c++11 -Wall 3 | 4 | APP_NAME=runtasks 5 | OBJDIR=objs 6 | COMMONDIR=../common 7 | 8 | PPM_CXX=$(COMMONDIR)/ppm.cpp 9 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 10 | 11 | default: $(APP_NAME) 12 | 13 | .PHONY: dirs clean 14 | 15 | dirs: 16 | /bin/mkdir -p $(OBJDIR)/ 17 | 18 | clean: 19 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 20 | 21 | OBJS=$(PPM_OBJ) $(OBJDIR)/tasksys.o 22 | 23 | $(APP_NAME): clean dirs $(OBJS) 24 | $(CXX) ../tests/main.cpp $(CXXFLAGS) -o $@ $(OBJDIR)/tasksys.o -lm -lpthread 25 | 26 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 27 | $(CXX) $< $(CXXFLAGS) -c -o $@ 28 | 29 | $(OBJDIR)/%.o: %.cpp 30 | $(CXX) $< $(CXXFLAGS) -c -o $@ 31 | -------------------------------------------------------------------------------- /asst2/part_a/itasksys.h: -------------------------------------------------------------------------------- 1 | #ifndef _ITASKSYS_H 2 | #define _ITASKSYS_H 3 | #include 4 | 5 | typedef int TaskID; 6 | 7 | class IRunnable { 8 | public: 9 | virtual ~IRunnable(); 10 | 11 | /* 12 | Executes an instance of the task as part of a bulk task launch. 13 | 14 | - task_id: the current task identifier. This value will be 15 | between 0 and num_total_tasks-1. 16 | 17 | - num_total_tasks: the total number of tasks in the bulk 18 | task launch. 19 | */ 20 | virtual void runTask(int task_id, int num_total_tasks) = 0; 21 | }; 22 | 23 | class ITaskSystem { 24 | public: 25 | /* 26 | Instantiates a task system. 27 | 28 | - num_threads: the maximum number of threads that the task system 29 | can use. 30 | */ 31 | ITaskSystem(int num_threads); 32 | virtual ~ITaskSystem(); 33 | virtual const char* name() = 0; 34 | 35 | /* 36 | Executes a bulk task launch of num_total_tasks. 
Task 37 | execution is synchronous with the calling thread, so run() 38 | will return only when the execution of all tasks is 39 | complete. 40 | */ 41 | virtual void run(IRunnable* runnable, int num_total_tasks) = 0; 42 | 43 | /* 44 | Executes an asynchronous bulk task launch of 45 | num_total_tasks, but with a dependency on prior launched 46 | tasks. 47 | 48 | 49 | The task runtime must complete execution of the tasks 50 | associated with all bulk task launches referenced in the 51 | array `deps` before beginning execution of *any* task in 52 | this bulk task launch. 53 | 54 | The caller must invoke sync() to guarantee completion of the 55 | tasks in this bulk task launch. 56 | 57 | Returns an identifier that can be used in subsequent calls to 58 | runAsyncWithDeps() to specify a dependency of some future 59 | bulk task launch on this bulk task launch. 60 | */ 61 | virtual TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 62 | const std::vector& deps) = 0; 63 | 64 | /* 65 | Blocks until all tasks created as a result of **any prior** 66 | runXXX calls are done. 
67 | */ 68 | virtual void sync() = 0; 69 | }; 70 | #endif 71 | -------------------------------------------------------------------------------- /asst2/part_a/runtasks_ref_linux: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_linux -------------------------------------------------------------------------------- /asst2/part_a/runtasks_ref_osx_arm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_osx_arm -------------------------------------------------------------------------------- /asst2/part_a/runtasks_ref_osx_x86: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_a/runtasks_ref_osx_x86 -------------------------------------------------------------------------------- /asst2/part_a/tasksys.h: -------------------------------------------------------------------------------- 1 | #ifndef _TASKSYS_H 2 | #define _TASKSYS_H 3 | 4 | #include "itasksys.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /* 14 | * TaskSystemSerial: This class is the student's implementation of a 15 | * serial task execution engine. See definition of ITaskSystem in 16 | * itasksys.h for documentation of the ITaskSystem interface. 
17 | */ 18 | class TaskSystemSerial: public ITaskSystem { 19 | public: 20 | TaskSystemSerial(int num_threads); 21 | ~TaskSystemSerial(); 22 | const char* name(); 23 | void run(IRunnable* runnable, int num_total_tasks); 24 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 25 | const std::vector& deps); 26 | void sync(); 27 | }; 28 | 29 | /* 30 | * TaskSystemParallelSpawn: This class is the student's implementation of a 31 | * parallel task execution engine that spawns threads in every run() 32 | * call. See definition of ITaskSystem in itasksys.h for documentation 33 | * of the ITaskSystem interface. 34 | */ 35 | class TaskSystemParallelSpawn: public ITaskSystem { 36 | public: 37 | TaskSystemParallelSpawn(int num_threads); 38 | ~TaskSystemParallelSpawn(); 39 | const char* name(); 40 | void run(IRunnable* runnable, int num_total_tasks); 41 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 42 | const std::vector& deps); 43 | void sync(); 44 | private: 45 | /* NOTE(review): presumably the number of threads spawned by each run() call, taken from the constructor argument -- confirm in tasksys.cpp. */ int num_thread_; 46 | /* NOTE(review): presumably the shared index of the next task to be claimed by a spawned thread -- confirm in tasksys.cpp. */ std::atomic task_idx_; 47 | }; 48 | 49 | /* 50 | * TaskSystemParallelThreadPoolSpinning: This class is the student's 51 | * implementation of a parallel task execution engine that uses a 52 | * thread pool. See definition of ITaskSystem in itasksys.h for 53 | * documentation of the ITaskSystem interface.
54 | */ 55 | class TaskSystemParallelThreadPoolSpinning: public ITaskSystem { 56 | public: 57 | TaskSystemParallelThreadPoolSpinning(int num_threads); 58 | ~TaskSystemParallelThreadPoolSpinning(); 59 | const char* name(); 60 | void run(IRunnable* runnable, int num_total_tasks); 61 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 62 | const std::vector& deps); 63 | void sync(); 64 | private: 65 | std::vector threads_; 66 | int num_total_tasks_; 67 | IRunnable *runnable_; 68 | std::queue task_index_; 69 | std::mutex lk_; 70 | bool stop_; 71 | std::atomic task_done_; 72 | }; 73 | 74 | /* 75 | * TaskSystemParallelThreadPoolSleeping: This class is the student's 76 | * optimized implementation of a parallel task execution engine that uses 77 | * a thread pool. See definition of ITaskSystem in 78 | * itasksys.h for documentation of the ITaskSystem interface. 79 | */ 80 | class TaskSystemParallelThreadPoolSleeping: public ITaskSystem { 81 | public: 82 | TaskSystemParallelThreadPoolSleeping(int num_threads); 83 | ~TaskSystemParallelThreadPoolSleeping(); 84 | const char* name(); 85 | void run(IRunnable* runnable, int num_total_tasks); 86 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 87 | const std::vector& deps); 88 | void sync(); 89 | private: 90 | 91 | struct TaskInfo { 92 | TaskID id; // ID of the task 93 | IRunnable* runnable; 94 | int num_total_task; // total number of tasks 95 | int num_done_task; // number of tasks already completed 96 | bool done() const { 97 | return num_total_task == num_done_task; 98 | } 99 | }; 100 | 101 | 102 | struct WorkInfo { 103 | TaskID id; // the task this work item belongs to 104 | IRunnable* runnable; 105 | int cur_index; 106 | int num_total_task; 107 | }; 108 | 109 | bool stop_; 110 | 111 | TaskID global_task_id_; 112 | std::mutex lk_; 113 | std::condition_variable cv_worker_; // condition variable the worker threads wait on 114 | std::condition_variable cv_main_; // condition variable the thread calling sync() waits on 115 | 116 | // total number of tasks that have been added but not yet completed, including those whose conditions are not yet met 117 | int num_all_undone_task; 118 | 119 | std::vector threads_; // all of the worker threads (original note continues):
所有的worker线程 120 | std::unordered_map> graph_; // maintains the current graph 121 | std::unordered_map in_degree_; // in-degree of each task 122 | std::queue tasks_; // all tasks that need to be executed 123 | std::unordered_map task_info_; // information about each task 124 | 125 | }; 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /asst2/part_b/.gitignore: -------------------------------------------------------------------------------- 1 | objs/ 2 | runtasks 3 | -------------------------------------------------------------------------------- /asst2/part_b/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ -m64 2 | CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -O3 -std=c++11 -Wall 3 | #CXXFLAGS=-I. -I../common -I../tests -Iobjs/ -g -std=c++11 -Wall 4 | 5 | APP_NAME=runtasks 6 | OBJDIR=objs 7 | COMMONDIR=../common 8 | 9 | PPM_CXX=$(COMMONDIR)/ppm.cpp 10 | PPM_OBJ=$(addprefix $(OBJDIR)/, $(subst $(COMMONDIR)/,, $(PPM_CXX:.cpp=.o))) 11 | 12 | default: $(APP_NAME) 13 | 14 | .PHONY: dirs clean 15 | 16 | dirs: 17 | /bin/mkdir -p $(OBJDIR)/ 18 | 19 | clean: 20 | /bin/rm -rf $(OBJDIR) *.ppm *~ $(APP_NAME) 21 | 22 | OBJS=$(PPM_OBJ) $(OBJDIR)/tasksys.o 23 | 24 | $(APP_NAME): clean dirs $(OBJS) 25 | $(CXX) ../tests/main.cpp $(CXXFLAGS) -o $@ $(OBJDIR)/tasksys.o -lm -lpthread 26 | 27 | $(OBJDIR)/%.o: $(COMMONDIR)/%.cpp 28 | $(CXX) $< $(CXXFLAGS) -c -o $@ 29 | 30 | $(OBJDIR)/%.o: %.cpp 31 | $(CXX) $< $(CXXFLAGS) -c -o $@ 32 | -------------------------------------------------------------------------------- /asst2/part_b/itasksys.h: -------------------------------------------------------------------------------- 1 | #ifndef _ITASKSYS_H 2 | #define _ITASKSYS_H 3 | #include 4 | 5 | typedef int TaskID; 6 | 7 | class IRunnable { 8 | public: 9 | virtual ~IRunnable(); 10 | 11 | /* 12 | Executes an instance of the task as part of a bulk task launch. 13 | 14 | - task_id: the current task identifier. This value will be 15 | between 0 and num_total_tasks-1.
16 | 17 | - num_total_tasks: the total number of tasks in the bulk 18 | task launch. 19 | */ 20 | virtual void runTask(int task_id, int num_total_tasks) = 0; 21 | }; 22 | 23 | class ITaskSystem { 24 | public: 25 | /* 26 | Instantiates a task system. 27 | 28 | - num_threads: the maximum number of threads that the task system 29 | can use. 30 | */ 31 | ITaskSystem(int num_threads); 32 | virtual ~ITaskSystem(); 33 | virtual const char *name() = 0; 34 | 35 | /* 36 | Executes a bulk task launch of num_total_tasks. Task 37 | execution is synchronous with the calling thread, so run() 38 | will return only when the execution of all tasks is 39 | complete. 40 | */ 41 | virtual void run(IRunnable *runnable, int num_total_tasks) = 0; 42 | 43 | /* 44 | Executes an asynchronous bulk task launch of 45 | num_total_tasks, but with a dependency on previously launched 46 | tasks. 47 | 48 | 49 | The task runtime must complete execution of the tasks 50 | associated with all bulk task launches referenced in the 51 | array `deps` before beginning execution of *any* task in 52 | this bulk task launch. 53 | 54 | The caller must invoke sync() to guarantee completion of the 55 | tasks in this bulk task launch. 56 | 57 | Returns an identifier that can be used in subsequent calls to 58 | runAsyncWithDeps() to specify a dependency of some future 59 | bulk task launch on this bulk task launch. 60 | */ 61 | virtual TaskID runAsyncWithDeps(IRunnable *runnable, int num_total_tasks, 62 | const std::vector &deps) = 0; 63 | 64 | /* 65 | Blocks until all tasks created as a result of **any prior** 66 | runXXX calls are done.
67 | */ 68 | virtual void sync() = 0; 69 | }; 70 | #endif 71 | -------------------------------------------------------------------------------- /asst2/part_b/runtasks_ref_linux: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_linux -------------------------------------------------------------------------------- /asst2/part_b/runtasks_ref_osx_arm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_osx_arm -------------------------------------------------------------------------------- /asst2/part_b/runtasks_ref_osx_x86: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst2/part_b/runtasks_ref_osx_x86 -------------------------------------------------------------------------------- /asst2/part_b/tasksys.h: -------------------------------------------------------------------------------- 1 | #ifndef _TASKSYS_H 2 | #define _TASKSYS_H 3 | 4 | #include "itasksys.h" 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /* 14 | * TaskSystemSerial: This class is the student's implementation of a 15 | * serial task execution engine. See definition of ITaskSystem in 16 | * itasksys.h for documentation of the ITaskSystem interface.
17 | */ 18 | class TaskSystemSerial: public ITaskSystem { 19 | public: 20 | TaskSystemSerial(int num_threads); 21 | ~TaskSystemSerial(); 22 | const char* name(); 23 | void run(IRunnable* runnable, int num_total_tasks); 24 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 25 | const std::vector& deps); 26 | void sync(); 27 | }; 28 | 29 | /* 30 | * TaskSystemParallelSpawn: This class is the student's implementation of a 31 | * parallel task execution engine that spawns threads in every run() 32 | * call. See definition of ITaskSystem in itasksys.h for documentation 33 | * of the ITaskSystem interface. 34 | */ 35 | class TaskSystemParallelSpawn: public ITaskSystem { 36 | public: 37 | TaskSystemParallelSpawn(int num_threads); 38 | ~TaskSystemParallelSpawn(); 39 | const char* name(); 40 | void run(IRunnable* runnable, int num_total_tasks); 41 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 42 | const std::vector& deps); 43 | void sync(); 44 | private: 45 | /* NOTE(review): presumably the number of threads spawned by each run() call, taken from the constructor argument -- confirm in tasksys.cpp. */ int num_thread_; 46 | /* NOTE(review): presumably the shared index of the next task to be claimed by a spawned thread -- confirm in tasksys.cpp. */ std::atomic task_idx_; 47 | }; 48 | 49 | /* 50 | * TaskSystemParallelThreadPoolSpinning: This class is the student's 51 | * implementation of a parallel task execution engine that uses a 52 | * thread pool. See definition of ITaskSystem in itasksys.h for 53 | * documentation of the ITaskSystem interface.
54 | */ 55 | class TaskSystemParallelThreadPoolSpinning: public ITaskSystem { 56 | public: 57 | TaskSystemParallelThreadPoolSpinning(int num_threads); 58 | ~TaskSystemParallelThreadPoolSpinning(); 59 | const char* name(); 60 | void run(IRunnable* runnable, int num_total_tasks); 61 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 62 | const std::vector& deps); 63 | void sync(); 64 | private: 65 | std::vector threads_; 66 | int num_total_tasks_; 67 | IRunnable *runnable_; 68 | std::queue task_index_; 69 | std::mutex lk_; 70 | bool stop_; 71 | std::atomic task_done_; 72 | }; 73 | 74 | /* 75 | * TaskSystemParallelThreadPoolSleeping: This class is the student's 76 | * optimized implementation of a parallel task execution engine that uses 77 | * a thread pool. See definition of ITaskSystem in 78 | * itasksys.h for documentation of the ITaskSystem interface. 79 | */ 80 | class TaskSystemParallelThreadPoolSleeping: public ITaskSystem { 81 | public: 82 | TaskSystemParallelThreadPoolSleeping(int num_threads); 83 | ~TaskSystemParallelThreadPoolSleeping(); 84 | const char* name(); 85 | void run(IRunnable* runnable, int num_total_tasks); 86 | TaskID runAsyncWithDeps(IRunnable* runnable, int num_total_tasks, 87 | const std::vector& deps); 88 | void sync(); 89 | private: 90 | 91 | struct TaskInfo { 92 | TaskID id; // ID of the task 93 | IRunnable* runnable; 94 | int num_total_task; // total number of tasks 95 | int num_done_work; // work items of this task already completed. NOTE(review): the constructor parameter `_num_donw_work` below looks like a misspelling of `_num_done_work`; it is only used in the initializer list. 96 | TaskInfo(TaskID _id, IRunnable* _runnable, int _num_total_task, int _num_donw_work): 97 | id(_id), runnable(_runnable), num_total_task(_num_total_task), num_done_work(_num_donw_work) {} 98 | // std::atomic num_done_work; // work items of this task already completed 99 | // TaskInfo(TaskID _id, IRunnable* _runnable, int _num_total_task): 100 | // id(_id), runnable(_runnable), num_total_task(_num_total_task){} 101 | }; 102 | 103 | 104 | struct WorkInfo { 105 | TaskID id; // the task this work item belongs to 106 | int cur_index; 107 | }; 108 | 109 | bool stop_; 110 | 111 | TaskID
global_task_id_; 112 | std::mutex lk_; 113 | std::condition_variable cv_worker_; // condition variable the worker threads wait on 114 | std::condition_variable cv_main_; // condition variable the thread calling sync() waits on 115 | 116 | // total number of tasks that have been added but not yet completed, including those whose conditions are not yet met 117 | int num_all_undone_task; 118 | 119 | std::vector threads_; // all of the worker threads 120 | std::vector> graph_; 121 | // std::unordered_map> graph_; // maintains the current graph 122 | std::vector in_degree_; 123 | // std::unordered_map in_degree_; // in-degree of each task 124 | std::queue tasks_; // all tasks that need to be executed 125 | std::vector task_info_; 126 | // std::unordered_map task_info_; // information about each task. NOTE(review): `N` below is presumably the capacity used to size the vectors above -- confirm in tasksys.cpp. 127 | static constexpr int N = 1024; 128 | }; 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /asst2/tutorial/Makefile: -------------------------------------------------------------------------------- 1 | CXX=g++ -m64 2 | CXXFLAGS=-O3 -std=c++11 -Wall 3 | 4 | APP_NAME=tutorial 5 | 6 | default: $(APP_NAME) 7 | 8 | .PHONY: dirs clean 9 | 10 | clean: 11 | /bin/rm -rf $(APP_NAME) 12 | 13 | $(APP_NAME): clean 14 | $(CXX) $(CXXFLAGS) -o $@ $@.cpp -lpthread 15 | -------------------------------------------------------------------------------- /asst2/tutorial/README.md: -------------------------------------------------------------------------------- 1 | 2 | # A Primer on C++ Synchronization # 3 | 4 | Your programming assignment 2 solutions will certainly need to create threads, and may need to make use of two types of synchronization primitives: mutexes and condition variables. The following notes explain these two types of synchronization. 5 | 6 | We provide you basic examples of creating C++ threads, locking/unlocking mutexes, and using condition variables in the file `tutorial/tutorial.cpp` provided in the starter code. 7 | 8 | ## Creating C++ Threads ## 9 | 10 | Creating new threads in C++ is simple. To create threads, an application constructs new instances of the `std::thread` object.
For example, in the code below, the main thread creates two threads that run the function `my_func`. (Observe that the function `my_func` is used as an argument to the `std::thread` constructor.) The main thread invokes `join()` to wait until the execution of a spawned thread has completed. 11 | 12 | #include <thread> 13 | #include <stdio.h> 14 | 15 | void my_func(int thread_id, int num_threads) { 16 | printf("Hello from spawned thread %d of %d\n", thread_id, num_threads); 17 | } 18 | 19 | int main(int argc, char** argv) { 20 | 21 | std::thread t0 = std::thread(my_func, 0, 2); 22 | std::thread t1 = std::thread(my_func, 1, 2); 23 | 24 | printf("The main thread is running concurrently with spawned threads.\n"); 25 | 26 | t0.join(); 27 | t1.join(); 28 | 29 | printf("Spawned threads have terminated at this point.\n"); 30 | 31 | return 0; 32 | } 33 | 34 | Full documentation of `std::thread` can be found here: <https://en.cppreference.com/w/cpp/thread/thread>. 35 | 36 | Useful tutorials on creating threads in C++11: 37 | 38 | * 39 | * 40 | 41 | ## Mutexes ## 42 | 43 | The C++ standard library provides a mutex synchronization primitive, `std::mutex`, for protecting shared data from simultaneous access by multiple application threads. (Note: mutex is short for "mutual exclusion"). 44 | 45 | 46 | 47 | You have already encountered mutexes in prior courses like CS110. A thread locks the mutex using `mutex::lock()`. The calling thread will block until the mutex lock can be acquired. When `lock()` returns to the caller, the calling thread is guaranteed to have the lock. A thread unlocks the mutex using `mutex::unlock()`. 48 | 49 | For those interested, C++ provides a number of wrapper classes that are designed to reduce bugs when using locks (e.g., forgetting to unlock a mutex). You may wish to look at the definitions of [`std::unique_lock`](https://en.cppreference.com/w/cpp/thread/unique_lock) and [`std::lock_guard`](https://en.cppreference.com/w/cpp/thread/lock_guard).
For example, `lock_guard` automatically locks a specified mutex on construction, and unlocks the mutex when it goes out of scope. 50 | 51 | We recommend that you take a look at the function `mutex_example()` in `tutorial/tutorial.cpp` for a simple example of using a mutex to protect updates to a shared counter. In this example, the mutex is used to ensure the read-modify-write to the counter is performed atomically. 52 | 53 | ## Condition Variables ## 54 | 55 | A condition variable manages a list of threads waiting for a condition to hold (e.g., an event to occur), and allows other threads to notify the waiting threads that the event of interest has occurred. A condition variable, when used in conjunction with a mutex, provides an easy way to send notifications between threads. 56 | 57 | There are two major operations on a condition variable: `wait()` and notify (`notify_one()` / `notify_all()`). 58 | 59 | A thread calls `wait(lock)` to indicate it wishes to wait for a notification from another thread. Notice that a mutex (wrapped in a `std::unique_lock`) is passed to the call to `wait()`. When the thread is notified, the condition variable will re-acquire the lock. This means that when a call to `wait()` returns, the calling thread is the current holder of the lock. Typically the lock is used to protect a shared variable that the thread now needs to check to ensure the condition it is waiting for is true. 60 | 61 | For example, the code in `tutorial/tutorial.cpp` creates N threads. N-1 of the threads wait for notification from thread 0, and then when notified, atomically increment a counter that is protected by a shared mutex. 62 | 63 | A thread calls `notify_one()` on a condition variable to notify exactly one thread waiting on the condition variable and `notify_all()` to notify all threads waiting on the condition variable. Notice how in `tutorial/tutorial.cpp`, thread 0 releases the lock protecting the counter prior to signaling all the waiting threads.
64 | 65 | In your task execution system implementation, how might you use `notify_all()`? Consider a situation where all worker threads are currently waiting for a new bulk task launch, and the application makes a call to `run()`, providing new tasks to execute. 66 | 67 | 68 | Additional references: 69 | 70 | * 71 | * 72 | 73 | ## C++ Atomics ## 74 | 75 | C++ also provides a simple way to make operations on a variable atomic---just create a variable of the type `std::atomic<T>`. For example, to create an integer that supports atomic increment, just create a variable of type: 76 | 77 | std::atomic<int> my_counter; 78 | 79 | Now operations on `my_counter`, like `my_counter++`, are guaranteed to be performed atomically. For more detail see: <https://en.cppreference.com/w/cpp/atomic/atomic>. 80 | -------------------------------------------------------------------------------- /asst2/tutorial/tutorial.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | /* 8 | Wrapper class around an integer counter and a mutex. 9 | */ 10 | class Counter { 11 | public: 12 | int counter_; 13 | std::mutex* mutex_; 14 | Counter() { 15 | counter_ = 0; 16 | mutex_ = new std::mutex(); 17 | } 18 | ~Counter() { 19 | delete mutex_; 20 | } 21 | }; 22 | 23 | void increment_counter_fn(Counter* counter) { 24 | for (int i = 0; i < 10000; i++) { 25 | // Call lock() method to acquire lock. 26 | counter->mutex_->lock(); 27 | // Since multiple threads are trying to perform an increment, the 28 | // increment needs to be protected by a mutex. 29 | counter->counter_++; 30 | // Call unlock() method to release lock. 31 | counter->mutex_->unlock(); 32 | } 33 | } 34 | 35 | /* 36 | * Threads increment a shared counter in a tight for loop 10,000 times.
37 | */ 38 | void mutex_example() { 39 | int num_threads = 8; 40 | 41 | printf("==============================================================\n"); 42 | printf("Starting %d threads to increment counter...\n", num_threads); 43 | std::thread* threads = new std::thread[num_threads]; 44 | Counter* counter = new Counter(); 45 | // `num_threads` threads will call `increment_counter_fn`, trying to 46 | // increment `counter`. 47 | for (int i = 0; i < num_threads; i++) { 48 | threads[i] = std::thread(increment_counter_fn, counter); 49 | } 50 | // Wait for spawned threads to complete. 51 | for (int i = 0; i < num_threads; i++) { 52 | threads[i].join(); 53 | } 54 | // Print the final counter value; it is expected to equal (10000 * `num_threads`). 55 | printf("Final counter value: %d...\n", counter->counter_); 56 | printf("==============================================================\n"); 57 | 58 | delete counter; 59 | delete[] threads; 60 | } 61 | 62 | /* 63 | * Wrapper class around a counter, a condition variable, and a mutex. `counter_` is incremented by each waiting thread after it wakes up, and `num_waiting_threads_` is the value the signaling thread compares it against. 64 | */ 65 | class ThreadState { 66 | public: 67 | std::condition_variable* condition_variable_; 68 | std::mutex* mutex_; 69 | int counter_; 70 | int num_waiting_threads_; 71 | ThreadState(int num_waiting_threads) { 72 | condition_variable_ = new std::condition_variable(); 73 | mutex_ = new std::mutex(); 74 | counter_ = 0; 75 | num_waiting_threads_ = num_waiting_threads; 76 | } 77 | ~ThreadState() { 78 | delete condition_variable_; 79 | delete mutex_; 80 | } 81 | }; 82 | 83 | void signal_fn(ThreadState* thread_state) { 84 | // Acquire mutex to make sure the shared counter is read in a 85 | // consistent state. 86 | thread_state->mutex_->lock(); 87 | while (thread_state->counter_ < thread_state->num_waiting_threads_) { 88 | thread_state->mutex_->unlock(); 89 | // Release the mutex before calling `notify_all()` to make sure 90 | // waiting threads have a chance to make progress.
91 | thread_state->condition_variable_->notify_all(); 92 | // Re-acquire the mutex to read the shared counter again. 93 | thread_state->mutex_->lock(); 94 | } 95 | thread_state->mutex_->unlock(); 96 | } 97 | 98 | void wait_fn(ThreadState* thread_state) { 99 | // A lock must be held in order to wait on a condition variable. 100 | // This lock is atomically released before the thread goes to sleep 101 | // when `wait()` is called. The lock is atomically re-acquired when 102 | // the thread is woken up using `notify_all()`. NOTE(review): `wait()` is called without a predicate, so any wakeup (including a spurious one) proceeds to the increment below. 103 | std::unique_lock lk(*thread_state->mutex_); 104 | thread_state->condition_variable_->wait(lk); 105 | // Increment the shared counter with the lock re-acquired to inform the 106 | // signaling thread that this waiting thread has successfully been 107 | // woken up. 108 | thread_state->counter_++; 109 | printf("Lock re-acquired after wait()...\n"); 110 | lk.unlock(); 111 | } 112 | 113 | /* 114 | * Signaling thread spins until each waiting thread increments a shared 115 | * counter after being woken up from the `wait()` method.
116 | */ 117 | void condition_variable_example() { 118 | int num_threads = 3; 119 | 120 | printf("==============================================================\n"); 121 | printf("Starting %d threads for signal-and-waiting...\n", num_threads); 122 | std::thread* threads = new std::thread[num_threads]; 123 | /* The state is told to expect num_threads-1 waiters; the remaining thread is the signaler. */ ThreadState* thread_state = new ThreadState(num_threads-1); 124 | /* Thread 0 runs signal_fn; threads 1..num_threads-1 run wait_fn. */ threads[0] = std::thread(signal_fn, thread_state); 125 | for (int i = 1; i < num_threads; i++) { 126 | threads[i] = std::thread(wait_fn, thread_state); 127 | } 128 | /* Wait for the signaler and all waiters to finish. */ for (int i = 0; i < num_threads; i++) { 129 | threads[i].join(); 130 | } 131 | printf("==============================================================\n"); 132 | 133 | delete thread_state; 134 | delete[] threads; 135 | } 136 | 137 | /* Runs the mutex example followed by the condition-variable example. */ 138 | int main(int argc, char** argv) { 139 | mutex_example(); 140 | condition_variable_example(); 141 | } 142 | -------------------------------------------------------------------------------- /asst3/cloud_readme.md: -------------------------------------------------------------------------------- 1 | # AWS Setup Instructions # 2 | 3 | For performance testing, you will need to run it on a VM instance on Amazon Web Services (AWS). We've already sent you student coupons that you can use for billing purposes. Here are the steps for how to get setup for running on AWS. 4 | 5 | NOTE: __Please don't forget to SHUT DOWN your instances when you're done for the day to avoid burning through credits overnight!__ 6 | 7 | ### Creating a VM with a GPU ### 8 | 9 | 1. Now you're ready to create a VM instance. Click on the button that says `Launch Instances`. Choose the `Ubuntu Server 20.04 LTS (HVM), SSD Volume Type` AMI: 10 | ![AMI Selection](handout/choose_ami.png?raw=true) 11 | 12 | 2. Choose the `g4dn.xlarge` GPU Instance Type and then click `4. Add Storage` on the top bar: 13 | ![GPU instance](handout/choose_instance.png?raw=true) 14 | 15 | 3.
Change the size of the `Root` volume to 64 GB to accommodate the packages we will need to install to make the instance functional for the assignment: 16 | ![Storage](handout/choose_storage.png?raw=true) 17 | 18 | 5. AWS will ask you to select a key pair. Click the first dropdown and choose `Create a new key pair` and give it whatever name you'd like. This will download a keyfile to your computer called `<key_name>.pem` which you will use to log in to the VM instance you are about to create. Finally, click `Launch Instances`. 19 | ![Key Pair](handout/new_key_pair.png?raw=true) 20 | 21 | __Note: `g4dn.xlarge` instances cost $0.526 / hour, so leaving one running for a whole day will consume $12.624 worth of your AWS coupon.__ 22 | 23 | 4. Now that you've created your VM, you should be able to __SSH__ into it. You need the public IPv4 DNS name to SSH into it, which you can find on the instance page by clicking the `View Instances` button on the current page and then the instance ID for your created instance (note, it may take a moment for the instance to start up and be assigned an IP address): 24 | ![Public DNS Name](handout/public_dns.png?raw=true) 25 | Once you have the IP address, you can log in to the instance by running this command: 26 | ~~~~ 27 | ssh -i path/to/key_name.pem ubuntu@<public_dns_name> 28 | ~~~~ 29 | 30 | ### Setting up the VM environment ### 31 | 32 | We have included a convenience script, __install.sh__, which performs steps 5, 6 and 7 for you. To run it, do: 33 | 34 | ~~~~ 35 | chmod +x install.sh 36 | sudo ./install.sh 37 | source ~/.bashrc 38 | ~~~~ 39 | 40 | If for some reason the script does not work, the manual instructions follow: 41 | 42 | ### Manually setting up the VM environment ### 43 | 44 | 5. Once you SSH into your VM instance, you'll want to install whatever software you need to make the machine a useful development environment for you.
For example, we recommend: 45 | ~~~~ 46 | sudo apt update 47 | sudo apt install make g++ freeglut3-dev # Required 48 | sudo apt install vim 49 | ~~~~ 50 | 51 | ### Installing CUDA ### 52 | 53 | 6. Now you need to download the CUDA 11 runtime from NVIDIA. SSH into your AWS instance and run the following: 54 | 55 | ~~~~ 56 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin 57 | sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 58 | sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub 59 | sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" 60 | sudo apt-get update 61 | sudo apt-get -y install cuda 62 | ~~~~ 63 | 64 | 7. `nvcc` is the NVIDIA CUDA compiler. The default install locates CUDA binaries in `/usr/local/cuda/bin/`, so you'll want to add this directory to your path. For example, from a bash shell that would be: 65 | 66 | ~~~~ 67 | export PATH=$PATH:/usr/local/cuda/bin 68 | ~~~~ 69 | 70 | In general we recommend that you perform this `$PATH` update on login, so you can add this line to the end of your `.bashrc` file. Don't forget to `source .bashrc` if you want to have this modification take effect without logging out and back in to the instance. 71 | 72 | ### Confirming that CUDA has been installed ### 73 | 74 | Suppose you have carried out steps 5-7 or run the __install.sh__ script. At this point CUDA should be installed and you should be able to run the `nvidia-smi` command to make sure everything is set up correctly. The result of the command should indicate that your VM has one NVIDIA Tesla T4 GPU.
75 | 76 | ~~~~ 77 | ubuntu@ip-172-31-20-116:~/asst3$ nvidia-smi 78 | Fri Oct 22 18:08:14 2021 79 | +-----------------------------------------------------------------------------+ 80 | | NVIDIA-SMI 495.29.05 Driver Version: 495.29.05 CUDA Version: 11.5 | 81 | |-------------------------------+----------------------+----------------------+ 82 | | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | 83 | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | 84 | | | | MIG M. | 85 | |===============================+======================+======================| 86 | | 0 Tesla T4 Off | 00000000:00:1E.0 Off | 0 | 87 | | N/A 50C P0 27W / 70W | 0MiB / 15109MiB | 0% Default | 88 | | | | N/A | 89 | +-------------------------------+----------------------+----------------------+ 90 | 91 | +-----------------------------------------------------------------------------+ 92 | | Processes: | 93 | | GPU GI CI PID Type Process name GPU Memory | 94 | | ID ID Usage | 95 | |=============================================================================| 96 | | No running processes found | 97 | +-----------------------------------------------------------------------------+ 98 | ~~~~ 99 | 100 | If you're confused about any of the steps, having problems with setting up your account or have any additional questions, reach out to us on Piazza! 101 | 102 | __Again, please don't forget to STOP your instances when you're done with your work for the day!__ 103 | 104 | ### AWS Setup Troubleshooting 105 | 1. If you received an error message stating that you are not able to launch additional resources in this region, AWS will validate your request. The validation process should take around 20 minutes. If that is not the case, please email AWS at aws-verification@amazon.com. 106 | ![Unavailable Region](handout/location_limit.png?raw=true) 107 | 108 | 2.
If you received an error message stating that you have requested more vCPU capacity than your current limit, please check your quota. 109 | ![Quota Navigation Bar](handout/vCPU_trouble.png?raw=true) 110 | ![Quota Dashboard](handout/vCPU_dashboard.png?raw=true) 111 | ![Quota Dashboard Search](handout/vCPU_dashboard_2.png?raw=true) 112 | If your Applied quota value is less than 4, please submit a request for a quota increase and put 4 as your requested number of vCPUs. 113 | ![Quota Request](handout/quota_request.png?raw=true) 114 | -------------------------------------------------------------------------------- /asst3/handout/bug_example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/bug_example.jpg -------------------------------------------------------------------------------- /asst3/handout/choose_ami.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_ami.png -------------------------------------------------------------------------------- /asst3/handout/choose_instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_instance.png -------------------------------------------------------------------------------- /asst3/handout/choose_storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/choose_storage.png -------------------------------------------------------------------------------- /asst3/handout/dependencies.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/dependencies.jpg -------------------------------------------------------------------------------- /asst3/handout/gpu_instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/gpu_instance.png -------------------------------------------------------------------------------- /asst3/handout/gpu_instance.png_original: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/gpu_instance.png_original -------------------------------------------------------------------------------- /asst3/handout/ip_address.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/ip_address.png -------------------------------------------------------------------------------- /asst3/handout/location_limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/location_limit.png -------------------------------------------------------------------------------- /asst3/handout/navigation_quota.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/navigation_quota.png -------------------------------------------------------------------------------- /asst3/handout/new_key_pair.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/new_key_pair.png -------------------------------------------------------------------------------- /asst3/handout/order.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/order.jpg -------------------------------------------------------------------------------- /asst3/handout/point_in_circle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/point_in_circle.jpg -------------------------------------------------------------------------------- /asst3/handout/public_dns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/public_dns.png -------------------------------------------------------------------------------- /asst3/handout/quota_request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/quota_request.png -------------------------------------------------------------------------------- /asst3/handout/teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/teaser.jpg -------------------------------------------------------------------------------- /asst3/handout/vCPU_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_dashboard.png 
-------------------------------------------------------------------------------- /asst3/handout/vCPU_dashboard_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_dashboard_2.png -------------------------------------------------------------------------------- /asst3/handout/vCPU_trouble.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/handout/vCPU_trouble.png -------------------------------------------------------------------------------- /asst3/install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A helper script to install CUDA for Ubuntu 20.04 4 | 5 | set -e 6 | 7 | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin 8 | mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 9 | wget https://developer.download.nvidia.com/compute/cuda/11.5.0/local_installers/cuda-repo-ubuntu2004-11-5-local_11.5.0-495.29.05-1_amd64.deb 10 | dpkg -i cuda-repo-ubuntu2004-11-5-local_11.5.0-495.29.05-1_amd64.deb 11 | apt-key add /var/cuda-repo-ubuntu2004-11-5-local/7fa2af80.pub 12 | 13 | apt-get update 14 | apt-get install -y g++ make cuda freeglut3-dev 15 | 16 | # Update PATH to include bin directory containing nvcc 17 | echo "export PATH=\$PATH:/usr/local/cuda/bin" >> /home/ubuntu/.bashrc 18 | -------------------------------------------------------------------------------- /asst3/render/Makefile: -------------------------------------------------------------------------------- 1 | 2 | EXECUTABLE := render 3 | 4 | CU_FILES := cudaRenderer.cu 5 | 6 | CU_DEPS := 7 | 8 | CC_FILES := main.cpp display.cpp benchmark.cpp refRenderer.cpp \ 9 | noise.cpp ppm.cpp sceneLoader.cpp 10 | 11 | LOGS := 
logs 12 | 13 | ########################################################### 14 | 15 | ARCH=$(shell uname | sed -e 's/-.*//g') 16 | OBJDIR=objs 17 | CXX=g++ -m64 18 | CXXFLAGS=-O3 -Wall -g -std=c++11 19 | HOSTNAME=$(shell hostname) 20 | 21 | LIBS := 22 | FRAMEWORKS := 23 | 24 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61 -std=c++11 25 | LIBS += GL glut cudart 26 | 27 | ifneq ($(wildcard /opt/cuda-8.0/.*),) 28 | # Latedays 29 | LDFLAGS=-L/opt/cuda-8.0/lib64/ -lcudart 30 | else 31 | # GHC 32 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart 33 | endif 34 | 35 | LDLIBS := $(addprefix -l, $(LIBS)) 36 | LDFRAMEWORKS := $(addprefix -framework , $(FRAMEWORKS)) 37 | 38 | NVCC=nvcc 39 | 40 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/display.o $(OBJDIR)/benchmark.o $(OBJDIR)/refRenderer.o \ 41 | $(OBJDIR)/cudaRenderer.o $(OBJDIR)/noise.o $(OBJDIR)/ppm.o $(OBJDIR)/sceneLoader.o 42 | 43 | 44 | .PHONY: dirs clean 45 | 46 | default: $(EXECUTABLE) 47 | 48 | dirs: 49 | mkdir -p $(OBJDIR)/ 50 | 51 | clean: 52 | rm -rf $(OBJDIR) *~ $(EXECUTABLE) $(LOGS) 53 | 54 | check: default 55 | ./checker.pl 56 | 57 | $(EXECUTABLE): dirs $(OBJS) 58 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) $(LDFRAMEWORKS) 59 | 60 | $(OBJDIR)/%.o: %.cpp 61 | $(CXX) $< $(CXXFLAGS) -c -o $@ 62 | 63 | $(OBJDIR)/%.o: %.cu 64 | $(NVCC) $< $(NVCCFLAGS) -c -o $@ 65 | -------------------------------------------------------------------------------- /asst3/render/checker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | my @scene_names = ("rgb", "rgby", "rand10k", "rand100k", "biglittle", "littlebig", "pattern", "bouncingballs", "hypnosis", "fireworks", "snow", "snowsingle"); 6 | my @score_scene_names = ("rgb", "rand10k", "rand100k", "pattern", "snowsingle", "biglittle"); 7 | 8 | my %fast_times; 9 | 10 | my $perf_points = 10; 11 | my $correctness_points = 2; 12 | 13 | my %correct; 14 | 15 | my %your_times; 16 | 17 | `mkdir -p logs`; 18 | 
`rm -rf logs/*`; 19 | 20 | print "\n"; 21 | print ("--------------\n"); 22 | my $hostname = `hostname`; 23 | print ("Running tests on $hostname\n"); 24 | print ("--------------\n"); 25 | 26 | my $render_ref = "render_ref"; 27 | 28 | foreach my $scene (@scene_names) { 29 | print ("\nScene : $scene\n"); 30 | my @sys_stdout = system ("./render -c $scene -s 1024 > ./logs/correctness_${scene}.log"); 31 | my $return_value = $?; 32 | if ($return_value == 0) { 33 | print ("Correctness passed!\n"); 34 | $correct{$scene} = 1; 35 | } 36 | else { 37 | print ("Correctness failed ... Check ./logs/correctness_${scene}.log\n"); 38 | $correct{$scene} = 0; 39 | } 40 | 41 | if (${scene} ~~ @score_scene_names) { 42 | my $your_time = `./render -r cuda -b 0:4 $scene -s 1024 | tee ./logs/time_${scene}.log | grep Total:`; 43 | chomp($your_time); 44 | $your_time =~ s/^[^0-9]*//; 45 | $your_time =~ s/ ms.*//; 46 | 47 | print ("Your time : $your_time\n"); 48 | $your_times{$scene} = $your_time; 49 | 50 | my $fast_time = `./$render_ref -r cuda -b 0:4 $scene -s 1024 | tee ./logs/time_${scene}.log | grep Total:`; 51 | chomp($fast_time); 52 | $fast_time =~ s/^[^0-9]*//; 53 | $fast_time =~ s/ ms.*//; 54 | 55 | print ("Ref Time: $fast_time\n"); 56 | $fast_times{$scene} = $fast_time; 57 | } 58 | } 59 | 60 | print "\n"; 61 | print ("------------\n"); 62 | print ("Score table:\n"); 63 | print ("------------\n"); 64 | 65 | my $header = sprintf ("| %-15s | %-16s | %-15s | %-15s |\n", "Scene Name", "Ref Time (T_ref)", "Your Time (T)", "Score"); 66 | my $dashes = $header; 67 | $dashes =~ s/./-/g; 68 | print $dashes; 69 | print $header; 70 | print $dashes; 71 | 72 | my $total_score = 0; 73 | 74 | foreach my $scene (@score_scene_names){ 75 | my $score; 76 | my $your_time = $your_times{$scene}; 77 | my $fast_time = $fast_times{$scene}; 78 | 79 | if ($correct{$scene}) { 80 | if ($your_time <= 1.20 * $fast_time) { 81 | $score = $perf_points + $correctness_points; 82 | } 83 | elsif ($your_time > 10 * 
$fast_time) { 84 | $score = $correctness_points; 85 | } 86 | else { 87 | $score = $correctness_points + ceil ($perf_points * ($fast_time /$your_time)); 88 | } 89 | } 90 | else { 91 | $your_time .= " (F)"; 92 | $score = 0; 93 | } 94 | 95 | printf ("| %-15s | %-16s | %-15s | %-15s |\n", "$scene", "$fast_time", "$your_time", "$score"); 96 | $total_score += $score; 97 | } 98 | print $dashes; 99 | printf ("| %-15s %-16s | %-15s | %-15s |\n", "", "", "Total score:", 100 | $total_score . "/" . ($perf_points+$correctness_points) * ($#score_scene_names + 1)); 101 | print $dashes; 102 | -------------------------------------------------------------------------------- /asst3/render/checker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import subprocess 4 | import os 5 | import shutil 6 | import re 7 | import math 8 | 9 | perf_pts = 10 10 | correctness_pts = 2 11 | 12 | # scene_names = ["rand100k"] 13 | # score_scene_names = {"rand100k"} 14 | scene_names = ["rgb", "rgby", "rand10k", "rand100k", "biglittle", "littlebig", "pattern", "bouncingballs", "hypnosis", "fireworks", "snow", "snowsingle"] 15 | score_scene_names_list = ["rgb", "rand10k", "rand100k", "pattern", "snowsingle", "biglittle"] 16 | score_scene_names = set(score_scene_names_list) 17 | 18 | #### LOGS MANAGEMENT #### 19 | # Set up a new logs dir (remove old logs dir, create new logs dir) 20 | if os.path.isdir('logs'): 21 | shutil.rmtree('logs') 22 | os.mkdir('logs') 23 | 24 | # Helper functions to convert scene names to appropriate log file names 25 | def correctness_log_file(scene): 26 | return "./logs/correctness_%s.log" % scene 27 | 28 | def time_log_file(scene): 29 | return "./logs/time_%s.log" % scene 30 | #### END OF LOGS MANAGEMENT #### 31 | 32 | #### RUNNING THE RENDERERS #### 33 | def check_correctness(render_cmd, scene): 34 | cmd_string = "./%s -c %s -s 1024 > %s" % (render_cmd, scene, correctness_log_file(scene)) 35 | # print("Checking 
# Run a renderer one time and get the time taken
def get_time(render_cmd, scene):
    """Run one CUDA benchmark pass of a renderer and return the reported time.

    Executes ``./<render_cmd> -r cuda -b 0:4 <scene> -s 1024`` through the
    shell, copies its output into the scene's timing log with ``tee`` and
    keeps only the output lines that contain ``Total:``.

    Args:
        render_cmd: Name of the renderer executable in the current directory.
        scene: Scene name handed to the renderer.

    Returns:
        The first decimal number found in the ``Total:`` output, as a float.

    Raises:
        RuntimeError: If the filtered output contains no decimal number, e.g.
            because the renderer could not be started or died before
            printing its timing summary.
    """
    # NOTE(review): time_log_file() depends only on the scene name, so timing
    # a second renderer on the same scene overwrites the first one's log.
    cmd_string = "./%s -r cuda -b 0:4 %s -s 1024 | tee %s | grep Total:" % (
        render_cmd, scene, time_log_file(scene))

    # The command is a pipeline, so it has to go through the shell.
    result = subprocess.run(cmd_string, shell=True, capture_output=True)

    # Decode the captured bytes instead of parsing their repr().
    output = result.stdout.decode(errors="replace")
    match = re.search(r'\d+\.\d+', output)
    if match is None:
        # Without this check a failed run surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise RuntimeError(
            "no 'Total:' time reported by ./%s for scene %s (see %s)"
            % (render_cmd, scene, time_log_file(scene)))

    return float(match.group())
# Compute scores
def score_table(correct, stu_times, ref_times):
    """Print the per-scene score table followed by the total score.

    For every scene in ``score_scene_names_list`` the smallest student time
    (T) is compared with the smallest reference time (T_ref):

      * correctness failed -> 0 points, and "(F)" is shown instead of T
      * T <= 1.2 * T_ref   -> perf_pts + correctness_pts
      * T > 10 * T_ref     -> correctness_pts
      * otherwise          -> correctness_pts + ceil(perf_pts * T_ref / T)

    Args:
        correct: Mapping scene -> truthy when the correctness run passed.
        stu_times: Mapping scene -> non-empty sequence of student times.
        ref_times: Mapping scene -> non-empty sequence of reference times.

    Returns:
        None. Everything is written to standard output.
    """
    row_format = "| %-15s | %-16s | %-15s | %-15s |"
    header = row_format % ("Scene Name", "Ref Time (T_ref)", "Your Time (T)", "Score")
    dashes = "-" * len(header)

    print("------------")
    print("Score table:")
    print("------------")
    print(dashes)
    print(header)
    print(dashes)

    total_score = 0

    for scene in score_scene_names_list:
        # Best run of each renderer is the one that counts.
        stu_time = min(stu_times[scene])
        ref_time = min(ref_times[scene])

        if not correct[scene]:
            score = 0
        elif stu_time <= 1.2 * ref_time:
            score = perf_pts + correctness_pts
        elif stu_time > 10 * ref_time:
            score = correctness_pts
        else:
            # Here stu_time > 1.2 * ref_time, so the ratio is below 1.
            score = correctness_pts + math.ceil(perf_pts * (ref_time / stu_time))

        shown_time = stu_time if correct[scene] else "(F)"
        print(row_format % (scene, ref_time, shown_time, score))
        total_score += score

    print(dashes)

    max_total_score = (perf_pts + correctness_pts) * len(score_scene_names)
    score_string = "%s/%s" % (total_score, max_total_score)
    # The first two columns are merged in this row: three spaces take the
    # place of the " | " separator so the row is as wide as the others.
    print("| %-15s   %-16s | %-15s | %-15s |" % ("", "", "Total score:", score_string))

    print(dashes)
// circleInBoxConservative --
//
// Cheap rejection test for a circle against an axis-aligned box.
//
// The circle has center (circleX, circleY) and radius `circleRadius`;
// the box is given by its left/right x coordinates and its top/bottom
// y coordinates. The comparisons below treat boxB as the smaller and
// boxT as the larger y value.
//
// Returns 0 only when the circle cannot touch the box. A result of 1
// means "maybe": the center lies inside the box grown by the radius,
// which also accepts circles sitting just outside a corner. Follow up
// with circleInBox() or per-pixel tests when an exact answer is needed.
__device__ __inline__ int
circleInBoxConservative(
    float circleX, float circleY, float circleRadius,
    float boxL, float boxR, float boxT, float boxB)
{
    // Grow the box outward by the radius on every side.
    const float minX = boxL - circleRadius;
    const float maxX = boxR + circleRadius;
    const float minY = boxB - circleRadius;
    const float maxY = boxT + circleRadius;

    // The center must fall inside the grown box on both axes.
    const bool withinX = (circleX >= minX) && (circleX <= maxX);
    const bool withinY = (circleY >= minY) && (circleY <= maxY);

    return (withinX && withinY) ? 1 : 0;
}
#ifndef __CIRCLE_RENDERER_H__
#define __CIRCLE_RENDERER_H__

// Only pointers to Image appear in this header, so a forward
// declaration is enough.
struct Image;

// fireworks constants
#define NUM_FIREWORKS 15
#define NUM_SPARKS 20

// Identifiers of the scenes a renderer can be asked to load.
// The numeric values follow declaration order, starting at zero.
typedef enum {
    CIRCLE_RGB = 0,
    CIRCLE_RGBY = 1,
    CIRCLE_TEST_10K = 2,
    CIRCLE_TEST_100K = 3,
    PATTERN = 4,
    SNOWFLAKES = 5,
    FIREWORKS = 6,
    HYPNOSIS = 7,
    BOUNCING_BALLS = 8,
    SNOWFLAKES_SINGLE_FRAME = 9,
    BIG_LITTLE = 10,
    LITTLE_BIG = 11
} SceneName;


// Abstract interface of a circle renderer. Every operation is pure
// virtual; concrete renderers supply all of them.
class CircleRenderer {

public:

    // Virtual so that a renderer can be destroyed through a
    // CircleRenderer pointer.
    virtual ~CircleRenderer() { }

    // Access to the renderer's output image.
    virtual const Image* getImage() = 0;

    // Renderer initialisation hook.
    virtual void setup() = 0;

    // Select the scene identified by `name`.
    virtual void loadScene(SceneName name) = 0;

    // Create the output image with the given dimensions.
    virtual void allocOutputImage(int width, int height) = 0;

    // Reset the output image.
    virtual void clearImage() = 0;

    // Move the scene forward by one animation step.
    virtual void advanceAnimation() = 0;

    // Draw the current scene into the output image.
    virtual void render() = 0;

    //virtual void dumpParticles(const char* filename) {}

};


#endif
float* cudaDevicePosition; 25 | float* cudaDeviceVelocity; 26 | float* cudaDeviceColor; 27 | float* cudaDeviceRadius; 28 | float* cudaDeviceImageData; 29 | 30 | public: 31 | 32 | CudaRenderer(); 33 | virtual ~CudaRenderer(); 34 | 35 | const Image* getImage(); 36 | 37 | void setup(); 38 | 39 | void loadScene(SceneName name); 40 | 41 | void allocOutputImage(int width, int height); 42 | 43 | void clearImage(); 44 | 45 | void advanceAnimation(); 46 | 47 | void render(); 48 | 49 | void shadePixel( 50 | int circleIndex, 51 | float pixelCenterX, float pixelCenterY, 52 | float px, float py, float pz, 53 | float* pixelData); 54 | }; 55 | 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /asst3/render/cycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 
// Static-only timing helper built on a platform tick source.
//
// currentTicks() reads the raw counter, secondsPerTick() supplies the
// factor that turns a tick count into seconds, and the remaining
// functions are derived from those two. Which tick source is used is
// decided at compile time by the preprocessor branches below.
class CycleTimer {
public:
    typedef unsigned long long SysClock;

    //////////
    // Return the current CPU time, in terms of clock ticks.
    // Time zero is at some arbitrary point in the past.
    static SysClock currentTicks() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return mach_absolute_time();
#elif defined(_WIN32)
        LARGE_INTEGER qwTime;
        QueryPerformanceCounter(&qwTime);
        return qwTime.QuadPart;
#elif defined(__x86_64__)
        // rdtsc returns the 64-bit counter split over two 32-bit
        // registers; widen before shifting so the high half is kept.
        unsigned int a, d;
        asm volatile("rdtsc" : "=a" (a), "=d" (d));
        return static_cast<SysClock>(a) |
               (static_cast<SysClock>(d) << 32);
#elif defined(__ARM_NEON__) && 0 // mrc requires superuser.
        unsigned int val;
        asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val));
        return val;
#else
        // Nanoseconds of thread CPU time, computed in integer
        // arithmetic so that no precision is lost on the way.
        timespec spec;
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec);
        return static_cast<SysClock>(spec.tv_sec) * 1000000000ULL +
               static_cast<SysClock>(spec.tv_nsec);
#endif
    }

    //////////
    // Return the current CPU time, in terms of seconds.
    // This is slower than currentTicks().  Time zero is at
    // some arbitrary point in the past.
    static double currentSeconds() {
        return currentTicks() * secondsPerTick();
    }

    //////////
    // Return the conversion from seconds to ticks.
    static double ticksPerSecond() {
        return 1.0/secondsPerTick();
    }

    //////////
    // Return a short name for the unit currentTicks() counts in.
    static const char* tickUnits() {
#if defined(__APPLE__) && !defined(__x86_64__)
        return "ns";
#elif defined(_WIN32) || defined(__WIN32__) || defined(__x86_64__)
        // _WIN32 is the macro every other branch of this class tests;
        // __WIN32__ is kept for toolchains that only define that one.
        return "cycles";
#else
        return "ns"; // clock_gettime
#endif
    }

    //////////
    // Return the conversion from ticks to seconds.
    //
    // The value is computed on the first call and cached in a
    // function-local static; later calls return the cached value.
    // On failure to query the platform (sysctl error, /proc/cpuinfo
    // cannot be opened) a message is printed and the process exits.
    static double secondsPerTick() {
        static bool initialized = false;
        static double secondsPerTick_val;
        if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
        int args[] = {CTL_HW, HW_CPU_FREQ};
        unsigned int Hz;
        size_t len = sizeof(Hz);
        if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
            fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
            exit(-1);
        }
        secondsPerTick_val = 1.0 / static_cast<double>(Hz);
#else
        mach_timebase_info_data_t time_info;
        mach_timebase_info(&time_info);

        // numer/denom converts ticks to nanoseconds; 1e-9 takes that
        // to seconds.
        secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
            static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
        LARGE_INTEGER qwTicksPerSec;
        QueryPerformanceFrequency(&qwTicksPerSec);
        secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#else
        FILE *fp = fopen("/proc/cpuinfo","r");
        char input[1024];
        if (!fp) {
            fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
            exit(-1);
        }
        // In case we don't find it, e.g. on the N900
        secondsPerTick_val = 1e-9;
        // fgets() returns NULL at end of file, so no feof() test is
        // needed; stop at the first line that yields a frequency.
        while (fgets(input, sizeof(input), fp)) {
            if (parseCpuInfoLine(input, &secondsPerTick_val))
                break;
        }
        fclose(fp);
#endif

        initialized = true;
        return secondsPerTick_val;
    }

    //////////
    // Return the conversion from ticks to milliseconds.
    static double msPerTick() {
        return secondsPerTick() * 1000.0;
    }

private:
    // Declared but never defined: the class is not meant to be
    // instantiated.
    CycleTimer();

#if !defined(__APPLE__) && !defined(_WIN32)
    // Try to read a clock frequency from one line of /proc/cpuinfo.
    //
    // `line` is modified in place (the unit suffix is cut off before
    // the number is parsed). On success the seconds-per-tick value is
    // stored in *secondsPerTickOut and true is returned; otherwise
    // *secondsPerTickOut is left untouched and false is returned.
    static bool parseCpuInfoLine(char* line, double* secondsPerTickOut) {
        if (strstr(line, "model name")) {
            // NOTE(boulos): Because reading cpuinfo depends on dynamic
            // frequency scaling it's better to read the @ sign first
            char* at_sign = strstr(line, "@");
            if (!at_sign)
                return false;

            char* after_at = at_sign + 1;
            char* GHz_str = strstr(after_at, "GHz");
            char* MHz_str = strstr(after_at, "MHz");
            if (GHz_str) {
                float GHz;
                *GHz_str = '\0';
                if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.0f) {
                    *secondsPerTickOut = 1e-9 / GHz;
                    return true;
                }
            } else if (MHz_str) {
                float MHz;
                *MHz_str = '\0';
                if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.0f) {
                    // Divide by the value that was just parsed: MHz.
                    *secondsPerTickOut = 1e-6 / MHz;
                    return true;
                }
            }
            return false;
        }

        float MHz;
        if (1 == sscanf(line, "cpu MHz : %f", &MHz) && MHz > 0.0f) {
            *secondsPerTickOut = 1e-6 / MHz;
            return true;
        }
        return false;
    }
#endif
};
An improved solution 65 | // would render to a CUDA surface object (stored in GPU memory), 66 | // and then bind this surface as a texture enabling it's use in 67 | // normal openGL rendering 68 | glRasterPos2i(0, 0); 69 | glDrawPixels(width, height, GL_RGBA, GL_FLOAT, img->data); 70 | 71 | double currentTime = CycleTimer::currentSeconds(); 72 | 73 | if (gDisplay.printStats) 74 | printf("%.2f ms\n", 1000.f * (currentTime - gDisplay.lastFrameTime)); 75 | 76 | gDisplay.lastFrameTime = currentTime; 77 | 78 | glutSwapBuffers(); 79 | glutPostRedisplay(); 80 | } 81 | 82 | 83 | // handleKeyPress -- 84 | // 85 | // Keyboard event handler 86 | void 87 | handleKeyPress(unsigned char key, int x, int y) { 88 | 89 | switch (key) { 90 | case 'q': 91 | case 'Q': 92 | exit(1); 93 | break; 94 | case '=': 95 | case '+': 96 | gDisplay.updateSim = true; 97 | break; 98 | case 'p': 99 | case 'P': 100 | gDisplay.pauseSim = !gDisplay.pauseSim; 101 | if (!gDisplay.pauseSim) 102 | gDisplay.updateSim = true; 103 | break; 104 | } 105 | } 106 | 107 | // renderPicture -- 108 | // 109 | // At the reall work is done here, not in the display handler 110 | void 111 | renderPicture() { 112 | 113 | double startTime = CycleTimer::currentSeconds(); 114 | 115 | // clear screen 116 | gDisplay.renderer->clearImage(); 117 | 118 | double endClearTime = CycleTimer::currentSeconds(); 119 | 120 | // update particle positions and state 121 | if (gDisplay.updateSim) { 122 | gDisplay.renderer->advanceAnimation(); 123 | } 124 | if (gDisplay.pauseSim) 125 | gDisplay.updateSim = false; 126 | 127 | double endSimTime = CycleTimer::currentSeconds(); 128 | 129 | // render the particles< into the image 130 | gDisplay.renderer->render(); 131 | 132 | double endRenderTime = CycleTimer::currentSeconds(); 133 | 134 | if (gDisplay.printStats) { 135 | printf("Clear: %.3f ms\n", 1000.f * (endClearTime - startTime)); 136 | printf("Advance: %.3f ms\n", 1000.f * (endSimTime - endClearTime)); 137 | printf("Render: %.3f ms\n", 1000.f 
* (endRenderTime - endSimTime)); 138 | } 139 | } 140 | 141 | void 142 | startRendererWithDisplay(CircleRenderer* renderer) { 143 | 144 | // setup the display 145 | 146 | const Image* img = renderer->getImage(); 147 | 148 | gDisplay.renderer = renderer; 149 | gDisplay.updateSim = true; 150 | gDisplay.pauseSim = false; 151 | gDisplay.printStats = true; 152 | gDisplay.lastFrameTime = CycleTimer::currentSeconds(); 153 | gDisplay.width = img->width; 154 | gDisplay.height = img->height; 155 | 156 | // configure GLUT 157 | 158 | glutInitWindowSize(gDisplay.width, gDisplay.height); 159 | glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE); 160 | glutCreateWindow("CMU 15-418 Assignment 2 - Circle Renderer"); 161 | glutDisplayFunc(handleDisplay); 162 | glutKeyboardFunc(handleKeyPress); 163 | glutMainLoop(); 164 | } 165 | -------------------------------------------------------------------------------- /asst3/render/exclusiveScan.cu_inl: -------------------------------------------------------------------------------- 1 | 2 | // exclusiveScan.cu_inl 3 | 4 | // This is a shared-memory implementation of exclusive scan. Note that the 5 | // exclusive scan you implemented in Part 1 uses slower *global* memory, and has 6 | // overhead from performing multiple kernel launches. 7 | // Because it uses shared memory, it must be run within a single thread block. 8 | 9 | 10 | // REQUIREMENTS: 11 | // - Input array must have power-of-two length. 12 | // - Number of threads in the thread block must be the size of the array! 13 | // - SCAN_BLOCK_DIM is both the number of threads in the block (must be power of 2) 14 | // and the number of elements that will be scanned. 15 | // You should define this in your cudaRenderer.cu file, 16 | // based on your implementation. 17 | // - The parameter sScratch should be a pointer to an array with 2*SCAN_BLOCK_DIM elements 18 | // - The 3 arrays should be in shared memory. 
// ================= USAGE (in cudaRenderer.cu) =====================

// at the top of the file:

// #define SCAN_BLOCK_DIM   BLOCKSIZE  // needed by sharedMemExclusiveScan implementation
// #include "exclusiveScan.cu_inl"

// ...

// in a kernel:

// If you're using 2D indices, compute a linear thread index as follows.
// NOTE: scan assumes that every 32 adjacent linear thread indices
// (0-31, 32-63, ...) form a warp, which means they execute in lockstep.

// If you do linearThreadIndex = threadIdx.x * blockDim.x + threadIdx.y;
// you will get a linear thread index, but it won't be sorted into warps,
// which will break scan!

// int linearThreadIndex =  threadIdx.y * blockDim.x + threadIdx.x;

// __shared__ uint prefixSumInput[BLOCKSIZE];
// __shared__ uint prefixSumOutput[BLOCKSIZE];
// __shared__ uint prefixSumScratch[2 * BLOCKSIZE];
// sharedMemExclusiveScan(linearThreadIndex, prefixSumInput, prefixSumOutput, prefixSumScratch, BLOCKSIZE);


// A warp is taken to be 32 threads with consecutive linear indices.
#define LOG2_WARP_SIZE 5U
#define WARP_SIZE (1U << LOG2_WARP_SIZE)

// warpScanInclusive --
//
// Inclusive prefix sum over groups of `size` consecutive threads
// (size must be a power of two and at most WARP_SIZE). Each thread
// contributes `idata` and gets back the sum of the inputs of its
// group up to and including its own.
//
// s_Data is scratch storage: every group of `size` threads uses a
// segment of 2*size entries, so it must hold two entries per thread.
//
// There is no barrier inside the loop. The code relies on the threads
// of a group executing each statement together, as the usage notes
// for this file state.
// NOTE(review): GPUs with independent thread scheduling do not
// guarantee that implicit lockstep -- confirm the target architecture
// or add explicit warp synchronisation before porting.
inline __device__ uint
warpScanInclusive(int threadIndex, uint idata, volatile uint *s_Data, uint size){
    // Note some of the calculations are obscure because they are optimized.
    // For example, (threadIndex & (size - 1)) computes threadIndex % size,
    // which works, assuming size is a power of 2.

    // With threadIndex = q*size + r this is 2*q*size + r: slot r in the
    // lower half of group q's segment. That half is filled with zeros...
    uint pos = 2 * threadIndex - (threadIndex & (size - 1));
    s_Data[pos] = 0;
    // ...and the upper half receives the inputs.
    pos += size;
    s_Data[pos] = idata;

    // Doubling-offset accumulation. offset < size, so pos - offset never
    // leaves the segment: reads that step past the first input land in
    // the zero padding and add nothing.
    for(uint offset = 1; offset < size; offset <<= 1)
        s_Data[pos] += s_Data[pos - offset];

    return s_Data[pos];
}

// warpScanExclusive --
//
// Exclusive variant: the inclusive result minus the thread's own input.
inline __device__ uint warpScanExclusive(int threadIndex, uint idata, volatile uint *sScratch, uint size){
    return warpScanInclusive(threadIndex, idata, sScratch, size) - idata;
}

// sharedMemExclusiveScan --
//
// Exclusive prefix sum of sInput[0..size) into sOutput[0..size), one
// element per thread, to be called by the threads of one thread block.
//
//   threadIndex : linear index of the calling thread within the block
//   sInput      : input values, read at [threadIndex]
//   sOutput     : result, written at [threadIndex]
//   sScratch    : working storage with two entries per thread
//   size        : number of elements; a power of two
//
// The size > WARP_SIZE path contains __syncthreads(), so in that case
// every thread of the block has to make the call. That path also uses
// SCAN_BLOCK_DIM to decide how many threads combine the per-warp
// totals while the scan width is derived from `size`; the two agree
// when size == SCAN_BLOCK_DIM, as the requirements of this file ask.
__inline__ __device__ void
sharedMemExclusiveScan(int threadIndex, uint* sInput, uint* sOutput, volatile uint* sScratch, uint size)
{
    if (size > WARP_SIZE) {

        uint idata = sInput[threadIndex];

        // Step 1: inclusive scan inside each group of WARP_SIZE threads.
        uint warpResult = warpScanInclusive(threadIndex, idata, sScratch, WARP_SIZE);

        // All step-1 scans must be finished before the start of
        // sScratch is reused for the per-warp totals below.
        __syncthreads();

        // Step 2: the last thread of each group publishes the group's
        // total at sScratch[group index].
        if ( (threadIndex & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
            sScratch[threadIndex >> LOG2_WARP_SIZE] = warpResult;

        // make the totals visible to the threads that scan them
        __syncthreads();

        // Step 3: the first SCAN_BLOCK_DIM / WARP_SIZE threads turn the
        // totals into exclusive offsets, one per group.
        if ( threadIndex < (SCAN_BLOCK_DIM / WARP_SIZE)) {
            // read this thread's total before the scan reuses the slot
            uint val = sScratch[threadIndex];
            // exclusive scan of the totals, written back in place
            sScratch[threadIndex] = warpScanExclusive(threadIndex, val, sScratch, size >> LOG2_WARP_SIZE);
        }

        // offsets must be complete before every thread reads its own
        __syncthreads();

        // Step 4: inclusive-in-group result + offset of the group
        // - own input = exclusive scan over the whole array.
        sOutput[threadIndex] = warpResult + sScratch[threadIndex >> LOG2_WARP_SIZE] - idata;

    } else if (threadIndex < WARP_SIZE) {
        // Small input: a single group-level exclusive scan is enough.
        uint idata = sInput[threadIndex];
        sOutput[threadIndex] = warpScanExclusive(threadIndex, idata, sScratch, size);
    }
}
-------------------------------------------------------------------------------- 1 | #ifndef __IMAGE_H__ 2 | #define __IMAGE_H__ 3 | 4 | 5 | struct Image { 6 | 7 | Image(int w, int h) { 8 | width = w; 9 | height = h; 10 | data = new float[4 * width * height]; 11 | } 12 | 13 | void clear(float r, float g, float b, float a) { 14 | 15 | int numPixels = width * height; 16 | float* ptr = data; 17 | for (int i=0; i 2 | 百度一下,你就知道

关于百度 About Baidu

©2017 Baidu 使用百度前必读  意见反馈 京ICP证030173号 

3 | -------------------------------------------------------------------------------- /asst3/render/lookupColor.cu_inl: -------------------------------------------------------------------------------- 1 | 2 | 3 | __device__ __inline__ float3 4 | lookupColor(float coord) { 5 | 6 | float scaledCoord = coord * (COLOR_MAP_SIZE-1); 7 | 8 | // using short type rather than int type since 16-bit integer math 9 | // is faster than 32-bit integrer math on NVIDIA GPUs 10 | short maxValue = COLOR_MAP_SIZE-1; 11 | short intCoord = static_cast(scaledCoord); 12 | short base = (intCoord < maxValue) ? intCoord : maxValue; // min 13 | 14 | // linearly interpolate between values in the table based on the 15 | // value of coord 16 | float weight = scaledCoord - static_cast(base); 17 | float oneMinusWeight = 1.f - weight; 18 | 19 | float r = (oneMinusWeight * cuConstColorRamp[base][0]) + (weight * cuConstColorRamp[base+1][0]); 20 | float g = (oneMinusWeight * cuConstColorRamp[base][1]) + (weight * cuConstColorRamp[base+1][1]); 21 | float b = (oneMinusWeight * cuConstColorRamp[base][2]) + (weight * cuConstColorRamp[base+1][2]); 22 | return make_float3(r, g, b); 23 | } 24 | -------------------------------------------------------------------------------- /asst3/render/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "refRenderer.h" 7 | #include "cudaRenderer.h" 8 | #include "platformgl.h" 9 | 10 | #define DEFAULT_IMAGE_SIZE 1024 11 | 12 | 13 | void startRendererWithDisplay(CircleRenderer* renderer); 14 | void startBenchmark(CircleRenderer* renderer, int startFrame, int totalFrames, const std::string& frameFilename); 15 | void CheckBenchmark(CircleRenderer* ref_renderer, CircleRenderer* cuda_renderer, 16 | int benchmarkFrameStart, int totalFrames, const std::string& frameFilename); 17 | 18 | 19 | void usage(const char* progname) { 20 | printf("Usage: %s [options] scenename\n", 
progname); 21 | printf("Valid scenenames are: rgb, rgby, rand10k, rand100k, biglittle, littlebig, pattern,\n" 22 | " bouncingballs, fireworks, hypnosis, snow, snowsingle\n"); 23 | printf("Program Options:\n"); 24 | printf(" -r --renderer Select renderer: ref or cuda (default=cuda)\n"); 25 | printf(" -s --size Rendered image size: x pixels (default=%d)\n", DEFAULT_IMAGE_SIZE); 26 | printf(" -b --bench Run for frames [START,END) (default=[0,1))\n"); 27 | printf(" -c --check Check correctness of CUDA output against CPU reference\n"); 28 | printf(" -i --interactive Render output to interactive display\n"); 29 | printf(" -f --file Output file name (FILENAME_xxxx.ppm) (default=output)\n"); 30 | printf(" -? --help This message\n"); 31 | } 32 | 33 | 34 | int main(int argc, char** argv) 35 | { 36 | 37 | int benchmarkFrameStart = 0; 38 | int benchmarkFrameEnd = 1; 39 | int imageSize = DEFAULT_IMAGE_SIZE; 40 | 41 | std::string sceneNameStr; 42 | std::string frameFilename("output"); 43 | SceneName sceneName; 44 | bool useRefRenderer = false; 45 | bool checkCorrectness = false; 46 | bool interactiveMode = false; 47 | 48 | // parse commandline options //////////////////////////////////////////// 49 | int opt; 50 | static struct option long_options[] = { 51 | {"help", 0, 0, '?'}, 52 | {"check", 0, 0, 'c'}, 53 | {"bench", 1, 0, 'b'}, 54 | {"interactive", 0, 0, 'i'}, 55 | {"file", 1, 0, 'f'}, 56 | {"renderer", 1, 0, 'r'}, 57 | {"size", 1, 0, 's'}, 58 | {0 ,0, 0, 0} 59 | }; 60 | 61 | while ((opt = getopt_long(argc, argv, "b:f:r:s:ci?", long_options, NULL)) != EOF) { 62 | 63 | switch (opt) { 64 | case 'b': 65 | if (sscanf(optarg, "%d:%d", &benchmarkFrameStart, &benchmarkFrameEnd) != 2) { 66 | fprintf(stderr, "Invalid argument to -b option\n"); 67 | usage(argv[0]); 68 | exit(1); 69 | } 70 | break; 71 | case 'i': 72 | interactiveMode = true; 73 | break; 74 | case 'c': 75 | checkCorrectness = true; 76 | break; 77 | case 'f': 78 | frameFilename = optarg; 79 | break; 80 | case 'r': 81 | 
if (std::string(optarg).compare("cuda") == 0) { 82 | useRefRenderer = false; 83 | } else if (std::string(optarg).compare("cpuref") == 0) { 84 | useRefRenderer = true; 85 | } else { 86 | fprintf(stderr, "ERROR: Unknown renderer type: %s\n", optarg); 87 | usage(argv[0]); 88 | return 1; 89 | } 90 | break; 91 | case 's': 92 | imageSize = atoi(optarg); 93 | break; 94 | case '?': 95 | default: 96 | usage(argv[0]); 97 | return 1; 98 | } 99 | } 100 | // end parsing of commandline options ////////////////////////////////////// 101 | 102 | 103 | if (optind + 1 > argc) { 104 | fprintf(stderr, "Error: missing scene name\n"); 105 | usage(argv[0]); 106 | return 1; 107 | } 108 | 109 | sceneNameStr = argv[optind]; 110 | 111 | if (sceneNameStr.compare("snow") == 0) { 112 | sceneName = SNOWFLAKES; 113 | } else if (sceneNameStr.compare("snowsingle") == 0) { 114 | sceneName = SNOWFLAKES_SINGLE_FRAME; 115 | } else if (sceneNameStr.compare("rgb") == 0) { 116 | sceneName = CIRCLE_RGB; 117 | } else if (sceneNameStr.compare("rgby") == 0) { 118 | sceneName = CIRCLE_RGBY; 119 | } else if (sceneNameStr.compare("rand10k") == 0) { 120 | sceneName = CIRCLE_TEST_10K; 121 | } else if (sceneNameStr.compare("rand100k") == 0) { 122 | sceneName = CIRCLE_TEST_100K; 123 | } else if (sceneNameStr.compare("pattern") == 0) { 124 | sceneName = PATTERN; 125 | } else if (sceneNameStr.compare("biglittle") == 0) { 126 | sceneName = BIG_LITTLE; 127 | } else if (sceneNameStr.compare("littlebig") == 0) { 128 | sceneName = LITTLE_BIG; 129 | } else if (sceneNameStr.compare("bouncingballs") == 0) { 130 | sceneName = BOUNCING_BALLS; 131 | } else if (sceneNameStr.compare("hypnosis") == 0) { 132 | sceneName = HYPNOSIS; 133 | } else if (sceneNameStr.compare("fireworks") == 0) { 134 | sceneName = FIREWORKS; 135 | }else { 136 | fprintf(stderr, "Unknown scene name (%s)\n", sceneNameStr.c_str()); 137 | usage(argv[0]); 138 | return 1; 139 | } 140 | 141 | printf("Rendering to %dx%d image\n", imageSize, imageSize); 142 | 143 | 
CircleRenderer* renderer; 144 | 145 | if (checkCorrectness) { 146 | // Need both the renderers 147 | 148 | CircleRenderer* ref_renderer; 149 | CircleRenderer* cuda_renderer; 150 | 151 | ref_renderer = new RefRenderer(); 152 | cuda_renderer = new CudaRenderer(); 153 | 154 | ref_renderer->allocOutputImage(imageSize, imageSize); 155 | ref_renderer->loadScene(sceneName); 156 | ref_renderer->setup(); 157 | cuda_renderer->allocOutputImage(imageSize, imageSize); 158 | cuda_renderer->loadScene(sceneName); 159 | cuda_renderer->setup(); 160 | 161 | // Check the correctness 162 | CheckBenchmark(ref_renderer, cuda_renderer, 0, 1, frameFilename); 163 | } 164 | else { 165 | 166 | if (useRefRenderer) 167 | renderer = new RefRenderer(); 168 | else 169 | renderer = new CudaRenderer(); 170 | 171 | renderer->allocOutputImage(imageSize, imageSize); 172 | renderer->loadScene(sceneName); 173 | renderer->setup(); 174 | 175 | if (!interactiveMode) 176 | startBenchmark(renderer, benchmarkFrameStart, benchmarkFrameEnd - benchmarkFrameStart, frameFilename); 177 | else { 178 | glutInit(&argc, argv); 179 | startRendererWithDisplay(renderer); 180 | } 181 | } 182 | 183 | return 0; 184 | } 185 | -------------------------------------------------------------------------------- /asst3/render/noise.h: -------------------------------------------------------------------------------- 1 | #ifndef __NOISE_H__ 2 | #define __NOISE_H__ 3 | 4 | 5 | void vec2CellNoise(float location[3], float result[2], int index); 6 | 7 | void getNoiseTables(int** permX, int** permY, float** value1D); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /asst3/render/noiseCuda.cu_inl: -------------------------------------------------------------------------------- 1 | 2 | // included by fastRenderer.cu 3 | 4 | __device__ __inline__ float2 5 | cudaVec2CellNoise(float3 location, int index) 6 | { 7 | int integer_of_x = static_cast( location.x ); 8 | int integer_of_y = static_cast( 
location.y ); 9 | int integer_of_z = static_cast( location.z ); 10 | int hash = cuConstNoiseXPermutationTable[ (integer_of_x*index) & 0xFF ]; 11 | hash = cuConstNoiseXPermutationTable[ ( hash + integer_of_y ) & 0xFF ]; 12 | hash = cuConstNoiseXPermutationTable[ ( hash + integer_of_z ) & 0xFF ]; 13 | float x_result = cuConstNoise1DValueTable[ hash ]; 14 | hash = cuConstNoiseYPermutationTable[ integer_of_x & 0xFF ]; 15 | hash = cuConstNoiseYPermutationTable[ ( hash + integer_of_y ) & 0xFF ]; 16 | hash = cuConstNoiseYPermutationTable[ ( hash + integer_of_z ) & 0xFF ]; 17 | float y_result = cuConstNoise1DValueTable[ hash ]; 18 | 19 | return make_float2(x_result, y_result); 20 | } 21 | -------------------------------------------------------------------------------- /asst3/render/platformgl.h: -------------------------------------------------------------------------------- 1 | #ifndef __PLATFORM_GL_H__ 2 | #define __PLATFORM_GL_H__ 3 | 4 | #ifdef __APPLE__ 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | #endif 11 | 12 | -------------------------------------------------------------------------------- /asst3/render/ppm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "image.h" 7 | #include "util.h" 8 | 9 | 10 | 11 | // writePPMImage -- 12 | // 13 | // assumes input pixels are float4 14 | // write 3-channel (8 bit --> 24 bits per pixel) ppm 15 | void 16 | writePPMImage(const Image* image, const char *filename) 17 | { 18 | FILE *fp = fopen(filename, "wb"); 19 | 20 | if (!fp) { 21 | fprintf(stderr, "Error: could not open %s for write\n", filename); 22 | exit(1); 23 | } 24 | 25 | // write ppm header 26 | fprintf(fp, "P6\n"); 27 | fprintf(fp, "%d %d\n", image->width, image->height); 28 | fprintf(fp, "255\n"); 29 | 30 | for (int j=image->height-1; j>=0; j--) { 31 | for (int i=0; iwidth; i++) { 32 | 33 | const float* ptr = &image->data[4 * (j*image->width + i)]; 34 | 
35 | char val[3]; 36 | val[0] = static_cast(255.f * CLAMP(ptr[0], 0.f, 1.f)); 37 | val[1] = static_cast(255.f * CLAMP(ptr[1], 0.f, 1.f)); 38 | val[2] = static_cast(255.f * CLAMP(ptr[2], 0.f, 1.f)); 39 | 40 | fputc(val[0], fp); 41 | fputc(val[1], fp); 42 | fputc(val[2], fp); 43 | } 44 | } 45 | 46 | fclose(fp); 47 | printf("Wrote image file %s\n", filename); 48 | } 49 | -------------------------------------------------------------------------------- /asst3/render/ppm.h: -------------------------------------------------------------------------------- 1 | #ifndef __PPM_H__ 2 | #define __PPM_H__ 3 | 4 | struct Image; 5 | 6 | void writePPMImage(const Image* image, const char *filename); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /asst3/render/refRenderer.h: -------------------------------------------------------------------------------- 1 | #ifndef __REF_RENDERER_H__ 2 | #define __REF_RENDERER_H__ 3 | 4 | #include "circleRenderer.h" 5 | 6 | 7 | class RefRenderer : public CircleRenderer { 8 | 9 | private: 10 | 11 | Image* image; 12 | SceneName sceneName; 13 | 14 | int numCircles; 15 | float* position; 16 | float* velocity; 17 | float* color; 18 | float* radius; 19 | 20 | public: 21 | 22 | RefRenderer(); 23 | virtual ~RefRenderer(); 24 | 25 | const Image* getImage(); 26 | 27 | void setup(); 28 | 29 | void loadScene(SceneName name); 30 | 31 | void allocOutputImage(int width, int height); 32 | 33 | void clearImage(); 34 | 35 | void advanceAnimation(); 36 | 37 | void render(); 38 | 39 | void dumpParticles(const char* filename); 40 | 41 | void shadePixel( 42 | int circleIndex, 43 | float pixelCenterX, float pixelCenterY, 44 | float px, float py, float pz, 45 | float* pixelData); 46 | }; 47 | 48 | 49 | #endif 50 | -------------------------------------------------------------------------------- /asst3/render/refTimings.txt: -------------------------------------------------------------------------------- 1 | Performance of 
reference implementation on all scenes: 2 | (All timings are in millseconds) 3 | 4 | Tests run using benchmark mode flag --bench 0:4 5 | (Reported times are per-frame time for just the call to render()) 6 | 7 | image size: 512x512 image size: 1024x1024 8 | ref cuda (speedup) ref cuda (speedup) 9 | -------------------------------------------------------------------------- 10 | rgb 1.94 0.13 (14.9x) 8.02 0.49 (16.4x) 11 | rgby 1.05 0.12 (8.8x) 4.31 0.46 (9.4x) 12 | pattern 4.32 0.49 (8.8x) 18.86 1.76 (10.7x) 13 | rand10k 208.40 5.86 (35.6x) 882.75 21.26 (41.5x) 14 | rand100k 2084.03 60.47 (41.3x) 8860.17 217.72 (40.7x) 15 | snowsingle 255.55 29.72 (8.6x 1006.35 113.96 (8.8x) 16 | 17 | -------------------------------------------------------------------------------- /asst3/render/render_ref: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/render/render_ref -------------------------------------------------------------------------------- /asst3/render/sceneLoader.h: -------------------------------------------------------------------------------- 1 | #ifndef __SCENE_LOADER_H__ 2 | #define __SCENE_LOADER_H__ 3 | 4 | #include "circleRenderer.h" 5 | 6 | void 7 | loadCircleScene( 8 | SceneName sceneName, 9 | int& numCircles, 10 | float*& position, 11 | float*& velocity, 12 | float*& color, 13 | float*& radius); 14 | 15 | #endif 16 | -------------------------------------------------------------------------------- /asst3/render/util.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef __UTIL_H__ 3 | #define __UTIL_H__ 4 | 5 | #include 6 | 7 | #define CLAMP(x, minimum, maximum) std::max(minimum, std::min(x, maximum)) 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /asst3/saxpy/CycleTimer.h: 
-------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 
57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 
static double secondsPerTick() {
  // Returns how many seconds one tick unit represents. The factor is
  // computed on the first call and cached in function-local statics; every
  // later call returns the cached value.
  // NOTE: the cache is a plain static bool/double pair with no
  // synchronization.
  static bool initialized = false;
  static double secondsPerTick_val;
  if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
  // Ask the kernel for the CPU frequency in Hz.
  int args[] = {CTL_HW, HW_CPU_FREQ};
  unsigned int Hz;
  size_t len = sizeof(Hz);
  if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
    fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
    exit(-1);
  }
  secondsPerTick_val = 1.0 / static_cast<double>(Hz);
#else
  mach_timebase_info_data_t time_info;
  mach_timebase_info(&time_info);

  // numer/denom scales ticks to nanoseconds; the 1e-9 factor turns that
  // into seconds.
  secondsPerTick_val = (1e-9 * static_cast<double>(time_info.numer)) /
                       static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
  LARGE_INTEGER qwTicksPerSec;
  QueryPerformanceFrequency(&qwTicksPerSec);
  secondsPerTick_val = 1.0 / static_cast<double>(qwTicksPerSec.QuadPart);
#else
  FILE *fp = fopen("/proc/cpuinfo","r");
  char input[1024];
  if (!fp) {
    fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
    exit(-1);
  }
  // In case we don't find it, e.g. on the N900
  secondsPerTick_val = 1e-9;
  // fgets() returns NULL on both EOF and error, so no separate feof() test
  // is needed.
  while (fgets(input, sizeof(input), fp)) {
    // NOTE(boulos): Because reading cpuinfo depends on dynamic
    // frequency scaling it's better to read the @ sign first
    float GHz = 0.f;
    float MHz = 0.f;
    if (strstr(input, "model name")) {
      char* at_sign = strchr(input, '@');
      if (at_sign) {
        char* after_at = at_sign + 1;
        char* GHz_str = strstr(after_at, "GHz");
        char* MHz_str = strstr(after_at, "MHz");
        if (GHz_str) {
          // Cut the line at the unit so sscanf only sees the number.
          *GHz_str = '\0';
          // A frequency of zero would produce an infinite scale; ignore it
          // and keep scanning.
          if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.f) {
            secondsPerTick_val = 1e-9 / GHz;
            break;
          }
        } else if (MHz_str) {
          *MHz_str = '\0';
          if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.f) {
            // Divide by the value that was just parsed (MHz). The previous
            // code divided by GHz, which is never assigned on this path.
            secondsPerTick_val = 1e-6 / MHz;
            break;
          }
        }
      }
    } else if (1 == sscanf(input, "cpu MHz : %f", &MHz) && MHz > 0.f) {
      secondsPerTick_val = 1e-6 / MHz;
      break;
    }
  }
  fclose(fp);
#endif

  initialized = true;
  return secondsPerTick_val;
}

//////////
// Return the conversion from ticks to milliseconds.
169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /asst3/saxpy/Makefile: -------------------------------------------------------------------------------- 1 | 2 | EXECUTABLE := cudaSaxpy 3 | 4 | CU_FILES := saxpy.cu 5 | 6 | CU_DEPS := 7 | 8 | CC_FILES := main.cpp 9 | 10 | ########################################################### 11 | 12 | ARCH=$(shell uname | sed -e 's/-.*//g') 13 | 14 | OBJDIR=objs 15 | CXX=g++ -m64 16 | CXXFLAGS=-O3 -Wall 17 | ifeq ($(ARCH), Darwin) 18 | # Building on mac 19 | LDFLAGS=-L/usr/local/depot/cuda/lib/ -lcudart 20 | else 21 | # Building on Linux 22 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart 23 | endif 24 | NVCC=nvcc 25 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61 26 | 27 | 28 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/saxpy.o 29 | 30 | 31 | .PHONY: dirs clean 32 | 33 | default: $(EXECUTABLE) 34 | 35 | dirs: 36 | mkdir -p $(OBJDIR)/ 37 | 38 | clean: 39 | rm -rf $(OBJDIR) *.ppm *~ $(EXECUTABLE) 40 | 41 | $(EXECUTABLE): dirs $(OBJS) 42 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS) 43 | 44 | $(OBJDIR)/%.o: %.cpp 45 | $(CXX) $< $(CXXFLAGS) -c -o $@ 46 | 47 | $(OBJDIR)/%.o: %.cu 48 | $(NVCC) $< $(NVCCFLAGS) -c -o $@ 49 | -------------------------------------------------------------------------------- /asst3/saxpy/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | void saxpyCuda(int N, float alpha, float* x, float* y, float* result); 8 | void printCudaInfo(); 9 | 10 | 11 | void usage(const char* progname) { 12 | printf("Usage: %s [options]\n", progname); 13 | printf("Program Options:\n"); 14 | printf(" -n --arraysize Number of elements in arrays\n"); 15 | printf(" -? 
--help This message\n"); 16 | } 17 | 18 | 19 | bool check(int N, float alpha, float* x, float* y, float* result) { 20 | 21 | for (int i = 0; i < N; ++i) { 22 | if (abs(alpha * x[i] + y[i] - result[i]) > 1e-5) { 23 | return false; 24 | } 25 | } 26 | return true; 27 | 28 | } 29 | 30 | 31 | int main(int argc, char** argv) 32 | { 33 | 34 | // default: arrays of 100M numbers 35 | int N = 100 * 1000 * 1000; 36 | 37 | // parse commandline options //////////////////////////////////////////// 38 | int opt; 39 | static struct option long_options[] = { 40 | {"arraysize", 1, 0, 'n'}, 41 | {"help", 0, 0, '?'}, 42 | {0 ,0, 0, 0} 43 | }; 44 | 45 | while ((opt = getopt_long(argc, argv, "?n:", long_options, NULL)) != EOF) { 46 | 47 | switch (opt) { 48 | case 'n': 49 | N = atoi(optarg); 50 | break; 51 | case '?': 52 | default: 53 | usage(argv[0]); 54 | return 1; 55 | } 56 | } 57 | // end parsing of commandline options ////////////////////////////////////// 58 | 59 | const float alpha = 2.0f; 60 | float* xarray = new float[N]; 61 | float* yarray = new float[N]; 62 | float* resultarray = new float[N]; 63 | 64 | for (int i=0; i 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "CycleTimer.h" 8 | 9 | 10 | // return GB/sec 11 | float GBPerSec(int bytes, float sec) { 12 | return static_cast(bytes) / (1024. * 1024. * 1024.) / sec; 13 | } 14 | 15 | 16 | // This is the CUDA "kernel" function that is run on the GPU. You 17 | // know this because it is marked as a __global__ function. 18 | __global__ void 19 | saxpy_kernel(int N, float alpha, float* x, float* y, float* result) { 20 | 21 | // compute overall thread index from position of thread in current 22 | // block, and given the block we are in (in this example only a 1D 23 | // calculation is needed so the code only looks at the .x terms of 24 | // blockDim and threadIdx. 
25 | int index = blockIdx.x * blockDim.x + threadIdx.x; 26 | 27 | 28 | // this check is necessary to make the code work for values of N 29 | // that are not a multiple of the thread block size (blockDim.x) 30 | if (index < N) 31 | result[index] = alpha * x[index] + y[index]; 32 | } 33 | 34 | 35 | // saxpyCuda -- 36 | // 37 | // This function is regular C code running on the CPU. It allocates 38 | // memory on the GPU using CUDA API functions, uses CUDA API functions 39 | // to transfer data from the CPU's memory address space to GPU memory 40 | // address space, and launches the CUDA kernel function on the GPU. 41 | void saxpyCuda(int N, float alpha, float* xarray, float* yarray, float* resultarray) { 42 | 43 | // must read both input arrays (xarray and yarray) and write to 44 | // output array (resultarray) 45 | int totalBytes = sizeof(float) * 3 * N; 46 | 47 | // compute number of blocks and threads per block. In this 48 | // application we've hardcoded thread blocks to contain 512 CUDA 49 | // threads. 50 | const int threadsPerBlock = 512; 51 | 52 | // Notice the round up here. The code needs to compute the number 53 | // of threads blocks needed such that there is one thread per 54 | // element of the arrays. This code is written to work for values 55 | // of N that are not multiples of threadPerBlock. 56 | const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock; 57 | 58 | // These are pointers that will be pointers to memory allocated 59 | // *one the GPU*. You should allocate these pointers via 60 | // cudaMalloc. You can access the resulting buffers from CUDA 61 | // device kernel code (see the kernel function saxpy_kernel() 62 | // above) but you cannot access the contents these buffers from 63 | // this thread. CPU threads cannot issue loads and stores from GPU 64 | // memory! 
65 | float* device_x = NULL; 66 | float* device_y = NULL; 67 | float* device_result = NULL; 68 | 69 | // 70 | // CS149 TODO: allocate device memory buffers on the GPU using cudaMalloc. 71 | // 72 | // We highly recommend taking a look at NVIDIA's 73 | // tutorial, which clearly walks you through the few lines of code 74 | // you need to write for this part of the assignment: 75 | // 76 | // https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/ 77 | // 78 | 79 | cudaMalloc(&device_x, N * sizeof(float)); 80 | cudaMalloc(&device_y, N * sizeof(float)); 81 | cudaMalloc(&device_result, N * sizeof(float)); 82 | 83 | // start timing after allocation of device memory 84 | double startTime = CycleTimer::currentSeconds(); 85 | 86 | // 87 | // CS149 TODO: copy input arrays to the GPU using cudaMemcpy 88 | // 89 | 90 | cudaMemcpy(device_x, xarray, N * sizeof(float), cudaMemcpyHostToDevice); 91 | cudaMemcpy(device_y, yarray, N * sizeof(float), cudaMemcpyHostToDevice); 92 | 93 | 94 | double startExecTime = CycleTimer::currentSeconds(); 95 | // run CUDA kernel. (notice the <<< >>> brackets indicating a CUDA 96 | // kernel launch) Execution on the GPU occurs here. 97 | saxpy_kernel<<>>(N, alpha, device_x, device_y, device_result); 98 | 99 | // return before all work id done. 
100 | cudaDeviceSynchronize(); 101 | 102 | double endExecTime = CycleTimer::currentSeconds(); 103 | // 104 | // CS149 TODO: copy result from GPU back to CPU using cudaMemcpy 105 | // 106 | cudaMemcpy(resultarray, device_result, N * sizeof(float), cudaMemcpyDeviceToHost); 107 | 108 | // end timing after result has been copied back into host memory 109 | double endTime = CycleTimer::currentSeconds(); 110 | 111 | double ExecTime = endExecTime - startExecTime; 112 | printf("Execute Time: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * ExecTime, GBPerSec(totalBytes, ExecTime)); 113 | 114 | cudaError_t errCode = cudaPeekAtLastError(); 115 | if (errCode != cudaSuccess) { 116 | fprintf(stderr, "WARNING: A CUDA error occured: code=%d, %s\n", 117 | errCode, cudaGetErrorString(errCode)); 118 | } 119 | 120 | double overallDuration = endTime - startTime; 121 | printf("Effective BW by CUDA saxpy: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * overallDuration, GBPerSec(totalBytes, overallDuration)); 122 | 123 | // 124 | // CS149 TODO: free memory buffers on the GPU using cudaFree 125 | // 126 | cudaFree(device_x); 127 | cudaFree(device_y); 128 | cudaFree(device_result); 129 | 130 | } 131 | 132 | void printCudaInfo() { 133 | 134 | // print out stats about the GPU in the machine. Useful if 135 | // students want to know what GPU they are running on. 
136 | 137 | int deviceCount = 0; 138 | cudaError_t err = cudaGetDeviceCount(&deviceCount); 139 | 140 | printf("---------------------------------------------------------\n"); 141 | printf("Found %d CUDA devices\n", deviceCount); 142 | 143 | for (int i=0; i(deviceProps.totalGlobalMem) / (1024 * 1024)); 150 | printf(" CUDA Cap: %d.%d\n", deviceProps.major, deviceProps.minor); 151 | } 152 | printf("---------------------------------------------------------\n"); 153 | } 154 | -------------------------------------------------------------------------------- /asst3/scan/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 
44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 
static double secondsPerTick() {
    // The scale factor is computed on the first call and cached in
    // function-local statics; later calls return the cached value.
    static bool initialized = false;
    static double secondsPerTick_val;
    if (initialized) return secondsPerTick_val;
#if defined(__APPLE__)
#ifdef __x86_64__
    int args[] = {CTL_HW, HW_CPU_FREQ};
    unsigned int Hz;
    size_t len = sizeof(Hz);
    if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) {
        fprintf(stderr, "Failed to initialize secondsPerTick_val!\n");
        exit(-1);
    }
    secondsPerTick_val = 1.0 / (double) Hz;
#else
    mach_timebase_info_data_t time_info;
    mach_timebase_info(&time_info);

    // Scales to nanoseconds without 1e-9f
    secondsPerTick_val = (1e-9*static_cast<double>(time_info.numer))/
        static_cast<double>(time_info.denom);
#endif // x86_64 or not
#elif defined(_WIN32)
    LARGE_INTEGER qwTicksPerSec;
    QueryPerformanceFrequency(&qwTicksPerSec);
    secondsPerTick_val = 1.0/static_cast<double>(qwTicksPerSec.QuadPart);
#else
    FILE *fp = fopen("/proc/cpuinfo","r");
    char input[1024];
    if (!fp) {
        fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo.");
        exit(-1);
    }
    // In case we don't find it, e.g. on the N900
    secondsPerTick_val = 1e-9;
    // NOTE(review): this branch is also compiled when currentTicks() uses
    // the clock_gettime fallback (ticks already in ns); confirm that such
    // targets never expose a frequency line in /proc/cpuinfo.
    while (fgets(input, sizeof(input), fp)) {
        // NOTE(boulos): Because reading cpuinfo depends on dynamic
        // frequency scaling it's better to read the @ sign first
        float GHz = 0.0f, MHz = 0.0f;
        if (strstr(input, "model name")) {
            char* at_sign = strstr(input, "@");
            if (at_sign) {
                char* after_at = at_sign + 1;
                char* GHz_str = strstr(after_at, "GHz");
                char* MHz_str = strstr(after_at, "MHz");
                if (GHz_str) {
                    // Cut the line at the unit so sscanf sees only the number.
                    *GHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &GHz) && GHz > 0.0f) {
                        //printf("GHz = %f\n", GHz);
                        secondsPerTick_val = 1e-9f / GHz;
                        break;
                    }
                } else if (MHz_str) {
                    *MHz_str = '\0';
                    if (1 == sscanf(after_at, "%f", &MHz) && MHz > 0.0f) {
                        //printf("MHz = %f\n", MHz);
                        // Divide by the value just parsed (MHz).  The
                        // previous code divided by GHz, which is never
                        // assigned on this path.
                        secondsPerTick_val = 1e-6f / MHz;
                        break;
                    }
                }
            }
        } else if (1 == sscanf(input, "cpu MHz : %f", &MHz) && MHz > 0.0f) {
            //printf("MHz = %f\n", MHz);
            secondsPerTick_val = 1e-6f / MHz;
            break;
        }
    }
    fclose(fp);
#endif

    initialized = true;
    return secondsPerTick_val;
}

//////////
// Return the conversion from ticks to milliseconds.
169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /asst3/scan/Makefile: -------------------------------------------------------------------------------- 1 | EXECUTABLE := cudaScan 2 | 3 | CU_FILES := scan.cu 4 | 5 | CU_DEPS := 6 | 7 | CC_FILES := main.cpp 8 | 9 | all: $(EXECUTABLE) $(REFERENCE) 10 | 11 | LOGS := logs 12 | 13 | ########################################################### 14 | 15 | OBJDIR=objs 16 | CXX=g++ -m64 17 | CXXFLAGS=-O3 -Wall -std=c++11 18 | LDFLAGS=-L/usr/local/cuda-9.0/lib64/ -lcudart 19 | NVCC=nvcc 20 | NVCCFLAGS=-O3 -m64 --gpu-architecture compute_61 -std=c++11 21 | 22 | 23 | OBJS=$(OBJDIR)/main.o $(OBJDIR)/scan.o 24 | 25 | 26 | .PHONY: dirs clean 27 | 28 | default: $(EXECUTABLE) 29 | 30 | dirs: 31 | mkdir -p $(OBJDIR)/ 32 | 33 | clean: 34 | rm -rf $(OBJDIR) *.ppm *~ $(EXECUTABLE) $(LOGS) 35 | 36 | check_scan: default 37 | ./checker.pl scan 38 | 39 | check_find_repeats: default 40 | ./checker.pl find_repeats 41 | 42 | $(EXECUTABLE): dirs $(OBJS) 43 | $(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS) 44 | 45 | $(OBJDIR)/%.o: %.cpp 46 | $(CXX) $< $(CXXFLAGS) -c -o $@ 47 | 48 | $(OBJDIR)/%.o: %.cu 49 | $(NVCC) $< $(NVCCFLAGS) -c -o $@ 50 | -------------------------------------------------------------------------------- /asst3/scan/checker.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | my @element_counts = ("1000000", "10000000", "20000000", "40000000"); 6 | 7 | my %fast_times; 8 | my %your_times; 9 | 10 | my $perf_points = 1.25; 11 | my %correct; 12 | my $test; 13 | 14 | `mkdir -p logs`; 15 | `rm -rf logs/*`; 16 | `mkdir logs/test`; 17 | `mkdir logs/ref`; 18 | 19 | if(scalar (@ARGV) != 1 || (@ARGV[0] ne "find_repeats" && @ARGV[0] ne "scan")) 20 | 
{
    print ("Usage: ./checker.pl <test>: test = scan, find_repeats\n");
    exit(1);
} else {
    # $ARGV[0] is the scalar element; @ARGV[0] is a one-element slice.
    $test = $ARGV[0];
    print("Test: $test" );
}

print "\n";
print ("--------------\n");
print ("Running tests:\n");
print ("--------------\n");

foreach my $element_count (@element_counts) {
    print ("\nElement Count: $element_count\n");

    # Correctness run: only the exit status matters, output goes to the log.
    system ("./cudaScan -m ${test} -i random -n $element_count > ./logs/test/${test}_correctness_${element_count}.log");
    my $return_value = $?;
    if ($return_value == 0) {
        print ("Correctness passed!\n");
        $correct{$element_count} = 1;
    }
    else {
        print ("Correctness failed\n");
        # Record the failure under the same key the score table reads
        # (this used to be written to the undeclared $scene).
        $correct{$element_count} = 0;
    }

    # Timing run of the student binary: keep the number that precedes " ms"
    # on the "Student GPU time:" line.
    my $your_time = `./cudaScan -m ${test} -i random -n $element_count | tee ./logs/test/${test}_time_${element_count}.log | grep \'Student GPU time:\'`;
    chomp($your_time);
    $your_time =~ s/^[^0-9]*//;
    $your_time =~ s/ ms.*//;
    print ("Student Time: $your_time\n");

    # Same measurement for the reference binary (it prints the same label).
    my $fast_time = `./cudaScan_ref -m ${test} -i random -n $element_count | tee ./logs/ref/${test}_time_${element_count}.log | grep \'Student GPU time:\'`;
    chomp($fast_time);
    $fast_time =~ s/^[^0-9]*//;
    $fast_time =~ s/ ms.*//;
    print ("Ref Time: $fast_time\n");

    $your_times{$element_count} = $your_time;
    $fast_times{$element_count} = $fast_time;
}

print "\n";
print ("-------------------------\n");
print (ucfirst($test).
" Score Table:\n"); 65 | print ("-------------------------\n"); 66 | 67 | my $header = sprintf ("| %-15s | %-15s | %-15s | %-15s |\n", "Element Count", "Ref Time", "Student Time", "Score"); 68 | my $dashes = $header; 69 | $dashes =~ s/./-/g; 70 | print $dashes; 71 | print $header; 72 | print $dashes; 73 | 74 | my $total_score = 0; 75 | 76 | foreach my $element_count (@element_counts){ 77 | my $score; 78 | my $fast_time = $fast_times{$element_count}; 79 | my $time = $your_times{$element_count}; 80 | 81 | if ($correct{$element_count}) { 82 | if ($time <= 1.20 * $fast_time) { 83 | $score = $perf_points; 84 | } 85 | else { 86 | $score = $perf_points * ($fast_time /$time); 87 | } 88 | } 89 | else { 90 | $time .= " (F)"; 91 | $score = 0; 92 | } 93 | 94 | printf ("| %-15s | %-15s | %-15s | %-15s |\n", "$element_count", "$fast_time", "$time", "$score"); 95 | $total_score += $score; 96 | } 97 | print $dashes; 98 | printf ("| %-33s | %-15s | %-15s |\n", "", "Total score:", 99 | $total_score . "/" . 
($perf_points * keys %fast_times)); 100 | print $dashes; 101 | -------------------------------------------------------------------------------- /asst3/scan/cudaScan_ref: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst3/scan/cudaScan_ref -------------------------------------------------------------------------------- /asst3/scan/log.txt: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------- 2 | Found 4 CUDA devices 3 | Device 0: NVIDIA GeForce GTX 1080 4 | SMs: 20 5 | Global mem: 8120 MB 6 | CUDA Cap: 6.1 7 | Device 1: NVIDIA GeForce GTX 1080 8 | SMs: 20 9 | Global mem: 8120 MB 10 | CUDA Cap: 6.1 11 | Device 2: NVIDIA GeForce GTX 1080 12 | SMs: 20 13 | Global mem: 8120 MB 14 | CUDA Cap: 6.1 15 | Device 3: NVIDIA GeForce GTX 1080 16 | SMs: 20 17 | Global mem: 8120 MB 18 | CUDA Cap: 6.1 19 | --------------------------------------------------------- 20 | Array size: 64 21 | block: 1, thread: 64 22 | cmp: idx: 41, value: 9 23 | 1 blocks, 32 thread ! 24 | 1 blocks, 16 thread ! 25 | 1 blocks, 8 thread ! 26 | 1 blocks, 4 thread ! 27 | 1 blocks, 2 thread ! 28 | 1 blocks, 1 thread ! 29 | 1 blocks, 1 thread ! 30 | 1 blocks, 2 thread ! 31 | 1 blocks, 4 thread ! 32 | 1 blocks, 8 thread ! 33 | 1 blocks, 16 thread ! 34 | 1 blocks, 32 thread ! 35 | fill: idx: 41, prefix_idx: 0, value: 9 36 | block: 1, thread: 64 37 | cmp: idx: 41, value: 9 38 | 1 blocks, 32 thread ! 39 | 1 blocks, 16 thread ! 40 | 1 blocks, 8 thread ! 41 | 1 blocks, 4 thread ! 42 | 1 blocks, 2 thread ! 43 | 1 blocks, 1 thread ! 44 | 1 blocks, 1 thread ! 45 | 1 blocks, 2 thread ! 46 | 1 blocks, 4 thread ! 47 | 1 blocks, 8 thread ! 48 | 1 blocks, 16 thread ! 49 | 1 blocks, 32 thread ! 
50 | fill: idx: 41, prefix_idx: 0, value: 9 51 | block: 1, thread: 64 52 | cmp: idx: 41, value: 9 53 | 1 blocks, 32 thread ! 54 | 1 blocks, 16 thread ! 55 | 1 blocks, 8 thread ! 56 | 1 blocks, 4 thread ! 57 | 1 blocks, 2 thread ! 58 | 1 blocks, 1 thread ! 59 | 1 blocks, 1 thread ! 60 | 1 blocks, 2 thread ! 61 | 1 blocks, 4 thread ! 62 | 1 blocks, 8 thread ! 63 | 1 blocks, 16 thread ! 64 | 1 blocks, 32 thread ! 65 | fill: idx: 41, prefix_idx: 0, value: 9 66 | Student GPU time: 0.216 ms 67 | Find_repeats outputs are correct! 68 | -------------------------------------------------------------------------------- /asst4/bfs/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: main.cpp bfs.cpp 4 | g++ -I../ -std=c++11 -fopenmp -O3 -g -o bfs main.cpp bfs.cpp ../common/graph.cpp ref_bfs.o 5 | grade: grade.cpp bfs.cpp 6 | g++ -I../ -std=c++11 -fopenmp -O3 -g -o bfs_grader grade.cpp bfs.cpp ../common/graph.cpp ref_bfs.o 7 | clean: 8 | rm -rf bfs_grader bfs *~ *.*~ 9 | -------------------------------------------------------------------------------- /asst4/bfs/bfs.h: -------------------------------------------------------------------------------- 1 | #ifndef __BFS_H__ 2 | #define __BFS_H__ 3 | 4 | //#define DEBUG 5 | 6 | #include "common/graph.h" 7 | #include 8 | 9 | struct solution 10 | { 11 | int *distances; 12 | }; 13 | 14 | struct vertex_set { 15 | // # of vertices in the set 16 | int count; 17 | // max size of buffer vertices 18 | int max_vertices; 19 | // array of vertex ids in set 20 | int *vertices; 21 | }; 22 | 23 | 24 | void bfs_top_down(Graph graph, solution* sol); 25 | void bfs_bottom_up(Graph graph, solution* sol); 26 | void bfs_hybrid(Graph graph, solution* sol); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /asst4/bfs/ref_bfs.o: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/bfs/ref_bfs.o -------------------------------------------------------------------------------- /asst4/cloud_readme.md: -------------------------------------------------------------------------------- 1 | # AWS Setup Instructions # 2 | 3 | For performance testing, you will need to run your code on a VM instance on Amazon Web Services (AWS). We've already sent you student coupons that you can use for billing purposes. Here are the steps for how to get set up for running on AWS. 4 | 5 | NOTE: __Please don't forget to SHUT DOWN your instances when you're done for the day to avoid burning through credits overnight!__ 6 | 7 | ### Creating a VM with 32 vCPU ### 8 | 9 | 1. Now you're ready to create a VM instance. Click on the button that says `Launch Instances`. Choose the `Ubuntu Server 20.04 LTS (HVM), SSD Volume Type` AMI: 10 | ![AMI Selection](handout/AMI.png?raw=true) 11 | 12 | 2. Choose the `m5.8xlarge` Instance Type and then click `4. Add Storage` on the top bar: 13 | ![instance](handout/instance_type_big.png?raw=true) 14 | 15 | 3. Change the size of the `Root` volume to 100 GB to accommodate the packages we will need to install to make the instance functional for the assignment: 16 | ![Storage](handout/storage_big.png?raw=true) 17 | 18 | 4. AWS will ask you to select a key pair. You can use the same key pair from assignment 3. Alternatively, you can create a new one. To create a new one, click the first dropdown and choose `Create a new key pair` and give it whatever name you'd like. This will download a keyfile to your computer called `<key_name>.pem` which you will use to log in to the VM instance you are about to create. Finally, click `Launch Instances`. 19 | ![Key Pair](handout/new_key_pair.png?raw=true) 20 | 21 | __Note: `m5.8xlarge` instances cost $1.536 / hour, so leaving one running for a whole day will consume $36.86 worth of your AWS coupon.__ 22 | 23 | 5. 
Now that you've created your VM, you should be able to __SSH__ into it. You need the public IP address to SSH into it, which you can find on the instance page by clicking the `View Instances` button on the current page and then the instance ID for your created instance (note, it may take a moment for the instance to start up and be assigned an IP address): 24 | ![IP Address](handout/ip_address.png?raw=true) 25 | Once you have the IP address, you can log in to the instance by running this command: 26 | ~~~~ 27 | ssh -i path/to/key_name.pem ubuntu@<public_ip_address> 28 | ~~~~ 29 | 30 | 6. Once you SSH into your VM instance, you'll want to install whatever software you need to make the machine a useful development environment for you. For example we recommend: 31 | ~~~~ 32 | sudo apt update 33 | sudo apt install emacs25 34 | sudo apt install make 35 | sudo apt install g++ 36 | ~~~~ 37 | 38 | If you're confused about any of the steps, having problems with setting up your account or have any additional questions, reach out to us on Piazza! 39 | 40 | __Again, please don't forget to SHUT DOWN your instances when you're done with your work for the day!__ 41 | -------------------------------------------------------------------------------- /asst4/common/CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. Different 27 | // processors in the system will have different values for this. If 28 | // your process moves across processors, then the delta time you 29 | // measure will likely be incorrect. 
This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 
77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 93 | static double secondsPerTick() { 94 | static bool initialized = false; 95 | static double secondsPerTick_val; 96 | if (initialized) return secondsPerTick_val; 97 | #if defined(__APPLE__) 98 | #ifdef __x86_64__ 99 | int args[] = {CTL_HW, HW_CPU_FREQ}; 100 | unsigned int Hz; 101 | size_t len = sizeof(Hz); 102 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) { 103 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 104 | exit(-1); 105 | } 106 | secondsPerTick_val = 1.0 / (double) Hz; 107 | #else 108 | mach_timebase_info_data_t time_info; 109 | mach_timebase_info(&time_info); 110 | 111 | // Scales to nanoseconds without 1e-9f 112 | secondsPerTick_val = (1e-9*static_cast(time_info.numer))/ 113 | static_cast(time_info.denom); 114 | #endif // x86_64 or not 115 | #elif defined(_WIN32) 116 | LARGE_INTEGER qwTicksPerSec; 117 | QueryPerformanceFrequency(&qwTicksPerSec); 118 | secondsPerTick_val = 1.0/static_cast(qwTicksPerSec.QuadPart); 119 | #else 120 | FILE *fp = fopen("/proc/cpuinfo","r"); 121 | char input[1024]; 122 | if (!fp) { 123 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 124 | exit(-1); 125 | } 126 | // In case we don't find it, e.g. 
on the N900 127 | secondsPerTick_val = 1e-9; 128 | while (!feof(fp) && fgets(input, 1024, fp)) { 129 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 130 | // frequency scaling it's better to read the @ sign first 131 | float GHz, MHz; 132 | if (strstr(input, "model name")) { 133 | char* at_sign = strstr(input, "@"); 134 | if (at_sign) { 135 | char* after_at = at_sign + 1; 136 | char* GHz_str = strstr(after_at, "GHz"); 137 | char* MHz_str = strstr(after_at, "MHz"); 138 | if (GHz_str) { 139 | *GHz_str = '\0'; 140 | if (1 == sscanf(after_at, "%f", &GHz)) { 141 | //printf("GHz = %f\n", GHz); 142 | secondsPerTick_val = 1e-9f / GHz; 143 | break; 144 | } 145 | } else if (MHz_str) { 146 | *MHz_str = '\0'; 147 | if (1 == sscanf(after_at, "%f", &MHz)) { 148 | //printf("MHz = %f\n", MHz); 149 | secondsPerTick_val = 1e-6f / GHz; 150 | break; 151 | } 152 | } 153 | } 154 | } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) { 155 | //printf("MHz = %f\n", MHz); 156 | secondsPerTick_val = 1e-6f / MHz; 157 | break; 158 | } 159 | } 160 | fclose(fp); 161 | #endif 162 | 163 | initialized = true; 164 | return secondsPerTick_val; 165 | } 166 | 167 | ////////// 168 | // Return the conversion from ticks to milliseconds. 169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /asst4/common/contracts.h: -------------------------------------------------------------------------------- 1 | /* Debugging with contracts; simulating cc0 -d 2 | * Enable with gcc -DDEBUG ... 3 | * 4 | * 15-122 Principles of Imperative Computation 5 | * Frank Pfenning 6 | */ 7 | 8 | #include 9 | 10 | /* Unlike typical header files, "contracts.h" may be 11 | * included multiple times, with and without DEBUG defined. 
12 | * For this to succeed we first undefine the macros in 13 | * question in order to avoid a redefinition warning. 14 | */ 15 | 16 | #undef ASSERT 17 | #undef REQUIRES 18 | #undef ENSURES 19 | 20 | #ifdef DEBUG 21 | 22 | #define ASSERT(COND) assert(COND) 23 | #define REQUIRES(COND) assert(COND) 24 | #define ENSURES(COND) assert(COND) 25 | 26 | #else 27 | 28 | #define ASSERT(COND) ((void)0) 29 | #define REQUIRES(COND) ((void)0) 30 | #define ENSURES(COND) ((void)0) 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /asst4/common/grade.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRADE_H__ 2 | #define __GRADE_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "graph.h" 18 | #include "graph_internal.h" 19 | #include "contracts.h" 20 | 21 | // Epsilon for approximate float comparisons 22 | #define EPSILON 0.00000000001 23 | 24 | // Output column size 25 | #define COL_SIZE 15 26 | 27 | // Point value for apps that are not run. 28 | #define POINTS_NA -1 29 | 30 | // Point value for apps that yielded incorrect results. 
31 | #define POINTS_INCORRECT -2 32 | 33 | /* 34 | * Printing functions 35 | */ 36 | 37 | static void sep(std::ostream& out, char separator = '-', int length = 78) 38 | { 39 | for (int i = 0; i < length; i++) 40 | out << separator; 41 | out << std::endl; 42 | } 43 | 44 | static void printTimingApp(std::ostream& timing, const char* appName) 45 | { 46 | std::cout << std::endl; 47 | std::cout << "Timing results for " << appName << ":" << std::endl; 48 | sep(std::cout, '=', 75); 49 | 50 | timing << std::endl; 51 | timing << "Timing results for " << appName << ":" << std::endl; 52 | sep(timing, '=', 75); 53 | } 54 | 55 | /* 56 | * Correctness checkers 57 | */ 58 | 59 | template 60 | bool compareArrays(Graph graph, T* ref, T* stu) 61 | { 62 | for (int i = 0; i < graph->num_nodes; i++) { 63 | if (ref[i] != stu[i]) { 64 | std::cerr << "*** Results disagree at " << i << " expected " 65 | << ref[i] << " found " << stu[i] << std::endl; 66 | return false; 67 | } 68 | } 69 | return true; 70 | } 71 | 72 | template 73 | bool compareApprox(Graph graph, T* ref, T* stu) 74 | { 75 | for (int i = 0; i < graph->num_nodes; i++) { 76 | if (fabs(ref[i] - stu[i]) > EPSILON) { 77 | std::cerr << "*** Results disagree at " << i << " expected " 78 | << ref[i] << " found " << stu[i] << std::endl; 79 | return false; 80 | } 81 | } 82 | return true; 83 | } 84 | 85 | template 86 | bool compareArraysAndDisplay(Graph graph, T* ref, T*stu) 87 | { 88 | printf("\n----------------------------------\n"); 89 | printf("Visualization of student results"); 90 | printf("\n----------------------------------\n\n"); 91 | 92 | int grid_dim = (int)sqrt(graph->num_nodes); 93 | for (int j=0; jnum_nodes); 104 | for (int j=0; j(graph, ref, stu); 112 | } 113 | 114 | template 115 | bool compareArraysAndRadiiEst(Graph graph, T* ref, T* stu) 116 | { 117 | bool isCorrect = true; 118 | for (int i = 0; i < graph->num_nodes; i++) { 119 | if (ref[i] != stu[i]) { 120 | std::cerr << "*** Results disagree at " << i << " expected " 
121 | << ref[i] << " found " << stu[i] << std::endl; 122 | isCorrect = false; 123 | } 124 | } 125 | int stuMaxVal = -1; 126 | int refMaxVal = -1; 127 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: stuMaxVal) 128 | for (int i = 0; i < graph->num_nodes; i++) { 129 | if (stu[i] > stuMaxVal) 130 | stuMaxVal = stu[i]; 131 | } 132 | #pragma omp parallel for schedule(dynamic, 512) reduction(max: refMaxVal) 133 | for (int i = 0; i < graph->num_nodes; i++) { 134 | if (ref[i] > refMaxVal) 135 | refMaxVal = ref[i]; 136 | } 137 | 138 | if (refMaxVal != stuMaxVal) { 139 | std::cerr << "*** Radius estimates differ. Expected: " << refMaxVal << " Got: " << stuMaxVal << std::endl; 140 | isCorrect = false; 141 | } 142 | return isCorrect; 143 | } 144 | 145 | #endif /* __GRADE_H__ */ 146 | -------------------------------------------------------------------------------- /asst4/common/graph.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_H__ 2 | #define __GRAPH_H__ 3 | 4 | using Vertex = int; 5 | 6 | struct graph 7 | { 8 | // Number of edges in the graph 9 | int num_edges; 10 | // Number of vertices in the graph 11 | int num_nodes; 12 | 13 | // The node reached by vertex i's first outgoing edge is given by 14 | // outgoing_edges[outgoing_starts[i]]. To iterate over all 15 | // outgoing edges, please see the top-down bfs implementation. 
16 | int* outgoing_starts; 17 | Vertex* outgoing_edges; 18 | 19 | int* incoming_starts; 20 | Vertex* incoming_edges; 21 | }; 22 | 23 | using Graph = graph*; 24 | 25 | /* Getters */ 26 | static inline int num_nodes(const Graph); 27 | static inline int num_edges(const Graph); 28 | 29 | static inline const Vertex* outgoing_begin(const Graph, Vertex); 30 | static inline const Vertex* outgoing_end(const Graph, Vertex); 31 | static inline int outgoing_size(const Graph, Vertex); 32 | 33 | static inline const Vertex* incoming_begin(const Graph, Vertex); 34 | static inline const Vertex* incoming_end(const Graph, Vertex); 35 | static inline int incoming_size(const Graph, Vertex); 36 | 37 | 38 | /* IO */ 39 | Graph load_graph(const char* filename); 40 | Graph load_graph_binary(const char* filename); 41 | void store_graph_binary(const char* filename, Graph); 42 | 43 | void print_graph(const graph*); 44 | 45 | 46 | /* Deallocation */ 47 | void free_graph(Graph); 48 | 49 | 50 | /* Included here to enable inlining. Don't look. 
*/ 51 | #include "graph_internal.h" 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /asst4/common/graph_internal.h: -------------------------------------------------------------------------------- 1 | #ifndef __GRAPH_INTERNAL_H__ 2 | #define __GRAPH_INTERNAL_H__ 3 | 4 | #include 5 | #include "contracts.h" 6 | 7 | static inline int num_nodes(const Graph graph) 8 | { 9 | REQUIRES(graph != NULL); 10 | return graph->num_nodes; 11 | } 12 | 13 | static inline int num_edges(const Graph graph) 14 | { 15 | REQUIRES(graph != NULL); 16 | return graph->num_edges; 17 | } 18 | 19 | static inline const Vertex* outgoing_begin(const Graph g, Vertex v) 20 | { 21 | REQUIRES(g != NULL); 22 | REQUIRES(0 <= v && v < num_nodes(g)); 23 | return g->outgoing_edges + g->outgoing_starts[v]; 24 | } 25 | 26 | static inline const Vertex* outgoing_end(const Graph g, Vertex v) 27 | { 28 | REQUIRES(g != NULL); 29 | REQUIRES(0 <= v && v < num_nodes(g)); 30 | int offset = (v == g->num_nodes - 1) ? g->num_edges : g->outgoing_starts[v + 1]; 31 | return g->outgoing_edges + offset; 32 | } 33 | 34 | static inline int outgoing_size(const Graph g, Vertex v) 35 | { 36 | REQUIRES(g != NULL); 37 | REQUIRES(0 <= v && v < num_nodes(g)); 38 | if (v == g->num_nodes - 1) { 39 | return g->num_edges - g->outgoing_starts[v]; 40 | } else { 41 | return g->outgoing_starts[v + 1] - g->outgoing_starts[v]; 42 | } 43 | } 44 | 45 | static inline const Vertex* incoming_begin(const Graph g, Vertex v) 46 | { 47 | REQUIRES(g != NULL); 48 | REQUIRES(0 <= v && v < num_nodes(g)); 49 | return g->incoming_edges + g->incoming_starts[v]; 50 | } 51 | 52 | static inline const Vertex* incoming_end(const Graph g, Vertex v) 53 | { 54 | REQUIRES(g != NULL); 55 | REQUIRES(0 <= v && v < num_nodes(g)); 56 | int offset = (v == g->num_nodes - 1) ? 
g->num_edges : g->incoming_starts[v + 1]; 57 | return g->incoming_edges + offset; 58 | } 59 | 60 | static inline int incoming_size(const Graph g, Vertex v) 61 | { 62 | REQUIRES(g != NULL); 63 | REQUIRES(0 <= v && v < num_nodes(g)); 64 | if (v == g->num_nodes - 1) { 65 | return g->num_edges - g->incoming_starts[v]; 66 | } else { 67 | return g->incoming_starts[v + 1] - g->incoming_starts[v]; 68 | } 69 | } 70 | 71 | #endif // __GRAPH_INTERNAL_H__ 72 | -------------------------------------------------------------------------------- /asst4/handout/AMI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/AMI.png -------------------------------------------------------------------------------- /asst4/handout/instance_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/instance_type.png -------------------------------------------------------------------------------- /asst4/handout/instance_type_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/instance_type_big.png -------------------------------------------------------------------------------- /asst4/handout/ip_address.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/ip_address.png -------------------------------------------------------------------------------- /asst4/handout/new_key_pair.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/new_key_pair.png -------------------------------------------------------------------------------- /asst4/handout/storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/storage.png -------------------------------------------------------------------------------- /asst4/handout/storage_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/handout/storage_big.png -------------------------------------------------------------------------------- /asst4/imgs/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/imgs/1.png -------------------------------------------------------------------------------- /asst4/imgs/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/imgs/2.png -------------------------------------------------------------------------------- /asst4/pagerank/Makefile: -------------------------------------------------------------------------------- 1 | all: default grade 2 | 3 | default: page_rank.cpp main.cpp 4 | g++ -I../ -std=c++11 -fopenmp -g -O3 -o pr main.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 5 | grade: page_rank.cpp grade.cpp 6 | g++ -I../ -std=c++11 -fopenmp -g -O3 -o pr_grader grade.cpp page_rank.cpp ../common/graph.cpp ref_pr.a 7 | clean: 8 | rm -rf pr pr_grader *~ *.*~ 9 | -------------------------------------------------------------------------------- /asst4/pagerank/grade.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | #include "../common/CycleTimer.h" 13 | #include "../common/graph.h" 14 | #include "../common/grade.h" 15 | #include "page_rank.h" 16 | 17 | #define USE_BINARY_GRAPH 1 18 | 19 | #define PageRankDampening 0.3f 20 | #define PageRankConvergence 1e-7d 21 | 22 | void reference_pageRank(Graph g, double* solution, double damping, 23 | double convergence); 24 | 25 | void usage(const char* binary_name) { 26 | std::cout << "Usage: " << binary_name << " [options] graphdir" << std::endl; 27 | std::cout << std::endl; 28 | std::cout << "Options:" << std::endl; 29 | std::cout << " -n INT number of threads" << std::endl; 30 | std::cout << " -r INT number of runs" << std::endl; 31 | std::cout << " -h this commandline help message" << std::endl; 32 | } 33 | 34 | graph* load_graph(std::string graph_filename) { 35 | graph* g; 36 | if (USE_BINARY_GRAPH) { 37 | g = load_graph_binary(graph_filename.c_str()); 38 | } else { 39 | g = load_graph(graph_filename); 40 | printf("storing binary form of graph!\n"); 41 | store_graph_binary(graph_filename.append(".bin").c_str(), g); 42 | delete g; 43 | exit(1); 44 | } 45 | return g; 46 | } 47 | 48 | double run_on_graph(graph* g, int num_threads, int num_runs, std::string graph_name) { 49 | 50 | double* sol_stu = new double[g->num_nodes]; 51 | double* sol_ref = new double[g->num_nodes]; 52 | 53 | omp_set_num_threads(num_threads); 54 | 55 | double start, time; 56 | 57 | //Run implementation 58 | double stu_time = std::numeric_limits::max(); 59 | for (int r = 0; r < num_runs; r++) { 60 | start = CycleTimer::currentSeconds(); 61 | pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 62 | //reference_pageRank(g, sol_stu, PageRankDampening, PageRankConvergence); 63 | time = CycleTimer::currentSeconds() - start; 64 | stu_time = 
std::min(stu_time, time); 65 | } 66 | 67 | //Run reference implementation 68 | double ref_time = std::numeric_limits::max(); 69 | for (int r = 0; r < num_runs; r++) { 70 | start = CycleTimer::currentSeconds(); 71 | reference_pageRank(g, sol_ref, PageRankDampening, PageRankConvergence); 72 | time = CycleTimer::currentSeconds() - start; 73 | ref_time = std::min(ref_time, time); 74 | } 75 | 76 | bool correct = compareApprox(g, sol_ref, sol_stu); 77 | 78 | delete(sol_stu); 79 | delete(sol_ref); 80 | 81 | if (!correct) { 82 | std::cout << "Page rank incorrect" << std::endl; 83 | } else { 84 | std::cout << "ref_time: " << ref_time << "s" << std::endl; 85 | std::cout << "stu_time: " << stu_time << "s" << std::endl; 86 | } 87 | 88 | double max_score = 4; 89 | double max_perf_score = 0.8 * max_score; 90 | double correctness_score = 0.2 * max_score; 91 | correctness_score = (correct) ? correctness_score : 0; 92 | 93 | double ratio = (ref_time/stu_time); 94 | 95 | double slope = max_perf_score/(0.7 - 0.3); 96 | double offset = 0.3 * slope; 97 | 98 | double perf_score = (correct) ? 
ratio*slope - offset : 0; 99 | 100 | if (perf_score < 0) perf_score = 0; 101 | if (perf_score > max_perf_score) perf_score = max_perf_score; 102 | 103 | return (correctness_score + perf_score); 104 | } 105 | 106 | void print_separator_line() { 107 | for (int i = 0; i < 43; i++) { 108 | std::cout<<"-"; 109 | } 110 | std::cout< grade_graphs, std::vector scores) { 114 | 115 | std::cout.precision(5); 116 | std::cout.setf(std::ios::fixed, std:: ios::floatfield); 117 | std::cout< grade_graphs = { "soc-livejournal1_68m.graph", 191 | "com-orkut_117m.graph", 192 | "rmat_200m.graph", 193 | "random_500m.graph"}; 194 | 195 | std::vector scores(grade_graphs.size()); 196 | 197 | int i = 0; 198 | for (auto& graph_name: grade_graphs) { 199 | graph* g = load_graph(graph_dir + '/' + graph_name); 200 | std::cout << "\nGraph: " << graph_name << std::endl; 201 | scores[i] = run_on_graph(g, num_threads, num_runs, graph_name); 202 | delete g; 203 | i++; 204 | } 205 | 206 | print_scores(grade_graphs, scores); 207 | 208 | return 0; 209 | } 210 | -------------------------------------------------------------------------------- /asst4/pagerank/page_rank.cpp: -------------------------------------------------------------------------------- 1 | #include "page_rank.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "../common/CycleTimer.h" 10 | #include "../common/graph.h" 11 | 12 | // #define DEBUG 13 | 14 | // pageRank -- 15 | // 16 | // g: graph to process (see common/graph.h) 17 | // solution: array of per-vertex vertex scores (length of array is num_nodes(g)) 18 | // damping: page-rank algorithm's damping parameter 19 | // convergence: page-rank algorithm's convergence threshold 20 | // 21 | void pageRank(Graph g, double* solution, double damping, double convergence) 22 | { 23 | 24 | // initialize vertex weights to uniform probability. 
Double 25 | // precision scores are used to avoid underflow for large graphs 26 | 27 | int numNodes = num_nodes(g); 28 | double equal_prob = 1.0 / numNodes; 29 | 30 | std::vector ans(numNodes, equal_prob); 31 | std::vector tmp(numNodes); 32 | 33 | bool converged{false}; 34 | 35 | while (!converged) { 36 | 37 | double no_out_score = 0; 38 | 39 | #ifndef DEBUG 40 | #pragma omp parallel for reduction(+:no_out_score) 41 | #endif 42 | for (int i = 0; i < numNodes; ++i) { 43 | no_out_score += outgoing_size(g, i) == 0 ? damping * ans[i] / numNodes : 0; 44 | } 45 | 46 | 47 | #ifndef DEBUG 48 | #pragma omp parallel for 49 | #endif 50 | for (int i = 0; i < numNodes; ++i) { 51 | double tmp_score = 0; 52 | const Vertex* start = incoming_begin(g, i); 53 | const Vertex* end = incoming_end(g, i); 54 | for (const Vertex* v = start; v != end; ++v) { 55 | tmp_score += ans[*v] / outgoing_size(g, *v); 56 | } 57 | tmp_score = tmp_score * damping + (1.0 - damping) / numNodes; 58 | tmp_score += no_out_score; 59 | tmp[i] = tmp_score; 60 | } 61 | 62 | double diff = 0; 63 | #ifndef DEBUG 64 | #pragma omp parallel for reduction(+:diff) 65 | #endif 66 | for (int i = 0; i < numNodes; ++i) { 67 | diff += std::fabs(ans[i] - tmp[i]); 68 | } 69 | 70 | #ifdef DEBUG 71 | printf("DIFF: %lf | CONVER: %lf\n", diff, convergence); 72 | #endif 73 | std::swap(ans, tmp); 74 | converged = diff < convergence; 75 | } 76 | 77 | memcpy(solution, &*ans.begin(), sizeof(double) * numNodes); 78 | 79 | /* 80 | CS149 students: Implement the page rank algorithm here. You 81 | are expected to parallelize the algorithm using openMP. Your 82 | solution may need to allocate (and free) temporary arrays. 
83 | 84 | Basic page rank pseudocode is provided below to get you started: 85 | 86 | // initialization: see example code above 87 | score_old[vi] = 1/numNodes; 88 | 89 | while (!converged) { 90 | 91 | // compute score_new[vi] for all nodes vi: 92 | score_new[vi] = sum over all nodes vj reachable from incoming edges 93 | { score_old[vj] / number of edges leaving vj } 94 | score_new[vi] = (damping * score_new[vi]) + (1.0-damping) / numNodes; 95 | 96 | score_new[vi] += sum over all nodes v in graph with no outgoing edges 97 | { damping * score_old[v] / numNodes } 98 | 99 | // compute how much per-node scores have changed 100 | // quit once algorithm has converged 101 | 102 | global_diff = sum over all nodes vi { abs(score_new[vi] - score_old[vi]) }; 103 | converged = (global_diff < convergence) 104 | } 105 | 106 | */ 107 | } 108 | -------------------------------------------------------------------------------- /asst4/pagerank/page_rank.h: -------------------------------------------------------------------------------- 1 | #ifndef __PAGE_RANK_H__ 2 | #define __PAGE_RANK_H__ 3 | 4 | #include "common/graph.h" 5 | 6 | void pageRank(Graph g, double* solution, double damping, double convergence); 7 | 8 | #endif /* __PAGE_RANK_H__ */ 9 | -------------------------------------------------------------------------------- /asst4/pagerank/ref_pr.a: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wangdh15/cs149/9430cf0403fb59b92794de74b73de801dbcd8d11/asst4/pagerank/ref_pr.a -------------------------------------------------------------------------------- /asst4/tools/Makefile: -------------------------------------------------------------------------------- 1 | BINARYNAME=graphTools 2 | 3 | main: 4 | g++ -std=c++11 -g -O3 -o ${BINARYNAME} graphTools.cpp ../common/graph.cpp 5 | clean: 6 | rm -rf pr *~ *.*~ ${BINARYNAME} 7 | -------------------------------------------------------------------------------- 
/asst4/tools/plaintext.graph: -------------------------------------------------------------------------------- 1 | AdjacencyGraph 2 | # num vertices 3 | 5 4 | # num edges 5 | 8 6 | # edge starts 7 | 0 4 6 7 8 8 | # all the outgoing edges (target vertex) 9 | 1 2 3 4 10 | 2 3 11 | 0 12 | 0 13 | --------------------------------------------------------------------------------