├── .gitignore ├── README.md └── src ├── build_cpu.sh ├── build_gpu.sh ├── build_halide.sh ├── halide_gen_main.cpp ├── tmp.cu ├── timer.h ├── build_halide_main.sh ├── halide_gen.cpp ├── gpu_2d.cu ├── halide_test.cpp ├── gpu.cu ├── halide_gen_usage.sh └── cpu.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/** 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ParallelProgrammingInAction -------------------------------------------------------------------------------- /src/build_cpu.sh: -------------------------------------------------------------------------------- 1 | g++ cpu.cpp -o cpu -std=c++11 -fopenmp -msse3 && ./cpu -------------------------------------------------------------------------------- /src/build_gpu.sh: -------------------------------------------------------------------------------- 1 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.1/lib64 2 | /usr/local/cuda-10.1/bin/nvcc -Wno-deprecated-gpu-targets -I/usr/local/cuda-10.1/include gpu.cu -std=c++11 -O3 -o gpu && ./gpu -------------------------------------------------------------------------------- /src/build_halide.sh: -------------------------------------------------------------------------------- 1 | g++ halide_test.cpp -I/home/chenzhen/Workspace/hpc/halide_build/include -L/home/chenzhen/Workspace/hpc/halide_build/lib -lHalide -lpthread -ldl -std=c++11 -fopenmp -msse2 -msse3 -g -o halide_test 2 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/chenzhen/Workspace/hpc/halide_build/lib 3 | ./halide_test 4 | -------------------------------------------------------------------------------- /src/halide_gen_main.cpp: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | #include "Halide.h" 4 | #include "auto_blur_mat.h" 5 | #include "blur_mat.h" 6 | #include "timer.h" 7 | 8 | using namespace Halide; 9 | 10 | #define WIDTH 8192 11 | #define HEIGHT 4096 12 | 13 | int main() { 14 | Halide::Runtime::Buffer<float> input(WIDTH, HEIGHT); 15 | Halide::Runtime::Buffer<float> output(WIDTH, HEIGHT); 16 | for (int x = 0; x < WIDTH; ++x) { 17 | for (int y = 0; y < HEIGHT; ++y) { 18 | input(x, y) = x * WIDTH + y; 19 | } 20 | } 21 | 22 | Timer t1("AOT"); 23 | blur_mat(input, output); 24 | t1.stop(); 25 | 26 | Timer t2("Auto"); 27 | auto_blur_mat(input, output); 28 | t2.stop(); 29 | 30 | for (int x = 0; x < 4; ++x) { 31 | for (int y = 0; y < 4; ++y) { 32 | printf("%f,", output(x, y)); 33 | } 34 | printf("\n"); 35 | } 36 | return 0; 37 | } -------------------------------------------------------------------------------- /src/tmp.cu: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> 2 | 3 | // Device code 4 | __global__ void VecAdd(float* A, float* B, float* C, int N) { 5 | int i = blockDim.x * blockIdx.x + threadIdx.x; 6 | if (i < N) C[i] = A[i] + B[i]; 7 | } 8 | 9 | // Host code 10 | int main() { 11 | int N = 32; 12 | size_t size = N * sizeof(float); 13 | 14 | // Allocate vectors h_A, h_B and h_C in host memory 15 | float* h_A = (float*)malloc(size); 16 | float* h_B = (float*)malloc(size); 17 | float* h_C = (float*)malloc(size); 18 | // Allocate vectors in device memory 19 | float* d_A; 20 | cudaMalloc(&d_A, size); 21 | float* d_B; 22 | cudaMalloc(&d_B, size); 23 | float* d_C; 24 | cudaMalloc(&d_C, size); 25 | 26 | // Copy vectors from host memory to device memory 27 | cudaMemcpy(d_A, h_A, size,
cudaMemcpyHostToDevice); 28 | cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 29 | 30 | // Invoke kernel 31 | int threadsPerBlock = 256; 32 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 33 | VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N); 34 | 35 | // Copy result from device memory to host memory 36 | // h_C contains the result in host memory 37 | cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 38 | 39 | // Free device memory 40 | cudaFree(d_A); 41 | cudaFree(d_B); 42 | cudaFree(d_C); 43 | 44 | // Free host memory 45 | free(h_A); free(h_B); free(h_C); 46 | } -------------------------------------------------------------------------------- /src/timer.h: -------------------------------------------------------------------------------- 1 | #include <chrono> 2 | #include <iostream> 3 | #include <string> 4 | 5 | class Timer { 6 | private: 7 | std::chrono::time_point<std::chrono::high_resolution_clock> d_start; 8 | std::string _timer_name; 9 | double _ms; 10 | double _cmp_ms = 0; 11 | 12 | public: 13 | Timer(std::string timer_name) { 14 | d_start = std::chrono::high_resolution_clock::now(); 15 | _timer_name = timer_name; 16 | } 17 | 18 | Timer(std::string timer_name, double cmp_ms) { 19 | d_start = std::chrono::high_resolution_clock::now(); 20 | _timer_name = timer_name; 21 | _cmp_ms = cmp_ms; 22 | } 23 | 24 | ~Timer() {} 25 | 26 | void stop() { 27 | auto d_end = std::chrono::high_resolution_clock::now(); 28 | auto _start = 29 | std::chrono::time_point_cast<std::chrono::microseconds>(d_start) 30 | .time_since_epoch() 31 | .count(); 32 | auto _end = std::chrono::time_point_cast<std::chrono::microseconds>(d_end) 33 | .time_since_epoch() 34 | .count(); 35 | auto duration = _end - _start; 36 | _ms = duration * 0.001; // duration is in microseconds 37 | 38 | std::cout << _timer_name << " time cost: " << _ms << "ms "; 39 | if (_cmp_ms != 0) std::cout << "speedup: " << _cmp_ms / _ms << "x"; 40 | std::cout << "\n"; 41 | } 42 | 43 | double get() { return _ms; } 44 | }; -------------------------------------------------------------------------------- /src/build_halide_main.sh: -------------------------------------------------------------------------------- 1 | # build the code generator 2 | g++ halide_gen.cpp /home/chenzhen/Workspace/hpc/halide_build/distrib/tools/GenGen.cpp \ 3 | -I/home/chenzhen/Workspace/hpc/halide_build/include -L/home/chenzhen/Workspace/hpc/halide_build/lib \ 4 | -lHalide -lpthread -ldl -std=c++11 -fopenmp -msse2 -msse3 -fno-rtti -O3 -o blur_generator 5 | 6 | # generate the header file and library 7 | # Set up LD_LIBRARY_PATH so that we can find libHalide.so 8 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/home/chenzhen/Workspace/hpc/halide_build/lib 9 | export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/home/chenzhen/Workspace/hpc/halide_build/lib 10 | 11 | 12 | # First let's compile the first generator for the host system: 13 | ./blur_generator -o . -g blur_mat target=host auto_schedule=false 14 | ./blur_generator -o .
-g auto_blur_mat -f auto_blur_mat -e static_library,h,schedule \ 15 | -p /home/chenzhen/Workspace/hpc/halide_build/distrib/lib/libautoschedule_adams2019.so \ 16 | target=host auto_schedule=true 17 | 18 | # call the library 19 | g++ halide_gen_main.cpp blur_mat.a auto_blur_mat.a -I/home/chenzhen/Workspace/hpc/src \ 20 | -I/home/chenzhen/Workspace/hpc/halide_build/include -L/home/chenzhen/Workspace/hpc/halide_build/lib \ 21 | -lHalide -lpthread -ldl -std=c++11 -fopenmp -O3 -o halide_generator_main 22 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/chenzhen/Workspace/hpc/halide_build/lib 23 | ./halide_generator_main -------------------------------------------------------------------------------- /src/halide_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | class BlurGenerator : public Halide::Generator<BlurGenerator> { 6 | public: 7 | Input<Buffer<float>> input = {"input", 2}; 8 | Output<Buffer<float>> output = {"output", 2}; 9 | 10 | void generate() { 11 | Var x("x"), y("y"), x_inner("x_inner"), y_inner("y_inner"); 12 | Func blur_x("blur_x"); 13 | Expr clamped_x = clamp(x, 0, input.width() - 1); 14 | Expr clamped_y = clamp(y, 0, input.height() - 1); 15 | Func input_clamped; 16 | input_clamped(x, y) = input(clamped_x, clamped_y); 17 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 18 | input_clamped(x + 2, y)) / 19 | 3.0f; 20 | output(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 21 | 22 | output.tile(x, y, x_inner, y_inner, 8, 8); 23 | output.parallel(y); 24 | output.vectorize(y, 12); 25 | blur_x.store_at(output, x); 26 | blur_x.compute_at(output, x); 27 | blur_x.vectorize(x, 12); 28 | } 29 | }; 30 | 31 | class AutoBlurGenerator : public Halide::Generator<AutoBlurGenerator> { 32 | public: 33 | Input<Buffer<float>> input = {"input", 2}; 34 | Output<Buffer<float>> output = {"output", 2}; 35 | Func blur_x; 36 | Var x, y, x_inner, y_inner; 37 | 38 | void generate() { 39 | Expr clamped_x = clamp(x, 0, input.width() - 1); 40 | Expr clamped_y = clamp(y, 0, input.height() - 1); 41 | Func input_clamped; 42 | input_clamped(x, y) = input(clamped_x, clamped_y); 43 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 44 | input_clamped(x + 2, y)) / 45 | 3.0f; 46 | output(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 47 | } 48 | 49 | void schedule() { 50 | if (auto_schedule) { 51 | input.set_estimates({{8192, 8192}, {4096, 4096}}); // set_estimates takes {min, extent} pairs per dimension 52 | output.set_estimates({{8192, 8192}, {4096, 4096}}); 53 | } else { 54 | output.tile(x, y, x_inner, y_inner, 8, 8); 55 | output.parallel(y); 56 | output.vectorize(y, 12); 57 | blur_x.store_at(output, x); 58 | blur_x.compute_at(output, x); 59 | blur_x.vectorize(x, 12); 60 | } 61 | } 62 | }; 63 | 64 | // Register our generator: 65 | HALIDE_REGISTER_GENERATOR(BlurGenerator, blur_mat) 66 | HALIDE_REGISTER_GENERATOR(AutoBlurGenerator, auto_blur_mat) -------------------------------------------------------------------------------- /src/gpu_2d.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | 4 | #include <iostream> 5 | 6 | #include "timer.h" 7 | 8 | using namespace std; 9 | 10 | __global__ void blur_mat(float **input, float **output, int width, int height) { 11 | int y = blockIdx.y * blockDim.y + threadIdx.y; 12 | int x = blockIdx.x * blockDim.x + threadIdx.x; 13 | int left = x - 1 < 0 ? 0 : x - 1; 14 | int right = x + 1 >= width ? width - 1 : x + 1; 15 | int above = y - 1 < 0 ? 0 : y - 1; 16 | int below = y + 1 >= height ?
height - 1 : y + 1; 17 | output[y][x] = (input[y][x] + input[y][left] + input[y][right] + 18 | input[above][left] + input[above][x] + input[above][right] + 19 | input[below][left] + input[below][x] + input[below][right]) / 20 | 9; 21 | } 22 | 23 | __global__ void blur_mat_redup(float **input, float **output, int width, 24 | int height) { 25 | int y = blockIdx.y * blockDim.y + threadIdx.y; 26 | int x = blockIdx.x * blockDim.x + threadIdx.x; 27 | int left = x - 1 < 0 ? 0 : x - 1; 28 | int right = x + 1 >= width ? width - 1 : x + 1; 29 | int above = y - 1 < 0 ? 0 : y - 1; 30 | int below = y + 1 >= height ? height - 1 : y + 1; 31 | output[y][x] = (input[y][x] + input[y][left] + input[y][right]) / 3; 32 | output[y][x] = (output[y][x] + output[above][x] + output[below][x]) / 3; // note: neighbouring output rows may not be written yet (data race) 33 | } 34 | 35 | void print_mat(float *data, int width, int height) { 36 | for (int y = 0; y < height; y++) { 37 | for (int x = 0; x < width; x++) { 38 | cout << data[width * y + x] << ", "; 39 | } 40 | cout << endl; 41 | } 42 | } 43 | 44 | int main() { 45 | cudaSetDevice(3); 46 | const int width = 8192; 47 | const int height = 4096; 48 | 49 | float **input = (float **)malloc(sizeof(float *) * height); 50 | float **output = (float **)malloc(sizeof(float *) * height); 51 | float *input_data = (float *)malloc(sizeof(float) * width * height); 52 | float *output_data = (float *)malloc(sizeof(float) * width * height); 53 | for (int i = 0; i < width * height; ++i) { 54 | input_data[i] = i; 55 | output_data[i] = 0.0f; 56 | } 57 | 58 | float **d_input; 59 | float **d_output; 60 | float *d_input_data; 61 | float *d_output_data; 62 | 63 | cudaMalloc((void **)&d_input, sizeof(float **) * height); 64 | cudaMalloc((void **)&d_output, sizeof(float **) * height); 65 | cudaMalloc((void **)&d_input_data, sizeof(float) * width * height); 66 | cudaMalloc((void **)&d_output_data, sizeof(float) * width * height); 67 | 68 | for (int i = 0; i < height; ++i) { 69 | input[i] = d_input_data + width * i; 70 | output[i] = d_output_data + width * i; 71 | } 72 | 73 | Timer t_copy("Host to device"); 74 | cudaMemcpy(d_input, input, sizeof(float *) * height, cudaMemcpyHostToDevice); 75 | cudaMemcpy(d_output, output, sizeof(float *) * height, 76 | cudaMemcpyHostToDevice); 77 | cudaMemcpy(d_input_data, input_data, sizeof(float) * width * height, 78 | cudaMemcpyHostToDevice); 79 | t_copy.stop(); 80 | 81 | dim3 dim_block(16, 8); 82 | dim3 dim_grid(width / dim_block.x, height / dim_block.y); 83 | 84 | Timer t1("original"); 85 | blur_mat<<<dim_grid, dim_block>>>(d_input, d_output, width, height); 86 | cudaDeviceSynchronize(); 87 | t1.stop(); 88 | 89 | cudaMemcpy(output_data, d_output_data, sizeof(float) * width * height, 90 | cudaMemcpyDeviceToHost); 91 | 92 | Timer t2("redup"); 93 | blur_mat_redup<<<dim_grid, dim_block>>>(d_input, d_output, width, height); 94 | cudaDeviceSynchronize(); 95 | t2.stop(); 96 | 97 | cudaFree(d_input); 98 | cudaFree(d_output); 99 | cudaFree(d_input_data); 100 | cudaFree(d_output_data); 101 | 102 | printf("%f,%f\n", output_data[0], output_data[1200]); 103 | 104 | free(input); 105 | free(output); 106 | free(input_data); 107 | free(output_data); 108 | 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /src/halide_test.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | #include <stdio.h> 4 | 5 | #include <iostream> 6 | 7 | #include "timer.h" 8 | 9 | using namespace std; 10 | using namespace Halide; 11 | 12 | #define WIDTH 8192 13 | #define HEIGHT 4096 14 | 15 | void
blur_original(Buffer& input) { 16 | Var x("x"), y("y"); 17 | 18 | Expr clamped_x = clamp(x, 0, input.width() - 1); 19 | Expr clamped_y = clamp(y, 0, input.height() - 1); 20 | Func input_clamped; 21 | input_clamped(x, y) = input(clamped_x, clamped_y); 22 | 23 | Func blur_x("blur x"); 24 | Func blur_y("blur y"); 25 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 26 | input_clamped(x + 2, y)) / 27 | 3.0f; 28 | 29 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 30 | Timer t1("1 original"); 31 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 32 | t1.stop(); 33 | } 34 | 35 | void blur_x_root(Buffer& input) { 36 | Var x("x"), y("y"); 37 | 38 | Expr clamped_x = clamp(x, 0, input.width() - 1); 39 | Expr clamped_y = clamp(y, 0, input.height() - 1); 40 | Func input_clamped; 41 | input_clamped(x, y) = input(clamped_x, clamped_y); 42 | 43 | Func blur_x("blur x"); 44 | Func blur_y("blur y"); 45 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 46 | input_clamped(x + 2, y)) / 47 | 3.0f; 48 | 49 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 50 | 51 | blur_x.compute_root(); 52 | Timer t1("1 original"); 53 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 54 | t1.stop(); 55 | } 56 | 57 | void blur_x_at_y(Buffer& input) { 58 | Var x("x"), y("y"); 59 | 60 | Expr clamped_x = clamp(x, 0, input.width() - 1); 61 | Expr clamped_y = clamp(y, 0, input.height() - 1); 62 | Func input_clamped; 63 | input_clamped(x, y) = input(clamped_x, clamped_y); 64 | 65 | Func blur_x("blur x"); 66 | Func blur_y("blur y"); 67 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 68 | input_clamped(x + 2, y)) / 69 | 3.0f; 70 | 71 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 72 | 73 | blur_x.compute_at(blur_y, y); 74 | Timer t1("1 original"); 75 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 76 | t1.stop(); 77 | } 78 | 79 | void blur_x_store(Buffer& input) { 80 | Var x("x"), y("y"); 81 | 82 | Expr clamped_x = clamp(x, 0, input.width() - 1); 83 | Expr clamped_y = clamp(y, 0, input.height() - 1); 84 | Func input_clamped; 85 | input_clamped(x, y) = input(clamped_x, clamped_y); 86 | 87 | Func blur_x("blur x"); 88 | Func blur_y("blur y"); 89 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 90 | input_clamped(x + 2, y)) / 91 | 3.0f; 92 | 93 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 94 | blur_x.store_root(); 95 | blur_x.compute_at(blur_y, y); 96 | Timer t1("1 original"); 97 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 98 | t1.stop(); 99 | } 100 | 101 | void blur_x_at_x(Buffer& input) { 102 | Var x("x"), y("y"); 103 | 104 | Expr clamped_x = clamp(x, 0, input.width() - 1); 105 | Expr clamped_y = clamp(y, 0, input.height() - 1); 106 | Func input_clamped; 107 | input_clamped(x, y) = input(clamped_x, clamped_y); 108 | 109 | Func blur_x("blur x"); 110 | Func blur_y("blur y"); 111 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 112 | input_clamped(x + 2, y)) / 113 | 3.0f; 114 | 115 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 116 | 117 | blur_x.compute_at(blur_y, x); 118 | Timer t1("1 original"); 119 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 120 | t1.stop(); 121 | } 122 | 123 | void blur_tile(Buffer& input) { 124 | Var x("x"), y("y"); 125 | 126 | Expr clamped_x = clamp(x, 0, input.width() - 1); 127 | Expr clamped_y = clamp(y, 0, input.height() - 1); 128 | Func input_clamped; 129 | input_clamped(x, 
y) = input(clamped_x, clamped_y); 130 | 131 | Func blur_x("blur x"); 132 | Func blur_y("blur y"); 133 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 134 | input_clamped(x + 2, y)) / 135 | 3.0f; 136 | 137 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 138 | 139 | Var x_outer, y_outer, x_inner, y_inner; 140 | blur_y.tile(x, y, x_outer, y_outer, x_inner, y_inner, 32, 32) 141 | .parallel(y_outer); 142 | 143 | // Compute the blur_x per tile of the blur_y 144 | blur_x.compute_at(blur_y, x_outer); 145 | 146 | Timer t1("1 original"); 147 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 148 | t1.stop(); 149 | } 150 | 151 | void blur_mixed(Buffer& input) { 152 | Var x("x"), y("y"); 153 | 154 | Expr clamped_x = clamp(x, 0, input.width() - 1); 155 | Expr clamped_y = clamp(y, 0, input.height() - 1); 156 | Func input_clamped; 157 | input_clamped(x, y) = input(clamped_x, clamped_y); 158 | 159 | Func blur_x("blur x"); 160 | Func blur_y("blur y"); 161 | blur_x(x, y) = (input_clamped(x, y) + input_clamped(x + 1, y) + 162 | input_clamped(x + 2, y)) / 163 | 3.0f; 164 | blur_y(x, y) = (blur_x(x, y) + blur_x(x, y + 1) + blur_x(x, y + 2)) / 3.0f; 165 | 166 | Var x_inner, y_inner; 167 | 168 | blur_y.tile(x, y, x_inner, y_inner, 8, 8); 169 | blur_y.parallel(y); 170 | blur_y.vectorize(x_inner, 8); 171 | blur_x.compute_at(blur_y, x); 172 | blur_x.vectorize(x, 8); 173 | 174 | Timer t1("1 original"); 175 | Buffer output = blur_y.realize(WIDTH, HEIGHT); 176 | t1.stop(); 177 | } 178 | 179 | int main() { 180 | Buffer input(WIDTH, HEIGHT); 181 | for (int x = 0; x < WIDTH; ++x) { 182 | for (int y = 0; y < HEIGHT; ++y) { 183 | input(x, y) = x * WIDTH + y; 184 | } 185 | } 186 | 187 | // blur_original(input); 188 | 189 | // blur_x_root(input); 190 | 191 | // blur_x_at_y(input); 192 | 193 | // blur_x_store(input); 194 | 195 | // blur_x_at_x(input); 196 | 197 | blur_tile(input); 198 | 199 | blur_mixed(input); 200 | 201 | // for (int j = 0; j < 4; j++) { 202 | // for (int i = 0; i < 4; i++) { 203 | // // We can access a pixel of an Buffer object using similar 204 | // // syntax to defining and using functions. 
205 | // printf("%f, ", output(i, j)); 206 | // } 207 | // printf("\n"); 208 | // } 209 | 210 | // printf("Success!\n"); 211 | 212 | return 0; 213 | } -------------------------------------------------------------------------------- /src/gpu.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | 4 | #include <iostream> 5 | 6 | #include "timer.h" 7 | 8 | using namespace std; 9 | 10 | #define DIM_BLOCK_X (16) 11 | #define DIM_BLOCK_Y (8) 12 | 13 | template <typename T> 14 | struct Mat { 15 | Mat(int width, int height) : _width(width), _height(height), _data(nullptr) {} 16 | Mat(int width, int height, bool init) 17 | : _width(width), _height(height), _data(nullptr) { 18 | _data = (T *)malloc(sizeof(T) * _width * _height); 19 | if (init) { 20 | for (int i = 0; i < _width * _height; i++) _data[i] = i; 21 | } 22 | } 23 | ~Mat() { 24 | if (_data) free(_data); 25 | } 26 | 27 | __host__ __device__ T get(int x, int y) { return _data[y * _width + x]; } 28 | __device__ void set(int x, int y, T value) { _data[y * _width + x] = value; } 29 | 30 | int _height; 31 | int _width; 32 | T *_data; 33 | }; 34 | 35 | __global__ void blur_mat(Mat<float> *input, Mat<float> *output) { 36 | int width = input->_width; 37 | int height = input->_height; 38 | // int width = 8192; 39 | // int height = 4096; 40 | 41 | int y = blockIdx.y * blockDim.y + threadIdx.y; 42 | int x = blockIdx.x * blockDim.x + threadIdx.x; 43 | int right = x + 1 >= width ? width - 1 : x + 1; 44 | int right_right = x + 2 >= width ? width - 1 : x + 2; 45 | int below = y + 1 >= height ? height - 1 : y + 1; 46 | int below_below = y + 2 >= height ? height - 1 : y + 2; 47 | 48 | float res = 49 | (input->get(x, y) + input->get(right, y) + input->get(right_right, y) + 50 | input->get(x, below) + input->get(right, below) + 51 | input->get(right_right, below) + input->get(x, below_below) + 52 | input->get(right, below_below) + input->get(right_right, below_below)) / 53 | 9; 54 | output->set(x, y, res); 55 | } 56 | 57 | __global__ void blur_mat_redup(Mat<float> *input, Mat<float> *output) { 58 | int width = input->_width; 59 | int height = input->_height; 60 | 61 | int y = blockIdx.y * blockDim.y + threadIdx.y; 62 | int x = blockIdx.x * blockDim.x + threadIdx.x; 63 | int right = x + 1 >= width ? width - 1 : x + 1; 64 | int right_right = x + 2 >= width ? width - 1 : x + 2; 65 | int below = y + 1 >= height ? height - 1 : y + 1; 66 | int below_below = y + 2 >= height ? height - 1 : y + 2; 67 | 68 | output->set( 69 | x, y, 70 | (input->get(x, y) + input->get(right, y) + input->get(right_right, y)) / 71 | 3); 72 | __syncthreads(); // only syncs this block; rows owned by other blocks may not be written yet 73 | output->set(x, y, 74 | (output->get(x, y) + output->get(x, below) + 75 | output->get(x, below_below)) / 76 | 3); 77 | } 78 | 79 | __global__ void blur_mat_tiling(Mat<float> *input, Mat<float> *output) { 80 | int width = input->_width; 81 | int height = input->_height; 82 | 83 | __shared__ float tile[DIM_BLOCK_Y + 2][DIM_BLOCK_X + 2]; 84 | 85 | int y = blockIdx.y * blockDim.y + threadIdx.y; 86 | int x = blockIdx.x * blockDim.x + threadIdx.x; 87 | int tile_x = threadIdx.x; 88 | int tile_y = threadIdx.y; 89 | 90 | tile[tile_y][tile_x] = input->get(x, y); 91 | 92 | if (tile_x == DIM_BLOCK_X - 1) { 93 | int right = x + 1 >= width - 1 ? width - 1 : x + 1; 94 | int right_right = x + 2 >= width - 1 ? width - 1 : x + 2; 95 | tile[tile_y][tile_x + 1] = input->get(right, y); 96 | tile[tile_y][tile_x + 2] = input->get(right_right, y); 97 | } 98 | 99 | if (tile_y == DIM_BLOCK_Y - 1) { 100 | int below = y + 1 >= height - 1 ?
height - 1 : y + 1; 101 | int below_below = y + 2 >= height - 1 ? height - 1 : y + 2; 102 | tile[tile_y + 1][tile_x] = input->get(x, below); 103 | tile[tile_y + 2][tile_x] = input->get(x, below_below); 104 | } 105 | 106 | if (tile_x == DIM_BLOCK_X - 1 && tile_y == DIM_BLOCK_Y - 1) { 107 | int right = x + 1 >= width - 1 ? width - 1 : x + 1; 108 | int right_right = x + 2 >= width - 1 ? width - 1 : x + 2; 109 | int below = y + 1 >= height - 1 ? height - 1 : y + 1; 110 | int below_below = y + 2 >= height - 1 ? height - 1 : y + 2; 111 | tile[tile_y + 1][tile_x + 1] = input->get(right, below); 112 | tile[tile_y + 2][tile_x + 1] = input->get(right, below_below); 113 | tile[tile_y + 1][tile_x + 2] = input->get(right_right, below); 114 | tile[tile_y + 2][tile_x + 2] = input->get(right_right, below_below); 115 | } 116 | 117 | __syncthreads(); 118 | 119 | float res = (tile[tile_y][tile_x] + tile[tile_y][tile_x + 1] + 120 | tile[tile_y][tile_x + 2] + tile[tile_y + 1][tile_x] + 121 | tile[tile_y + 1][tile_x + 1] + tile[tile_y + 1][tile_x + 2] + 122 | tile[tile_y + 2][tile_x] + tile[tile_y + 2][tile_x + 1] + 123 | tile[tile_y + 2][tile_x + 2]) / 124 | 9; 125 | output->set(x, y, res); 126 | __syncthreads(); 127 | } 128 | 129 | void print_mat(Mat<float> &mat) { 130 | for (int y = 0; y < mat._height; y++) { 131 | for (int x = 0; x < mat._width; x++) { 132 | cout << mat.get(x, y) << ", "; 133 | } 134 | cout << endl; 135 | } 136 | } 137 | 138 | int main() { 139 | cudaSetDevice(3); 140 | 141 | const int width = 8192; 142 | const int height = 4096; 143 | 144 | Mat<float> *input = new Mat<float>(width, height, true); 145 | Mat<float> *output = new Mat<float>(width, height, true); 146 | 147 | Mat<float> *d_input; 148 | Mat<float> *d_output; 149 | 150 | Mat<float> *d_input_data = new Mat<float>(width, height); 151 | Mat<float> *d_output_data = new Mat<float>(width, height); 152 | 153 | cudaMalloc((void **)&d_input, sizeof(Mat<float>)); 154 | cudaMalloc((void **)&d_output, sizeof(Mat<float>)); 155 | cudaMalloc((void **)&(d_input_data->_data), sizeof(float) * width * height); 156 | cudaMalloc((void **)&(d_output_data->_data), sizeof(float) * width * height); 157 | 158 | Timer t_copy("Host to device"); 159 | cudaMemcpy(d_input, d_input_data, sizeof(Mat<float>), cudaMemcpyHostToDevice); 160 | cudaMemcpy(d_output, d_output_data, sizeof(Mat<float>), 161 | cudaMemcpyHostToDevice); 162 | cudaMemcpy(d_input_data->_data, input->_data, sizeof(float) * width * height, 163 | cudaMemcpyHostToDevice); 164 | t_copy.stop(); 165 | 166 | dim3 dim_block(DIM_BLOCK_X, DIM_BLOCK_Y); 167 | dim3 dim_grid(width / dim_block.x, height / dim_block.y); 168 | 169 | Timer t1("original"); 170 | blur_mat<<<dim_grid, dim_block>>>(d_input, d_output); 171 | cudaDeviceSynchronize(); 172 | t1.stop(); 173 | 174 | cudaMemcpy(output->_data, d_output_data->_data, 175 | sizeof(float) * width * height, cudaMemcpyDeviceToHost); 176 | 177 | Timer t2("redup"); 178 | blur_mat_redup<<<dim_grid, dim_block>>>(d_input, d_output); 179 | cudaDeviceSynchronize(); 180 | t2.stop(); 181 | cudaMemcpy(output->_data, d_output_data->_data, 182 | sizeof(float) * width * height, cudaMemcpyDeviceToHost); 183 | 184 | Timer t3("tiling"); 185 | blur_mat_tiling<<<dim_grid, dim_block>>>(d_input, d_output); 186 | cudaDeviceSynchronize(); 187 | t3.stop(); 188 | cudaMemcpy(output->_data, d_output_data->_data, 189 | sizeof(float) * width * height, cudaMemcpyDeviceToHost); 190 | 191 | // // for (int i = 0; i < 4; ++i) { 192 | // // for (int j = 0; j < 4; ++j) { 193 | // // printf("%0.2f, ", output->get(j, i)); 194 | // // } 195 | // // printf("\n"); 196 | // // } 197 | 198 | cudaFree(d_input); 199 | cudaFree(d_output); 200 | cudaFree(d_input_data->_data); 201
| d_input_data->_data = nullptr; 202 | cudaFree(d_output_data->_data); 203 | d_output_data->_data = nullptr; 204 | 205 | delete input; 206 | delete output; 207 | delete d_input_data; 208 | delete d_output_data; 209 | 210 | return 0; 211 | } 212 | -------------------------------------------------------------------------------- /src/halide_gen_usage.sh: -------------------------------------------------------------------------------- 1 | # Halide tutorial lesson 15: Generators part 2 2 | 3 | # This shell script demonstrates how to use a binary containing 4 | # Generators from the command line. Normally you'd call these binaries 5 | # from your build system of choice rather than running them manually 6 | # like we do here. 7 | 8 | # This script assumes that you're in the tutorials directory, and the 9 | # generator has been compiled for the current system and is called 10 | # "blur_generator". 11 | 12 | # To run this script: 13 | # bash lesson_15_generators_usage.sh 14 | 15 | # First we define a helper function that checks that a file exists 16 | check_file_exists() 17 | { 18 | FILE=$1 19 | if [ ! -f $FILE ]; then 20 | echo $FILE not found 21 | exit -1 22 | fi 23 | } 24 | 25 | # And another helper function to check if a symbol exists in an object file 26 | check_symbol() 27 | { 28 | FILE=$1 29 | SYM=$2 30 | if !(nm $FILE | grep $SYM > /dev/null); then 31 | echo "$SYM not found in $FILE" 32 | exit -1 33 | fi 34 | } 35 | 36 | # Bail out on error 37 | #set -e 38 | 39 | # Set up LD_LIBRARY_PATH so that we can find libHalide.so 40 | export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/home/chenzhen/Workspace/hpc/halide_build/lib 41 | export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/home/chenzhen/Workspace/hpc/halide_build/lib 42 | 43 | ######################### 44 | # Basic generator usage # 45 | ######################### 46 | 47 | # First let's compile the first generator for the host system: 48 | ./blur_generator -o . -g blur_mat target=host auto_schedule=false 49 | # ./blur_generator -o . -g auto_blur_mat -f auto_blur_mat -e static_library,h,schedule -p /home/chenzhen/Workspace/hpc/halide_build/distrib/lib/libautoschedule_mullapudi2016.so target=host auto_schedule=true 50 | ./blur_generator -o . -g auto_blur_mat -f auto_blur_mat -e static_library,h,schedule -p /home/chenzhen/Workspace/hpc/halide_build/distrib/lib/libautoschedule_adams2019.so target=host auto_schedule=true 51 | # ./blur_generator -o . -g auto_blur_mat -f auto_blur_mat -e static_library,h,schedule -p /home/chenzhen/Workspace/hpc/halide_build/distrib/lib/libautoschedule_li2018.so target=host auto_schedule=true 52 | 53 | 54 | # That should create a pair of files in the current directory: 55 | # "blur_generator.a", and "blur_generator.h", which define a 56 | # function "blur_generator" representing the compiled pipeline. 57 | 58 | check_file_exists blur_mat.a 59 | check_file_exists blur_mat.h 60 | check_symbol blur_mat.a blur_mat 61 | 62 | exit 0 63 | 64 | ##################### 65 | # Cross-compilation # 66 | ##################### 67 | 68 | # We can also use a generator to compile object files for some other 69 | # target. Let's cross-compile a windows 32-bit object file and header 70 | # for the first generator: 71 | 72 | ./blur_generator \ 73 | -g blur_generator \ 74 | -f blur_generator_win32 \ 75 | -o . \ 76 | target=x86-32-windows 77 | 78 | # This generates a file called "blur_generator_win32.lib" in the 79 | # current directory, along with a matching header. The function 80 | # defined is called "blur_generator_win32". 
81 | 82 | check_file_exists blur_generator_win32.lib 83 | check_file_exists blur_generator_win32.h 84 | 85 | ################################ 86 | # Generating pipeline variants # 87 | ################################ 88 | 89 | # The full set of command-line arguments to the generator binary are: 90 | 91 | # -g generator_name : Selects which generator to run. If you only have 92 | # one generator in your binary you can omit this. 93 | 94 | # -o directory : Specifies which directory to create the outputs 95 | # in. Usually a build directory. 96 | 97 | # -f name : Specifies the name of the generated function. If you omit 98 | # this, it defaults to the generator name. 99 | 100 | # -n file_base_name : Specifies the basename of the generated file(s). If 101 | # you omit this, it defaults to the name of the generated function. 102 | 103 | # -e static_library,object,c_header,assembly,bitcode,stmt,stmt_html: A list of 104 | # comma-separated values specifying outputs to create. The default is 105 | # "static_library,c_header,registration". "assembly" generates assembly equivalent to the 106 | # generated object file. "bitcode" generates llvm bitcode for the pipeline. 107 | # "stmt" generates human-readable pseudocode for the pipeline (similar to 108 | # setting HL_DEBUG_CODEGEN). "stmt_html" generates an html version of the 109 | # pseudocode, which can be much nicer to read than the raw .stmt file. 110 | 111 | # -r file_base_name : Specifies that the generator should create a 112 | # standalone file for just the runtime. For use when generating multiple 113 | # pipelines from a single generator, to be linked together in one 114 | # executable. See example below. 115 | 116 | # target=... : The target to compile for. 117 | 118 | # my_generator_param=value : The value of your generator params. 119 | 120 | # Let's now generate some human-readable pseudocode for the first 121 | # generator: 122 | 123 | ./blur_generator -g blur_generator -e stmt -o . target=host 124 | 125 | check_file_exists blur_generator.stmt 126 | 127 | # The second generator has generator params, which can be specified on 128 | # the command-line after the target. Let's compile a few different variants: 129 | # ./blur_generator -g my_second_generator -f my_second_generator_1 -o . \ 130 | # target=host parallel=false scale=3.0 rotation=ccw output.type=uint16 131 | 132 | # ./blur_generator -g my_second_generator -f my_second_generator_2 -o . \ 133 | # target=host scale=9.0 rotation=ccw output.type=float32 134 | 135 | # ./blur_generator -g my_second_generator -f my_second_generator_3 -o . \ 136 | # target=host parallel=false output.type=float64 137 | 138 | # check_file_exists my_second_generator_1.a 139 | # check_file_exists my_second_generator_1.h 140 | # check_symbol my_second_generator_1.a my_second_generator_1 141 | # check_file_exists my_second_generator_2.a 142 | # check_file_exists my_second_generator_2.h 143 | # check_symbol my_second_generator_2.a my_second_generator_2 144 | # check_file_exists my_second_generator_3.a 145 | # check_file_exists my_second_generator_3.h 146 | # check_symbol my_second_generator_3.a my_second_generator_3 147 | 148 | # Use of these generated object files and headers is exactly the same 149 | # as in lesson 10. 150 | 151 | ###################### 152 | # The Halide runtime # 153 | ###################### 154 | 155 | # Each generated Halide object file contains a simple runtime that 156 | # defines things like how to run a parallel for loop, how to launch a 157 | # cuda program, etc. 
You can see this runtime in the generated object 158 | # files. 159 | 160 | # echo "The halide runtime:" 161 | # nm my_second_generator_1.a | grep "[SWT] _\?halide_" 162 | 163 | # Let's define some functions to check that the runtime exists in a file. 164 | check_runtime() 165 | { 166 | if !(nm $1 | grep "[TSW] _\?halide_" > /dev/null); then 167 | echo "Halide runtime not found in $1" 168 | exit -1 169 | fi 170 | } 171 | 172 | check_no_runtime() 173 | { 174 | if nm $1 | grep "[TSW] _\?halide_" > /dev/null; then 175 | echo "Halide runtime found in $1" 176 | exit -1 177 | fi 178 | } 179 | 180 | # Declarations and documentation for these runtime functions are in 181 | # HalideRuntime.h 182 | 183 | # If you're compiling and linking multiple Halide pipelines, then the 184 | # multiple copies of the runtime should combine into a single copy 185 | # (via weak linkage). If you're compiling and linking for multiple 186 | # different targets (e.g. avx and non-avx), then the runtimes might be 187 | # different, and you can't control which copy of the runtime the 188 | # linker selects. 189 | 190 | # You can control this behavior explicitly by compiling your pipelines 191 | # with the no_runtime target flag. Let's generate and link several 192 | # different versions of the first pipeline for different x86 variants: 193 | 194 | # (Note that we'll ask the generators to just give us object files ("-e o"), 195 | # instead of static libraries, so that we can easily link them all into a 196 | # single static library.) 197 | 198 | ./blur_generator \ 199 | -g blur_generator \ 200 | -f blur_generator_basic \ 201 | -e object,c_header\ 202 | -o . \ 203 | target=host-x86-64-no_runtime 204 | 205 | ./blur_generator \ 206 | -g blur_generator \ 207 | -f blur_generator_sse41 \ 208 | -e object,c_header\ 209 | -o . \ 210 | target=host-x86-64-sse41-no_runtime 211 | 212 | ./blur_generator \ 213 | -g blur_generator \ 214 | -f blur_generator_avx \ 215 | -e object,c_header\ 216 | -o . \ 217 | target=host-x86-64-avx-no_runtime 218 | 219 | # These files don't contain the runtime 220 | check_no_runtime blur_generator_basic.o 221 | check_symbol blur_generator_basic.o blur_generator_basic 222 | check_no_runtime blur_generator_sse41.o 223 | check_symbol blur_generator_sse41.o blur_generator_sse41 224 | check_no_runtime blur_generator_avx.o 225 | check_symbol blur_generator_avx.o blur_generator_avx 226 | 227 | # We can then use the generator to emit just the runtime: 228 | ./blur_generator \ 229 | -r halide_runtime_x86 \ 230 | -e object,c_header\ 231 | -o . \ 232 | target=host-x86-64 233 | check_runtime halide_runtime_x86.o 234 | 235 | # Linking the standalone runtime with the three generated object files 236 | # gives us three versions of the pipeline for varying levels of x86, 237 | # combined with a single runtime that will work on nearly all x86 238 | # processors. 239 | ar q blur_generator_multi.a \ 240 | blur_generator_basic.o \ 241 | blur_generator_sse41.o \ 242 | blur_generator_avx.o \ 243 | halide_runtime_x86.o 244 | 245 | check_runtime blur_generator_multi.a 246 | check_symbol blur_generator_multi.a blur_generator_basic 247 | check_symbol blur_generator_multi.a blur_generator_sse41 248 | check_symbol blur_generator_multi.a blur_generator_avx 249 | 250 | echo "Success!" 
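 251 | 252 | # As a rough sketch, the combined archive above could be linked into a driver the 253 | # same way build_halide_main.sh links the single-pipeline libraries. The driver 254 | # (multi_main.cpp) and its runtime dispatch (e.g. GCC's __builtin_cpu_supports("avx") 255 | # choosing blur_generator_avx over blur_generator_sse41 / blur_generator_basic) are 256 | # hypothetical here, not files in this repo: 257 | # g++ multi_main.cpp blur_generator_multi.a \ 258 | #   -I/home/chenzhen/Workspace/hpc/halide_build/include \ 259 | #   -lpthread -ldl -std=c++11 -O3 -o multi_main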
-------------------------------------------------------------------------------- /src/cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // #include 3 | 4 | #include 5 | #include 6 | 7 | #include "timer.h" 8 | 9 | using namespace std; 10 | 11 | void blur_mat_original(const vector> &input, 12 | vector> &output) { 13 | int height = input.size(); 14 | int width = input[0].size(); 15 | int right, right_right, below, below_below; 16 | for (int x = 0; x < width; ++x) { 17 | right = x + 1 >= width ? width - 1 : x + 1; 18 | right_right = x + 2 >= width ? width - 1 : x + 2; 19 | for (int y = 0; y < height; ++y) { 20 | below = y + 1 >= height ? height - 1 : y + 1; 21 | below_below = y + 2 >= height ? height - 1 : y + 2; 22 | output[y][x] = 23 | ((input[y][x] + input[y][right] + input[y][right_right]) + 24 | (input[below][x] + input[below][right] + input[below][right_right]) + 25 | (input[below_below][x] + input[below_below][right] + 26 | input[below_below][right_right])) / 27 | 9; 28 | } 29 | } 30 | } 31 | 32 | void blur_mat_redup(const vector> &input, 33 | vector> &output) { 34 | int height = input.size(); 35 | int width = input[0].size(); 36 | int right, right_right, below, below_below; 37 | for (int x = 0; x < width; ++x) { 38 | for (int y = 0; y < height; ++y) { 39 | right = x + 1 >= width ? width - 1 : x + 1; 40 | right_right = x + 2 >= width ? width - 1 : x + 2; 41 | output[y][x] = 42 | (input[y][x] + input[y][right] + input[y][right_right]) / 3; 43 | } 44 | } 45 | 46 | for (int x = 0; x < width; ++x) { 47 | for (int y = 0; y < height; ++y) { 48 | below = y + 1 >= height ? height - 1 : y + 1; 49 | below_below = y + 2 >= height ? height - 1 : y + 2; 50 | output[y][x] = 51 | (output[y][x] + output[below][x] + output[below_below][x]) / 3; 52 | } 53 | } 54 | } 55 | 56 | void blur_mat_locality(const vector> &input, 57 | vector> &output) { 58 | int height = input.size(); 59 | int width = input[0].size(); 60 | int right, right_right, below, below_below; 61 | for (int y = 0; y < height; ++y) { 62 | for (int x = 0; x < width; ++x) { 63 | right = x + 1 >= width ? width - 1 : x + 1; 64 | right_right = x + 2 >= width ? width - 1 : x + 2; 65 | output[y][x] = 66 | (input[y][x] + input[y][right] + input[y][right_right]) / 3; 67 | } 68 | } 69 | 70 | for (int y = 0; y < height; ++y) { 71 | for (int x = 0; x < width; ++x) { 72 | below = y + 1 >= height ? height - 1 : y + 1; 73 | below_below = y + 2 >= height ? height - 1 : y + 2; 74 | output[y][x] = 75 | (output[y][x] + output[below][x] + output[below_below][x]) / 3; 76 | } 77 | } 78 | } 79 | 80 | void blur_mat_parallel(const vector> &input, 81 | vector> &output) { 82 | int height = input.size(); 83 | int width = input[0].size(); 84 | #pragma omp parallel for 85 | for (int y = 0; y < height; ++y) { 86 | int below = y + 1 >= height ? height - 1 : y + 1; 87 | int below_below = y + 2 >= height ? height - 1 : y + 2; 88 | for (int x = 0; x < width; ++x) { 89 | int right = x + 1 >= width ? width - 1 : x + 1; 90 | int right_right = x + 2 >= width ? 
width - 1 : x + 2; 91 | output[y][x] = 92 | ((input[y][x] + input[y][right] + input[y][right_right]) + 93 | (input[below][x] + input[below][right] + input[below][right_right]) + 94 | (input[below_below][x] + input[below_below][right] + 95 | input[below_below][right_right])) / 96 | 9; 97 | } 98 | } 99 | } 100 | 101 | void blur_mat_parallel_redup(const vector> &input, 102 | vector> &output) { 103 | int height = input.size(); 104 | int width = input[0].size(); 105 | #pragma omp parallel for 106 | for (int y = 0; y < height; ++y) { 107 | for (int x = 0; x < width; ++x) { 108 | int right = x + 1 >= width ? width - 1 : x + 1; 109 | int right_right = x + 2 >= width ? width - 1 : x + 2; 110 | output[y][x] = 111 | (input[y][x] + input[y][right] + input[y][right_right]) / 3; 112 | } 113 | } 114 | // can not parallel here 115 | for (int y = 0; y < height; ++y) { 116 | for (int x = 0; x < width; ++x) { 117 | int below = y + 1 >= height ? height - 1 : y + 1; 118 | int below_below = y + 2 >= height ? height - 1 : y + 2; 119 | output[y][x] = 120 | (output[y][x] + output[below][x] + output[below_below][x]) / 3; 121 | } 122 | } 123 | } 124 | 125 | void blur_mat_tiling(const vector> &input, 126 | vector> &output, int tile_width, 127 | int tile_height) { 128 | int height = input.size(); 129 | int width = input[0].size(); 130 | int t_y, t_x, target_y, target_x, right, right_right, below, below_below; 131 | for (int tile_y = 0; tile_y < height / tile_height; ++tile_y) { 132 | t_y = tile_y * tile_height; 133 | for (int tile_x = 0; tile_x < width / tile_width; ++tile_x) { 134 | t_x = tile_x * tile_width; 135 | vector> tile_tmp(tile_height, vector(tile_width, 0)); 136 | for (int y = 0; y < tile_height; ++y) { 137 | target_y = t_y + y; 138 | for (int x = 0; x < tile_width; ++x) { 139 | target_x = t_x + x; 140 | right = target_x + 1 >= width ? width - 1 : target_x + 1; 141 | right_right = target_x + 2 >= width ? width - 1 : target_x + 2; 142 | tile_tmp[y][x] = (input[target_y][target_x] + input[target_y][right] + 143 | input[target_y][right_right]) / 144 | 3; 145 | } 146 | } 147 | 148 | for (int y = 0; y < tile_height; ++y) { 149 | int target_y = t_y + y; 150 | int below = y + 1 >= tile_height ? tile_height - 1 : y + 1; 151 | int below_below = y + 2 >= tile_height ? tile_height - 1 : y + 2; 152 | for (int x = 0; x < tile_width; ++x) { 153 | int target_x = t_x + x; 154 | output[target_y][target_x] = 155 | (tile_tmp[y][x] + tile_tmp[below][x] + tile_tmp[below_below][x]) / 156 | 3; 157 | } 158 | } 159 | } 160 | } 161 | } 162 | 163 | void blur_mat_tiling_parallel(const vector> &input, 164 | vector> &output, int tile_width, 165 | int tile_height) { 166 | int height = input.size(); 167 | int width = input[0].size(); 168 | #pragma omp parallel for 169 | for (int tile_y = 0; tile_y < height / tile_height; ++tile_y) { 170 | int t_y = tile_y * tile_height; 171 | for (int tile_x = 0; tile_x < width / tile_width; ++tile_x) { 172 | int t_x = tile_x * tile_width; 173 | vector> tile_tmp(tile_height, vector(tile_width, 0)); 174 | for (int y = 0; y < tile_height; ++y) { 175 | int target_y = t_y + y; 176 | for (int x = 0; x < tile_width; ++x) { 177 | int target_x = t_x + x; 178 | int right = target_x + 1 >= width ? width - 1 : target_x + 1; 179 | int right_right = target_x + 2 >= width ? 
width - 1 : target_x + 2; 180 | tile_tmp[y][x] = (input[target_y][target_x] + input[target_y][right] + 181 | input[target_y][right_right]) / 182 | 3; 183 | } 184 | } 185 | 186 | for (int y = 0; y < tile_height; ++y) { 187 | int target_y = t_y + y; 188 | int below = y + 1 >= tile_height ? tile_height - 1 : y + 1; 189 | int below_below = y + 2 >= tile_height ? tile_height - 1 : y + 2; 190 | for (int x = 0; x < tile_width; ++x) { 191 | int target_x = t_x + x; 192 | output[target_y][target_x] = 193 | (tile_tmp[y][x] + tile_tmp[below][x] + tile_tmp[below_below][x]) / 194 | 3; 195 | } 196 | } 197 | } 198 | } 199 | } 200 | 201 | void blur_mat_sse(const vector> &input, 202 | vector> &output) { 203 | int height = input.size(); 204 | int width = input[0].size(); 205 | #pragma omp parallel for 206 | for (int y = 0; y < height; ++y) { 207 | for (int x = 0; x < width; ++x) { 208 | int below = y + 1 >= height ? height - 1 : y + 1; 209 | int below_below = y + 2 >= height ? height - 1 : y + 2; 210 | __m128 vdata_1 = _mm_loadu_ps(&input[y][x]); 211 | __m128 vdata_2 = _mm_loadu_ps(&input[below][x]); 212 | __m128 vdata_3 = _mm_loadu_ps(&input[below_below][x]); 213 | __m128 vres = _mm_add_ps(vdata_1, vdata_2); 214 | vres = _mm_add_ps(vres, vdata_3); 215 | vres = _mm_hadd_ps(vres, vres); 216 | vres = _mm_hadd_ps(vres, vres); 217 | _mm_store_ss(&output[y][x], vres); 218 | output[y][x] /= 12; 219 | } 220 | } 221 | } 222 | 223 | void print_mat(const vector> &mat) { 224 | for (auto row : mat) { 225 | for (auto ele : row) { 226 | cout << ele << ", "; 227 | } 228 | cout << endl; 229 | } 230 | } 231 | 232 | bool check_result(const vector> &m1, 233 | const vector> &m2) { 234 | if (m1.size() != m2.size()) { 235 | cout << "matrix height not equals " << m1.size() << "!=" << m2.size() 236 | << endl; 237 | return false; 238 | } 239 | if (m1[0].size() != m2[0].size()) { 240 | cout << "matrix width not equal " << m1[0].size() << "!=" << m2[0].size(); 241 | return false; 242 | } 243 | 244 | for (int y = 0; y < m1.size(); ++y) { 245 | for (int x = 0; x < m1[0].size(); ++x) { 246 | if (m1[y][x] != m2[y][x]) { 247 | cout << "element at (" << y << "," << x << ") not equal " << m1[y][x] 248 | << "!=" << m2[y][x] << endl; 249 | return false; 250 | } 251 | } 252 | } 253 | return true; 254 | } 255 | 256 | int main() { 257 | const int width = 8192; 258 | const int height = 4096; 259 | 260 | vector> in_data(height, vector(width, 1)); 261 | 262 | vector> out_data_1(height, vector(width, 0)); 263 | Timer t1("1 original"); 264 | blur_mat_original(in_data, out_data_1); 265 | t1.stop(); 266 | 267 | // vector> out_data_2(height, vector(width, 0)); 268 | // Timer t2("2 redup", t1.get()); 269 | // blur_mat_redup(in_data, out_data_2); 270 | // t2.stop(); 271 | 272 | // check_result(out_data_1, out_data_2); 273 | 274 | // vector> out_data_3(height, vector(width, 0)); 275 | // Timer t3("3 locality", t1.get()); 276 | // blur_mat_locality(in_data, out_data_3); 277 | // t3.stop(); 278 | 279 | // check_result(out_data_1, out_data_3); 280 | 281 | // vector> out_data_4(height, vector(width, 0)); 282 | // Timer t4("4 parallel", t1.get()); 283 | // blur_mat_parallel(in_data, out_data_4); 284 | // t4.stop(); 285 | 286 | // check_result(out_data_1, out_data_4); 287 | 288 | // vector> out_data_5(height, vector(width, 0)); 289 | // Timer t5("5 parallel + redup", t1.get()); 290 | // blur_mat_parallel_redup(in_data, out_data_5); 291 | // t5.stop(); 292 | 293 | // check_result(out_data_1, out_data_5); 294 | 295 | vector> out_data_4_6(height, vector(width, 0)); 
296 | Timer t4_6("6 1024*512 tiling", t1.get()); 297 | blur_mat_tiling(in_data, out_data_4_6, 1024, 512); 298 | t4_6.stop(); 299 | 300 | check_result(out_data_1, out_data_4_6); 301 | 302 | vector<vector<float>> out_data_7(height, vector<float>(width, 0)); 303 | Timer t7("7 tiling + parallel", t1.get()); 304 | blur_mat_tiling_parallel(in_data, out_data_7, 1024, 256); 305 | t7.stop(); 306 | 307 | check_result(out_data_1, out_data_7); 308 | 309 | // vector<vector<float>> out_data_8(height, vector<float>(width, 0)); 310 | // Timer t8("8 parallel + sse", t1.get()); 311 | // blur_mat_sse(in_data, out_data_8); 312 | // t8.stop(); 313 | 314 | // check_result(out_data_1, out_data_8); 315 | 316 | return 0; 317 | } 318 | --------------------------------------------------------------------------------