├── result.jpg
├── algorithm.jpg
├── top_k_cpu.h
├── random_init.sh
├── top_k_gpu.h
├── top_k_thrust.cu
├── top_k.h
├── benchmark.h
├── macro.h
├── run.sh
├── plot.py
├── top_k_cpu.cpp
├── Makefile
├── util.h
├── benchmark.cpp
├── main.cpp
├── README.md
├── util.cpp
├── result.txt
└── top_k_gpu.cu

/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuxianzhi/Top-K/HEAD/result.jpg
--------------------------------------------------------------------------------

/algorithm.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuxianzhi/Top-K/HEAD/algorithm.jpg
--------------------------------------------------------------------------------

/top_k_cpu.h:
--------------------------------------------------------------------------------
#ifndef _TOP_K_CPU_H_
#define _TOP_K_CPU_H_

#include "macro.h"

// cpu top k
void top_k_cpu_serial(DATATYPE* input, int length, int k, DATATYPE* output);

#endif
--------------------------------------------------------------------------------

/random_init.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# usage: ./random_init.sh <output file> <values per line> <line count>

line=""
for((i=1;i<=$2;i++));
do
    #random=`expr $RANDOM % 100000`;
    #line=${line}"$random "
    line=${line}"$i "
done

rm -f $1
for((i=1;i<=$3;i++));
do
    echo ${line} >> $1
done
--------------------------------------------------------------------------------

/top_k_gpu.h:
--------------------------------------------------------------------------------
#ifndef _TOP_K_GPU_H_
#define _TOP_K_GPU_H_

#include "macro.h"

#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>

// gpu top k
void top_k_gpu(DATATYPE* input, int length, int k, DATATYPE* output);

#endif

#endif
--------------------------------------------------------------------------------

/top_k_thrust.cu:
--------------------------------------------------------------------------------
#include "top_k.h"
#include "macro.h"

#ifdef USE_THRUST
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/execution_policy.h>
#include <stdio.h>
#include <string.h>

void top_k_thrust(DATATYPE* input, int length, int k, DATATYPE* output)
{
    // sort the whole array in descending order on the host, then take the first k
    thrust::sort(thrust::host, input, input+length, thrust::greater<DATATYPE>());
    memcpy(output, input, sizeof(DATATYPE)*k);
}

#endif
--------------------------------------------------------------------------------

/top_k.h:
--------------------------------------------------------------------------------
#ifndef _TOP_K_H_
#define _TOP_K_H_

#include "macro.h"

#ifdef USE_GPU
// gpu top k
void top_k_gpu(DATATYPE* input, int length, int k, DATATYPE* output);
#endif

#ifdef USE_THRUST
// cuda thrust top k
void top_k_thrust(DATATYPE* input, int length, int k, DATATYPE* output);
#endif

// cpu top k
void top_k_cpu_serial(DATATYPE* input, int length, int k, DATATYPE* output);

#endif
--------------------------------------------------------------------------------
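Note that top_k_thrust above sorts with the `thrust::host` execution policy, so it never actually touches the GPU. A device-side variant is only a few lines more; the sketch below is illustrative and not part of the repo (the name `top_k_thrust_device` is ours):

```cpp
// Hypothetical device-side variant of top_k_thrust (not in this repo):
// copy to the GPU, sort there in descending order, copy the first k back.
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/functional.h>
#include <thrust/copy.h>

void top_k_thrust_device(const float* input, int length, int k, float* output)
{
    thrust::device_vector<float> d(input, input + length);      // host -> device
    thrust::sort(d.begin(), d.end(), thrust::greater<float>()); // runs on the GPU
    thrust::copy(d.begin(), d.begin() + k, output);             // first k back to host
}
```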
/benchmark.h:
--------------------------------------------------------------------------------
#ifndef _BENCHMARK_H_
#define _BENCHMARK_H_

#include "macro.h"

// CPU
#ifdef USE_CPU
// cpu start to time
void cpu_time_tic();

// cpu end to time
void cpu_time_toc();

// return cpu time(ms)
float cpu_time();
#endif

// GPU
#ifdef USE_GPU
// gpu start to time
void gpu_time_tic();

// gpu end to time
void gpu_time_toc();

// return gpu time(ms)
float gpu_time();
#endif

#endif
--------------------------------------------------------------------------------

/macro.h:
--------------------------------------------------------------------------------
#ifndef _MACRO_H_
#define _MACRO_H_

// data type
#define DATATYPE float

// sentinel used as negative infinity
#define NEG_INF -999999999

// use cpu
#define USE_CPU

// use gpu
#define USE_GPU

#ifdef USE_GPU
#define HANDLE_CUDA_ERROR( err ) (handleCudaError( err, __FILE__, __LINE__ ))
#define GPU_BLOCKS_THRESHOLD 2048
#define GPU_THREADS_THRESHOLD 1024
#define GPU_SHARED_MEM_THRESHOLD 48*1024
#define GPU_THREADS 128
#endif

// use gpu thrust
#define USE_THRUST

#endif
--------------------------------------------------------------------------------

/run.sh:
--------------------------------------------------------------------------------
#!/bin/bash

K_max=20

# compile
make -B

# init
rm -f result.txt
./random_init.sh /dev/shm/input.txt 7000 10000

# run
for((i=1;i<=${K_max};i++));
do
    ./main /dev/shm/input.txt 70000000 $i >> result.txt
done

# use Python matplotlib to plot
#grep "Result" result.txt | cut -d " " -f 3 | awk '++i%2' | awk BEGIN{RS=EOF}'{gsub(/\n/,",");print}'
#grep "Result" result.txt | cut -d " " -f 3 | awk 'i++%2' | awk BEGIN{RS=EOF}'{gsub(/\n/,",");print}'
#python plot.py
#eog result.jpg
--------------------------------------------------------------------------------

/plot.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt

x=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
y=[49.591000,50.984001,50.853001,50.730000,51.206001,50.880001,50.764999,50.785999,50.873001,51.125000,50.712002,51.332001,50.535999,50.435001,50.365002,50.639999,50.949001,50.544998,50.919998,50.741001]
z=[1.154912,1.178304,1.223968,1.272640,1.307232,1.371840,1.440928,3.077440,1.610752,1.727904,1.838752,2.680992,2.209824,2.478240,2.711840,13.756832,3.231936,3.700128,3.849760,5.491296]
plt.figure()
plt.xlabel('K')
plt.ylabel('time: ms')
plt.plot(x, y, color="blue", linewidth=2.5, linestyle="-", label="CPU")
plt.plot(x, z, color="red", linewidth=2.5, linestyle="-", label="GPU")
plt.legend(loc='upper right')
plt.savefig("result.jpg")
--------------------------------------------------------------------------------

/top_k_cpu.cpp:
--------------------------------------------------------------------------------
#include "top_k.h"
#include "macro.h"

// insert data into the descending array[0..k-1] if it beats the current minimum
inline void replace_smaller(DATATYPE* array, int k, DATATYPE data)
{
    if(data < array[k-1])
        return;
    for(int j=k-2; j>=0; j--)
    {
        if(data > array[j])
            array[j+1] = array[j];
        else{
            array[j+1] = data;
            return;
        }
    }
    array[0] = data;
}

void top_k_cpu_serial(DATATYPE* input, int length, int k, DATATYPE* output)
{
    // produce the first k data in descending order
    output[0] = input[0];
    for(int i=1; i<k; i++)
    {
        // insertion sort: place input[i] into output[0..i-1]
        int j;
        DATATYPE data = input[i];
        for(j=i-1; j>=0 && data > output[j]; j--)
            output[j+1] = output[j];
        output[j+1] = data;
    }
    // scan the rest, replacing the current minimum whenever a larger value appears
    for(int i=k; i<length; i++)
        replace_smaller(output, k, input[i]);
}
--------------------------------------------------------------------------------
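top_k_cpu_serial is the reference that the GPU results are checked against, so it is worth sanity-checking on its own. A minimal harness could compare it with `std::partial_sort_copy`; the function `check_top_k` and its random data below are ours, not part of the repo:

```cpp
// Hypothetical sanity check for top_k_cpu_serial (not in this repo).
#include <algorithm>
#include <cstdlib>
#include <functional>
#include <vector>
#include "top_k.h"

int check_top_k(int length, int k)
{
    std::vector<DATATYPE> data(length), ours(k), ref(k);
    for(int i = 0; i < length; i++)
        data[i] = (DATATYPE)(rand() % 100000);

    top_k_cpu_serial(data.data(), length, k, ours.data());
    // reference: the k largest values in descending order
    std::partial_sort_copy(data.begin(), data.end(),
                           ref.begin(), ref.end(), std::greater<DATATYPE>());
    return std::equal(ours.begin(), ours.end(), ref.begin()) ? 0 : 1;
}
```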
/util.h:
--------------------------------------------------------------------------------
#ifndef _UTIL_H_
#define _UTIL_H_

#include "macro.h"

// handle input
void parseParmeter(int argc, char *argv[], int *n, int *k, char **inputFileName);

// read from file
int readFromFile(char *fileName, DATATYPE* data);

// print array
void printArray(DATATYPE* data, int length);

// zero array
void zeroArray(DATATYPE* data, int length);

#ifdef USE_CPU
// malloc and free on cpu
void* mallocCPUMem(int size);
void freeCPUMem(void *point);
#endif

#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>

// malloc and free on gpu
void* mallocGPUMem(int size);
void freeGPUMem(void *point);
// copy from cpu to gpu
void cpu2gpu(void *cpudata, void *gpudata, int size);
// copy from gpu to cpu
void gpu2cpu(void *gpudata, void *cpudata, int size);
// copy from gpu to gpu
void gpu2gpu(void *gpudata_dest, void *gpudata_src, int size);
// cuda error
void handleCudaError(cudaError_t err, const char *file, int line);
#endif

#endif
--------------------------------------------------------------------------------

/benchmark.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>

#include "benchmark.h"
#include "macro.h"

// CPU
#ifdef USE_CPU
#include "sys/time.h"

struct timeval cpu_start, cpu_end;

// cpu start to time
void cpu_time_tic()
{
    gettimeofday(&cpu_start, NULL);
}

// cpu end to time
void cpu_time_toc()
{
    gettimeofday(&cpu_end, NULL);
}

// return cpu time(ms)
float cpu_time()
{
    float time_elapsed=(cpu_end.tv_sec-cpu_start.tv_sec)*1000.0 + (cpu_end.tv_usec-cpu_start.tv_usec)/1000.0;
    return time_elapsed;
}
#endif

// GPU
#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>

cudaEvent_t gpu_start, gpu_end;

// gpu start to time
void gpu_time_tic()
{
    cudaEventCreate(&gpu_start);
    cudaEventRecord(gpu_start, 0);
}

// gpu end to time
void gpu_time_toc()
{
    cudaEventCreate(&gpu_end);
    cudaEventRecord(gpu_end, 0);
}

// return gpu time(ms)
float gpu_time()
{
    float time_elapsed=0;
    cudaEventSynchronize(gpu_start);
    cudaEventSynchronize(gpu_end);
    cudaEventElapsedTime(&time_elapsed, gpu_start, gpu_end);
    cudaEventDestroy(gpu_start);
    cudaEventDestroy(gpu_end);
    return time_elapsed;
}
#endif
--------------------------------------------------------------------------------
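The cudaEvent pattern in benchmark.cpp only yields a valid elapsed time once the stop event has been synchronized, which is why gpu_time() calls cudaEventSynchronize before cudaEventElapsedTime. A self-contained sketch of the same pattern, independent of the repo (the kernel and names are ours):

```cpp
// Minimal standalone sketch of cudaEvent timing (illustrative, not repo code).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void busy_kernel(float* x) { x[threadIdx.x] *= 2.0f; }

int main()
{
    float *d;
    cudaMalloc(&d, 32 * sizeof(float));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);            // enqueue start marker on stream 0
    busy_kernel<<<1, 32>>>(d);            // the work being timed
    cudaEventRecord(stop, 0);             // enqueue stop marker

    cudaEventSynchronize(stop);           // wait until the stop marker is reached
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel took %f ms\n", ms);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d);
    return 0;
}
```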
/main.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

#include "util.h"
#include "benchmark.h"
#include "top_k.h"
#include "macro.h"

#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>
#endif

int main(int argc, char *argv[])
{
    char *inputFileName;
    int k,n;
    parseParmeter(argc, argv, &n, &k, &inputFileName);

    DATATYPE *data, *output;
    int data_size=sizeof(DATATYPE)*n;
    int output_size=sizeof(DATATYPE)*k;
    data = (DATATYPE *)mallocCPUMem(data_size);
    output = (DATATYPE *)mallocCPUMem(output_size);
    int nn = readFromFile(inputFileName, data);
    if( nn != n)
        printf("file contains %d values, expected %d\n", nn, n);

    cpu_time_tic();
    top_k_cpu_serial(data, n, k, output);
    cpu_time_toc();
    printf("CPU Result %f ms:\n", cpu_time());
    printArray(output, k);

#ifdef USE_GPU
    DATATYPE *data_D, *output_D;
    data_D = (DATATYPE *)mallocGPUMem(data_size);
    output_D = (DATATYPE *)mallocGPUMem(output_size);
    cpu2gpu(data, data_D, data_size);
    gpu_time_tic();
    top_k_gpu(data_D, n, k, output_D);
    gpu_time_toc();
    zeroArray(output, k);
    gpu2cpu(output_D, output, output_size);
    printf("GPU Result %f ms:\n", gpu_time());
    printArray(output, k);
    freeGPUMem(data_D);
    freeGPUMem(output_D);
#endif

#ifdef USE_THRUST
    zeroArray(output, k);
    cpu_time_tic();
    // note: top_k_thrust sorts data in place, so it must run last
    top_k_thrust(data, n, k, output);
    cpu_time_toc();
    printf("Thrust Result %f ms:\n", cpu_time());
    printArray(output, k);
#endif
    freeCPUMem(data);
    freeCPUMem(output);
    return 0;
}
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
### CUDA Top K

### Problem Description

Select the K largest values from an array of floats, accelerated on an Nvidia GPU with a parallel CUDA algorithm.

### Team Members

* 于献智
* 周冬炎
* 郭云龙
* 程云观
* 颜闻

### Approach

We borrow the parallel ideas of the classic CUDA **merge sort** and **sum reduction**. Each thread keeps a local top-K list; pairs of threads then merge their lists into a new top-K, repeating until the final result remains, as the figure shows. Shared memory inside each GPU block is used to speed this up, which bounds the problem size: a block has 48KB of shared memory, so for float data with 128 threads per block, K can be at most 96 (96 = 48*1024/4/128). When K exceeds what shared memory can hold, global memory is used instead.

![algorithm](algorithm.jpg "algorithm")

### Build

Compilation is driven by the [Makefile](Makefile); running ```make``` produces the executable ```main```.

### Run

* Prepare an input file ```input.txt``` holding float data as a 1-D array. The command line takes the executable ```main```, the input file ```input.txt```, the 1-D array size N, and the K to select:

```bash
./main ./input.txt N K
```

* Alternatively, use our test script [run.sh](run.sh), which tests on 70000000 values generated by [random_init.sh](random_init.sh).

### Results

* The program prints the CPU running time (ms) and result, and the GPU running time (ms) and result. Plots are drawn with ```Python```'s ```matplotlib``` module.

* Test hardware
    * CPU: Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz (16 cores / 128GB memory)
    * GPU: GeForce GTX TITAN X (12GB memory)

* Test software
    * OS: Ubuntu 14.04
    * G++: 5.4.1
    * NVCC: 8.0

* Test results (a condensed sketch of the kernel scheme follows this list)
    * We measured Top-1 through Top-20 on 70000000 float inputs. The CPU solver is [top_k_cpu.cpp](top_k_cpu.cpp) and our own GPU solver is [top_k_gpu.cu](top_k_gpu.cu); one TITAN X achieved a speedup equivalent to about 46 CPU cores. We also implemented a top-k using the sort function of CUDA's Thrust library (sort, then take the first K) in [top_k_thrust.cu](top_k_thrust.cu). However, Thrust gains little at this scale: Top-1 to Top-20 on 70000000 values takes several thousand milliseconds, which does not exploit the GPU well, so it is excluded from the comparison.
    * As K grows, a single block occupies almost all of the shared memory, so one SM cannot schedule more blocks concurrently, lowering block-level parallelism.
    * The drop in speedup at Top-8 and Top-16 is probably caused by shared-memory bank conflicts: data that could otherwise be fetched in one access needs several. Overall the speedup still exceeds 40x.

![result](result.jpg "result")
--------------------------------------------------------------------------------
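The per-thread top-K plus pairwise merging that the README describes condenses into a single toy kernel. The sketch below is illustrative only, not the repo's tuned implementation in top_k_gpu.cu (which adds multi-block reduction and strided layouts); it assumes one block, a power-of-two blockDim.x, and that blockDim.x*k floats fit in shared memory:

```cpp
// Condensed illustration of the README's scheme (our sketch, not the repo's kernels):
// each thread builds a local top-k in shared memory, then pairs of threads merge.
__global__ void top_k_toy(const float* input, int length, int k, float* output)
{
    extern __shared__ float buf[];               // blockDim.x * k floats
    float* mine = buf + threadIdx.x * k;

    for(int i = 0; i < k; i++)                   // seed with a "minus infinity" sentinel
        mine[i] = -999999999.0f;
    for(int i = threadIdx.x; i < length; i += blockDim.x)
    {
        // insert input[i] if it beats the current minimum (same idea as replace_smaller)
        float v = input[i];
        if(v < mine[k-1]) continue;
        int j = k - 2;
        for(; j >= 0 && v > mine[j]; j--) mine[j+1] = mine[j];
        mine[j+1] = v;
    }
    __syncthreads();

    for(int s = blockDim.x >> 1; s > 0; s >>= 1) // tree merge, like a sum reduction
    {
        if(threadIdx.x < s)
            for(int i = 0; i < k; i++)           // push the neighbour's k values into ours
            {
                float v = mine[s*k + i];
                if(v < mine[k-1]) continue;
                int j = k - 2;
                for(; j >= 0 && v > mine[j]; j--) mine[j+1] = mine[j];
                mine[j+1] = v;
            }
        __syncthreads();
    }
    if(threadIdx.x == 0)
        for(int i = 0; i < k; i++) output[i] = buf[i];
}
```

A launch would look like `top_k_toy<<<1, 128, 128*k*sizeof(float)>>>(d_in, n, k, d_out);` — the third launch parameter is exactly the shared-memory budget that caps k at 96 in the README's calculation.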
/util.cpp:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>

#include "util.h"
#include "macro.h"

// handle input
void parseParmeter(int argc, char *argv[], int *n, int *k, char **inputFileName){
    if(argc < 4 )
    {
        printf("3 input parameters are required, such as <./main filename n k>\n");
        exit(1);
    }

    *inputFileName = argv[1];
    *n = atoi(argv[2]);
    *k = atoi(argv[3]);
#ifdef DEBUG
    printf("select %d from %d\n", *k, *n);
#endif
}

// read from file
int readFromFile(char *fileName, DATATYPE* data)
{
    FILE *fp;
    if((fp=fopen(fileName, "r")) == NULL) {
        printf("file %s cannot be opened\n", fileName);
        exit(1);
    }

    int i=0;
    // loop on fscanf itself: testing feof() before reading can count the last value twice
    while(fscanf(fp, "%f ", &data[i]) == 1) {
        i++;
    }

    fclose(fp);
    return i;
}

// print array
void printArray(DATATYPE* data, int length)
{
    for(int i=0; i<length; i++)
        printf("%f ", data[i]);
    printf("\n");
}

// zero array
void zeroArray(DATATYPE* data, int length)
{
    for(int i=0; i<length; i++)
        data[i] = 0;
}

#ifdef USE_CPU
// malloc and free on cpu
void* mallocCPUMem(int size)
{
    if(size>0)
        return malloc(size);
    else
        return NULL;
}

void freeCPUMem(void *point)
{
    if(point != NULL)
        free(point);
}
#endif

#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>

// malloc and free on gpu
void* mallocGPUMem(int size)
{
    if(size<=0)
        return NULL;

    void* data;
    HANDLE_CUDA_ERROR(cudaMalloc(&data, size));
    return data;
}

void freeGPUMem(void *point)
{
    if(point != NULL)
        HANDLE_CUDA_ERROR(cudaFree(point));
}

// copy from cpu to gpu
void cpu2gpu(void *cpudata, void *gpudata, int size)
{
    if(size<=0)
        return;

    HANDLE_CUDA_ERROR(cudaMemcpy(gpudata, cpudata, size, cudaMemcpyHostToDevice));
}

// copy from gpu to cpu
void gpu2cpu(void *gpudata, void *cpudata, int size)
{
    if(size<=0)
        return;

    HANDLE_CUDA_ERROR(cudaMemcpy(cpudata, gpudata, size, cudaMemcpyDeviceToHost));
}

// copy from gpu to gpu
void gpu2gpu(void *gpudata_dest, void *gpudata_src, int size)
{
    if(size<=0)
        return;

    HANDLE_CUDA_ERROR(cudaMemcpy(gpudata_dest, gpudata_src, size, cudaMemcpyDeviceToDevice));
}

// cuda error
void handleCudaError(cudaError_t err, const char *file, int line) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit(2);
    }
}
#endif
--------------------------------------------------------------------------------

/result.txt:
--------------------------------------------------------------------------------
1 | CPU Result 49.591000 ms: 2 | 10000.000000 3 | GPU Result 1.154912 ms: 4 | 10000.000000 5 | CPU Result 50.984001 ms: 6 | 10000.000000 10000.000000 7 | GPU Result 1.178304 ms: 8 | 10000.000000 10000.000000 9 | CPU Result 50.853001 ms: 10 | 10000.000000 10000.000000 10000.000000 11 | GPU Result 1.223968 ms: 12 | 10000.000000 10000.000000 10000.000000 13 | CPU Result 50.730000 ms: 14 | 10000.000000 10000.000000 10000.000000 10000.000000 15 | GPU Result 1.272640 ms: 16 | 10000.000000 10000.000000 10000.000000 10000.000000 17 | CPU Result 51.206001 ms: 18 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 19 | GPU Result 1.307232 ms: 20 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 21 | CPU Result 50.880001 ms: 22 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 23 | GPU Result 1.371840 ms: 24 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 25 | CPU Result 50.764999 ms: 26 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 27 | GPU Result 1.440928 ms: 28 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 29 | CPU Result 50.785999 ms: 30 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 31 | GPU Result 3.077440 ms: 32 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 33 | CPU Result 50.873001 ms: 34 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 35 | GPU Result 1.610752 ms: 36 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 37 | CPU Result 51.125000 ms: 38 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 39 | GPU Result 1.727904 ms: 40 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000
10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 41 | CPU Result 50.712002 ms: 42 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 43 | GPU Result 1.838752 ms: 44 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 45 | CPU Result 51.332001 ms: 46 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 47 | GPU Result 2.680992 ms: 48 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 49 | CPU Result 50.535999 ms: 50 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 51 | GPU Result 2.209824 ms: 52 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 53 | CPU Result 50.435001 ms: 54 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 55 | GPU Result 2.478240 ms: 56 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 57 | CPU Result 50.365002 ms: 58 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 59 | GPU Result 2.711840 ms: 60 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 61 | CPU Result 50.639999 ms: 62 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 63 | GPU Result 13.756832 ms: 64 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 65 | CPU Result 50.949001 ms: 66 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 67 | GPU Result 3.231936 ms: 68 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 69 | CPU Result 50.544998 ms: 70 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 71 | GPU Result 3.700128 ms: 72 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 
10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 73 | CPU Result 50.919998 ms: 74 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 75 | GPU Result 3.849760 ms: 76 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 77 | CPU Result 50.741001 ms: 78 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 79 | GPU Result 5.491296 ms: 80 | 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000 10000.000000
--------------------------------------------------------------------------------

/top_k_gpu.cu:
--------------------------------------------------------------------------------
#include "top_k.h"
#include "util.h"
#include "macro.h"
#include <stdio.h>

#ifdef USE_GPU
#include <cuda.h>
#include <cuda_runtime.h>


// insert data into the descending array[0..k-1] if it beats the current minimum
__device__ inline void replace_smaller(DATATYPE* array, int k, DATATYPE data)
{
    if(data < array[k-1])
        return;
    for(int j=k-2; j>=0; j--)
    {
        if(data > array[j])
            array[j+1] = array[j];
        else{
            array[j+1] = data;
            return;
        }
    }
    array[0] = data;
}

// merge the top-k list on the right into the one on the left
__device__ inline void mergeTwoK(DATATYPE* left, DATATYPE* right, int k)
{
    for(int i=0; i<k; i++)
        replace_smaller(left, k, right[i]);
}


// kernel1: single-thread baseline, same algorithm as top_k_cpu_serial
__global__ void top_k_gpu_kernel1(DATATYPE* input, int length, int k, DATATYPE* output)
{
    // produce k data in descending order
    output[0] = input[0];
    for(int i=1; i<k; i++)
    {
        int j;
        DATATYPE data = input[i];
        for(j=i-1; j>=0 && data > output[j]; j--)
            output[j+1] = output[j];
        output[j+1] = data;
    }
    for(int i=k; i<length; i++)
        replace_smaller(output, k, input[i]);
}


// kernel2: one block; every thread keeps a local top-k in shared memory,
// then the lists are merged pairwise like a tree reduction
__global__ void top_k_gpu_kernel2(DATATYPE* input, int length, int k, DATATYPE* output)
{
    extern __shared__ DATATYPE shared_buffer[];
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    DATATYPE *myPoint = shared_buffer + localThreadId*k;
    int i, index;

    // local top-k over a strided slice of the input
    for(i=0; i<k; i++)
        myPoint[i] = NEG_INF;
    for(index=localThreadId; index<length; index+=localThreadNum)
        replace_smaller(myPoint, k, input[index]);
    __syncthreads();

    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK(myPoint, myPoint+i*k, k);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        index = blockIdx.x*localThreadNum*k;
        for(i=0; i<k; i++)
            output[index+i] = shared_buffer[i];
    }
}


// kernel3_1: many blocks; each block reduces its strided slice to one top-k
// list in shared memory and writes that list back into input, one per block
__global__ void top_k_gpu_kernel3_1(DATATYPE* input, int length, int k, DATATYPE* output)
{
    extern __shared__ DATATYPE shared_buffer[];
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    int globalThreadId = blockIdx.x*localThreadNum + localThreadId;
    int globalThreadNum = gridDim.x*localThreadNum;
    DATATYPE *myPoint = shared_buffer + localThreadId*k;
    int i, index;

    for(i=0; i<k; i++)
        myPoint[i] = NEG_INF;
    for(index=globalThreadId; index<length; index+=globalThreadNum)
        replace_smaller(myPoint, k, input[index]);
    __syncthreads();

    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK(myPoint, myPoint+i*k, k);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        DATATYPE *outputPoint = input + blockIdx.x*localThreadNum*k;
        for(i=0; i<k; i++)
            outputPoint[i] = shared_buffer[i];
    }
}

// kernel3_1_orig: fallback when k*threads floats exceed shared memory; each
// thread owns a contiguous chunk of input and builds its top-k in place at
// the front of that chunk, so no extra memory is needed
__global__ void top_k_gpu_kernel3_1_orig(DATATYPE* input, int length, int k, DATATYPE* output)
{
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    int globalThreadId = blockIdx.x*localThreadNum + localThreadId;
    int globalThreadNum = gridDim.x*localThreadNum;
    int chunk = length / globalThreadNum;
    DATATYPE *myPoint = input + globalThreadId*chunk;
    int i, j;

    // sort the first k elements of the chunk in descending order
    for(i=1; i<k; i++)
    {
        DATATYPE data = myPoint[i];
        for(j=i-1; j>=0 && data > myPoint[j]; j--)
            myPoint[j+1] = myPoint[j];
        myPoint[j+1] = data;
    }
    // scan the rest of the chunk
    for(i=k; i<chunk; i++)
        replace_smaller(myPoint, k, myPoint[i]);
    __syncthreads();

    // pairwise merge; neighbouring lists are chunk elements apart
    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK(myPoint, myPoint+i*chunk, k);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        DATATYPE *outputPoint = input + blockIdx.x*localThreadNum*k;
        for(i=0; i<k; i++)
            outputPoint[i] = myPoint[i];
    }
}


// kernel3_2: single block; merges the per-block lists that kernel3_1 left in
// input (one list every stride elements) down to the final top-k in output
__global__ void top_k_gpu_kernel3_2(DATATYPE* input, int blocks, int stride, int k, DATATYPE* output)
{
    extern __shared__ DATATYPE shared_buffer[];
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    DATATYPE *myPoint = shared_buffer + localThreadId*k;
    int i, index;

    for(i=0; i<k; i++)
        myPoint[i] = NEG_INF;
    // each thread folds several block lists into its own list
    for(index=localThreadId; index<blocks; index+=localThreadNum)
        mergeTwoK(myPoint, input + index*stride, k);
    __syncthreads();

    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK(myPoint, myPoint+i*k, k);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        DATATYPE *outputPoint = output + blockIdx.x*localThreadNum*stride;
        for(i=0; i<k; i++)
            outputPoint[i] = shared_buffer[i];
    }
}

// kernel3_2_orig: global-memory fallback of kernel3_2; merges the per-block
// lists in place inside input, stride elements apart
__global__ void top_k_gpu_kernel3_2_orig(DATATYPE* input, int blocks, int stride, int k, DATATYPE* output)
{
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    DATATYPE *myPoint = input + localThreadId*stride;
    int i, index;

    // each thread folds several block lists into its own list
    for(index=localThreadId+localThreadNum; index<blocks; index+=localThreadNum)
        mergeTwoK(myPoint, input + index*stride, k);
    __syncthreads();

    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK(myPoint, myPoint+i*stride, k);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        DATATYPE *outputPoint = output + blockIdx.x*localThreadNum*stride;
        for(i=0; i<k; i++)
            outputPoint[i] = myPoint[i];
    }
}
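// ---------------------------------------------------------------------------
// The *_stride helpers below are variants of replace_smaller/mergeTwoK for
// k-lists whose elements sit `stride` apart instead of contiguously. The
// kernel4_* pair uses them to interleave the per-thread lists in shared
// memory (element j of thread t at shared_buffer[j*blockDim.x + t]),
// presumably so that neighbouring threads hit different shared-memory banks;
// the README attributes the Top-8/Top-16 slowdowns to bank conflicts.
// ---------------------------------------------------------------------------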
// insert data into a descending list whose elements are stride apart
__device__ inline void replace_smaller_stride(DATATYPE* array, int k, DATATYPE data, int stride)
{
    DATATYPE *prev, *next;
    if(data < array[(k-1)*stride])
        return;
    prev = array + (k-1)*stride;
    for(int j=k-2; j>=0; j--)
    {
        next = prev;
        prev = prev - stride;
        if(data > *prev)
            *next = *prev;
        else{
            *next = data;
            return;
        }
    }
    *array = data;
}

__device__ inline void mergeTwoK_stride(DATATYPE* left, DATATYPE* right, int k, int stride)
{
    int i;
    DATATYPE* current = right;
    for(i=0; i<k; i++)
    {
        replace_smaller_stride(left, k, *current, stride);
        current = current + stride;
    }
}


// kernel4_1: like kernel3_1, but the per-thread lists in shared memory are
// interleaved (element j of thread t at shared_buffer[j*blockDim.x + t])
__global__ void top_k_gpu_kernel4_1(DATATYPE* input, int length, int k, DATATYPE* output)
{
    extern __shared__ DATATYPE shared_buffer[];
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    int globalThreadId = blockIdx.x*localThreadNum + localThreadId;
    int globalThreadNum = gridDim.x*localThreadNum;
    int stride = localThreadNum;
    DATATYPE *myPoint = shared_buffer + localThreadId;
    DATATYPE *current;
    int i, index;

    for(i=0; i<k; i++)
        myPoint[i*stride] = NEG_INF;
    for(index=globalThreadId; index<length; index+=globalThreadNum)
        replace_smaller_stride(myPoint, k, input[index], stride);
    __syncthreads();

#if 1
    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK_stride(myPoint, myPoint+i, k, stride);
        }
        __syncthreads();
    }
#endif

    if(threadIdx.x == 0)
    {
        current = myPoint;
        // produce k data in descending order
        DATATYPE *outputPoint = input + blockIdx.x*localThreadNum*k;
        for(i=0; i<k; i++)
        {
            outputPoint[i] = *current;
            current = current + stride;
        }
    }
}

// kernel4_2: single block; merges the per-block lists that kernel4_1 left in
// input (one contiguous list every skip_stride elements) using the same
// interleaved shared-memory layout as kernel4_1
__global__ void top_k_gpu_kernel4_2(DATATYPE* input, int blocks, int skip_stride, int k, DATATYPE* output)
{
    extern __shared__ DATATYPE shared_buffer[];
    int localThreadId = threadIdx.x;
    int localThreadNum = blockDim.x;
    int stride = localThreadNum;
    DATATYPE *myPoint = shared_buffer + localThreadId;
    int i, index;

    for(i=0; i<k; i++)
        myPoint[i*stride] = NEG_INF;
    // each thread folds several block lists into its own interleaved list
    for(index=localThreadId; index<blocks; index+=localThreadNum)
    {
        DATATYPE *blockList = input + index*skip_stride;
        for(i=0; i<k; i++)
            replace_smaller_stride(myPoint, k, blockList[i], stride);
    }
    __syncthreads();

    for(i=localThreadNum>>1; i>0; i>>=1) {
        if(localThreadId < i)
        {
            mergeTwoK_stride(myPoint, myPoint+i, k, stride);
        }
        __syncthreads();
    }

    if(threadIdx.x == 0)
    {
        // produce k data in descending order
        DATATYPE *outputPoint = output + blockIdx.x*localThreadNum*skip_stride;
        DATATYPE *current = myPoint;
        for(i=0; i<k; i++)
        {
            outputPoint[i] = *current;
            current = current + stride;
        }
    }
}
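// ---------------------------------------------------------------------------
// Kernel selection below (the #if blocks pick one strategy at compile time):
//   kernel1          - single-thread baseline
//   kernel2          - one block, shared-memory lists
//   kernel3_1 + 3_2  - two passes: per-block top-k, then a final merge pass
//   kernel4_1 + 4_2  - like 3_*, with the interleaved shared-memory layout
// Shared memory per block costs sizeof(DATATYPE)*k*threads; with 48KB of
// shared memory, float data and 128 threads this caps k at 48*1024/4/128 = 96,
// hence the *_orig global-memory fallbacks.
// ---------------------------------------------------------------------------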
void top_k_gpu(DATATYPE* input, int length, int k, DATATYPE* output)
{
#if 0
    // single-thread baseline
    top_k_gpu_kernel1<<<1, 1>>>(input, length, k, output);
    cudaError_t err = cudaGetLastError();
    HANDLE_CUDA_ERROR(err);
#endif

#if 0
    // k < 12
    int blocks = 1;
    int threads = (GPU_THREADS < length/(4*k)*2) ? GPU_THREADS : (length/(4*k)*2);
    int shared_mem_usage = sizeof(DATATYPE)*k*threads;
    top_k_gpu_kernel2<<<blocks, threads, shared_mem_usage>>>(input, length, k, output);
    cudaError_t err = cudaGetLastError();
    HANDLE_CUDA_ERROR(err);
#endif

#if 1
    // each thread handles at least 2k elements
    int blocks_opt, thread_opt;
    if(k < 20)
    {
        blocks_opt = GPU_BLOCKS_THRESHOLD;
        thread_opt = GPU_THREADS;
    }
    else{
        blocks_opt = 16;
        thread_opt = 64;
    }
    int threads = (thread_opt < length/(4*k)*2) ? thread_opt : (length/(4*k)*2);
    int stride = threads * k;
    int blocks = (blocks_opt < length / (threads*2*k)) ? blocks_opt : (length / (threads*2*k));
    int shared_mem_usage = sizeof(DATATYPE)*k*threads;
    //printf("shared mem usage: (%d %d) %d(%d)\n", blocks, threads, shared_mem_usage, GPU_SHARED_MEM_THRESHOLD);
    if(shared_mem_usage < GPU_SHARED_MEM_THRESHOLD)
        top_k_gpu_kernel3_1<<<blocks, threads, shared_mem_usage>>>(input, length, k, output);
    else
        top_k_gpu_kernel3_1_orig<<<blocks, threads>>>(input, length, k, output);
    threads = (thread_opt < blocks / 2) ? thread_opt : (blocks / 2);
    shared_mem_usage = sizeof(DATATYPE)*k*threads;
    //printf("shared mem usage: (%d %d) %d(%d)\n", 1, threads, shared_mem_usage, GPU_SHARED_MEM_THRESHOLD);
    if(shared_mem_usage < GPU_SHARED_MEM_THRESHOLD)
        top_k_gpu_kernel3_2<<<1, threads, shared_mem_usage>>>(input, blocks, stride, k, output);
    else
        top_k_gpu_kernel3_2_orig<<<1, threads>>>(input, blocks, stride, k, output);
    cudaError_t err = cudaGetLastError();
    HANDLE_CUDA_ERROR(err);
#endif

#if 0
    // k < 12
    int threads = (GPU_THREADS < length/(4*k)*2) ? GPU_THREADS : (length/(4*k)*2);
    int stride = threads * k;
    int blocks = (GPU_BLOCKS_THRESHOLD < length / (threads*2*k)) ? GPU_BLOCKS_THRESHOLD : (length / (threads*2*k));
    int shared_mem_usage = sizeof(DATATYPE)*k*threads;
    if(shared_mem_usage < GPU_SHARED_MEM_THRESHOLD)
        top_k_gpu_kernel4_1<<<blocks, threads, shared_mem_usage>>>(input, length, k, output);
    else
        printf("%d %d %d\n", blocks, threads, shared_mem_usage);
    threads = (GPU_THREADS < blocks / 2) ? GPU_THREADS : (blocks / 2);
    shared_mem_usage = sizeof(DATATYPE)*k*threads;
    if(shared_mem_usage < GPU_SHARED_MEM_THRESHOLD)
        top_k_gpu_kernel4_2<<<1, threads, shared_mem_usage>>>(input, blocks, stride, k, output);
    else
        printf("%d %d %d\n", blocks, threads, shared_mem_usage);
    cudaError_t err = cudaGetLastError();
    HANDLE_CUDA_ERROR(err);
#endif
}
#endif
--------------------------------------------------------------------------------
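To check the GPU path against the CPU reference end to end, a small driver can sit next to main.cpp and reuse the repo's helpers (mallocCPUMem, cpu2gpu, and friends). The file name check_gpu.cu, the random fill, and the loop bounds below are our assumptions, not part of the repo:

```cpp
// check_gpu.cu (hypothetical, not part of the repo): compare top_k_gpu
// against top_k_cpu_serial on random data for k = 1..20.
#include <stdio.h>
#include <stdlib.h>
#include "util.h"
#include "top_k.h"
#include "macro.h"

int main()
{
    const int n = 1<<20;
    DATATYPE *data = (DATATYPE*)mallocCPUMem(sizeof(DATATYPE)*n);
    for(int i = 0; i < n; i++)
        data[i] = (DATATYPE)(rand() % 100000);

    for(int k = 1; k <= 20; k++)
    {
        DATATYPE *ref = (DATATYPE*)mallocCPUMem(sizeof(DATATYPE)*k);
        DATATYPE *got = (DATATYPE*)mallocCPUMem(sizeof(DATATYPE)*k);
        top_k_cpu_serial(data, n, k, ref);

        DATATYPE *d_in  = (DATATYPE*)mallocGPUMem(sizeof(DATATYPE)*n);
        DATATYPE *d_out = (DATATYPE*)mallocGPUMem(sizeof(DATATYPE)*k);
        cpu2gpu(data, d_in, sizeof(DATATYPE)*n);  // kernels modify input, so recopy per k
        top_k_gpu(d_in, n, k, d_out);
        gpu2cpu(d_out, got, sizeof(DATATYPE)*k);

        int ok = 1;
        for(int i = 0; i < k; i++)
            if(ref[i] != got[i]) ok = 0;
        printf("k=%d %s\n", k, ok ? "OK" : "MISMATCH");

        freeGPUMem(d_in); freeGPUMem(d_out);
        freeCPUMem(ref); freeCPUMem(got);
    }
    freeCPUMem(data);
    return 0;
}
```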