Directory structure:

├── .gitignore
├── README.md
├── RLE-raw-cuda
│   ├── rle
│   │   ├── __init__.py
│   │   └── rle.py
│   ├── rle_cuda.cpp
│   ├── rle_cuda_kernel.cu
│   ├── setup.py
│   └── test.py
└── RLE-thrust
    ├── rle
    │   └── __init__.py
    ├── rle_cuda.cpp
    ├── rle_cuda_kernel.cu
    └── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch Run-Length Encoding

This repo contains two implementations:

- RLE-raw-cuda: written with hand-rolled CUDA kernels
- RLE-thrust: written with the higher-level `thrust` library

--------------------------------------------------------------------------------
/RLE-raw-cuda/rle/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lecoan/pytorch-RLE/32013a050b06070499a5850659c3bb10ac3f3210/RLE-raw-cuda/rle/__init__.py

--------------------------------------------------------------------------------
/RLE-raw-cuda/rle/rle.py:
--------------------------------------------------------------------------------
import torch

import rle_cuda

torch.manual_seed(42)


class RLEFunction(object):
    @staticmethod
    def encode(input, input_int):
        countsOut, symbolsOut = rle_cuda.encode(input, input_int)
        return torch.squeeze(countsOut), torch.squeeze(symbolsOut)

    @staticmethod
    def decode(input):
        countsOut, symbolsOut = input
        # The trailing entry of countsOut stores the total decoded length.
        length = int(countsOut[0, -1].item())
        result = torch.zeros((1, length), dtype=torch.float32,
                             device=symbolsOut.device)
        return rle_cuda.decode(countsOut, symbolsOut, result)


class RLE(object):
    @staticmethod
    def encode(tensor, mask):
        tensor = tensor.view(1, -1)
        mask = mask.view(1, -1).int()
        counts, symbols = RLEFunction.encode(tensor, mask)
        # Pad symbols to the length of counts so the two rows can be stacked.
        # The pad must live on the same device as symbols, or torch.cat fails
        # for CUDA tensors.
        symbols = torch.cat((symbols, torch.ones(1, device=symbols.device)))
        # Stack in floating point: casting the float symbols to the integer
        # counts dtype would truncate them and make decoding lossy.
        counts = counts.type_as(symbols)
        return torch.stack((counts, symbols))

    @staticmethod
    def decode(tensor, size):
        # Restore the dtypes the kernels expect: integer counts, float symbols.
        counts = tensor[0].view(1, -1).int()
        symbols = tensor[1].narrow(0, 0, tensor[1].numel() - 1).view(1, -1)
        output = RLEFunction.decode((counts, symbols))
        return output.reshape(size)
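For orientation, a minimal round trip with this module might look as follows. This is a sketch rather than part of the repo: it assumes the extension has been built via the setup.py below, that a CUDA device is available, and that rle/__init__.py (fetched from the URL above) re-exports RLE from rle.py.

import torch
import rle

tensor = torch.tensor([[0.0, 0.2], [0.1, 0.0]]).cuda()
mask = (tensor != 0).int()             # keep exactly the non-zero entries

packed = rle.RLE.encode(tensor, mask)  # stacked (counts, symbols) rows
restored = rle.RLE.decode(packed, tensor.size())
assert torch.allclose(tensor, restored)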
--------------------------------------------------------------------------------
/RLE-raw-cuda/rle_cuda.cpp:
--------------------------------------------------------------------------------
#include <torch/extension.h>

#include <vector>

#include <iostream>

// CUDA encode declarations

std::vector<at::Tensor> rle_cuda_encode(
    at::Tensor input, at::Tensor input_int);

at::Tensor rle_cuda_decode(
    at::Tensor countsOut,
    at::Tensor symbolsOut,
    at::Tensor result);

// C++ interface

// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> rle_encode(at::Tensor input, at::Tensor input_int) {
    CHECK_INPUT(input);
    //CHECK_INPUT(input_int);
    return rle_cuda_encode(input, input_int);
}

at::Tensor rle_decode(
    at::Tensor countsOut,
    at::Tensor symbolsOut,
    at::Tensor result) {
    CHECK_INPUT(countsOut);
    CHECK_INPUT(symbolsOut);
    CHECK_INPUT(result);

    return rle_cuda_decode(
        countsOut,
        symbolsOut,
        result);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("encode", &rle_encode, "RLE encode (CUDA)");
    m.def("decode", &rle_decode, "RLE decode (CUDA)");
}
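The kernels in the next file implement a zero-suppression flavor of RLE rather than classic run-length coding: counts[i] is the number of zeros immediately before the i-th surviving value, the final counts entry stores the total element count, and symbols holds the surviving values themselves. A CPU reference of that contract can be useful for testing the kernels; this is a sketch, and the helper name is ours, not part of the extension.

import torch

def rle_encode_reference(values, mask):
    # Zero-run counts plus kept values, matching the CUDA encode output.
    values = values.reshape(-1)
    positions = (mask.reshape(-1) != 0).nonzero(as_tuple=False).reshape(-1)
    prev = torch.cat((positions.new_tensor([-1]), positions[:-1]))
    gaps = positions - prev - 1              # zeros before each kept value
    counts = torch.cat((gaps, gaps.new_tensor([values.numel()])))
    return counts, values[positions]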
--------------------------------------------------------------------------------
/RLE-raw-cuda/rle_cuda_kernel.cu:
--------------------------------------------------------------------------------
#include <ATen/ATen.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>

#include <iostream>
#include <cstdio>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>
#include <thrust/device_vector.h>

// Check a CUDA return code and abort with a location message on failure.
#define CUDA_CHECK(call)                                                \
    do {                                                                \
        cudaError_t err__ = (call);                                     \
        if (err__ != cudaSuccess) {                                     \
            fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",  \
                    __FILE__, __LINE__, cudaGetErrorString(err__));     \
            exit(-1);                                                   \
        }                                                               \
    } while (0)

// TODO: currently unused. Quantizes values near zero to 0, everything else to 2.
template <typename scalar_t>
__global__ void tempKernel(
    const scalar_t* __restrict__ g_in,
    scalar_t* __restrict__ g_temp,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        if (g_in[i] - 0.0 < 0.0001 && g_in[i] - 0.0 > -0.0001) {
            g_temp[i] = 0;
        } else {
            g_temp[i] = 2;
        }
    }
}

// Writes a 0/1 mask: g_decodeMask[i] = 1 where the input is non-zero.
template <typename scalar_t>
__global__ void maskKernel(
    const scalar_t* __restrict__ g_in,
    int* __restrict__ g_decodeMask,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        if (g_in[i] == 0) {
            g_decodeMask[i] = 0;
        } else {
            g_decodeMask[i] = 1;
        }
    }
}

// TODO: currently unused; superseded by thrust::inclusive_scan below. This
// single-block up-/down-sweep scan also lacks the __syncthreads() calls it
// would need between phases, so it is not correct as written.
__global__ void prefixsumKernel(
    const int* __restrict__ X,
    int* __restrict__ XY,
    int* __restrict__ Y,
    size_t InputSize) {
    auto BLOCK_SIZE = 32 * ((InputSize + 32) / 32);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize) { XY[threadIdx.x] = X[i]; }

    for (int stride = 1; stride <= BLOCK_SIZE; stride *= 2) {
        int index = (threadIdx.x + 1) * stride * 2 - 1;
        if (index < 2 * BLOCK_SIZE)
            XY[index] += XY[index - stride]; // index is always bigger than stride
    }
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride /= 2) {
        int index2 = (threadIdx.x + 1) * stride * 2 - 1;
        if (index2 < 2 * BLOCK_SIZE)
            XY[index2 + stride] += XY[index2];
    }
    if (i < InputSize) Y[i] = XY[threadIdx.x];
}


// Given the inclusive scan of the 0/1 mask, writes the index of each non-zero
// element into g_compactedBackwardMask (slot r holds the index of the r-th
// non-zero), appends a sentinel n, and stores the number of non-zeros in
// g_totalRuns.
__global__ void compactKernel(int* __restrict__ g_scannedBackwardMask,
    int* g_compactedBackwardMask,
    int* g_totalRuns,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        if (i == 0) {
            if (g_scannedBackwardMask[0] == 1) {
                g_compactedBackwardMask[0] = 0;
            }
        }
        else if (g_scannedBackwardMask[i] != g_scannedBackwardMask[i - 1]) {
            // Position i is the (g_scannedBackwardMask[i])-th non-zero element.
            g_compactedBackwardMask[g_scannedBackwardMask[i] - 1] = i;
        }
        // Sentinel and total count. Every iteration writes the same values,
        // so the redundancy is benign.
        g_compactedBackwardMask[g_scannedBackwardMask[n - 1]] = n;
        g_totalRuns[0] = g_scannedBackwardMask[n - 1];
    }
}
// counts[0] = number of zeros before the first non-zero; counts[i] = zeros
// between non-zero i-1 and non-zero i; counts[totalRuns] = total length n.
template <typename scalar_t>
__global__ void scatterKernel(
    int* g_compactedBackwardMask,
    int* g_totalRuns,
    scalar_t* __restrict__ g_countsOut) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    int n = *g_totalRuns;
    for (int i = index; i < n; i += stride) {
        if (i == 0)
            g_countsOut[i] = g_compactedBackwardMask[i];
        else
            g_countsOut[i] = g_compactedBackwardMask[i] - g_compactedBackwardMask[i - 1] - 1;
    }
    // Redundantly written by every thread: the trailing total-length entry.
    g_countsOut[n] = g_compactedBackwardMask[n];
}

// Copies the value of each non-zero element into the symbols output.
template <typename scalar_t>
__global__ void recordKernel(
    int* g_compactedBackwardMask,
    int* g_totalRuns,
    scalar_t* __restrict__ g_in,
    scalar_t* __restrict__ g_symbolsOut) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    int n = *g_totalRuns;

    for (int i = index; i < n; i += stride) {
        if (g_compactedBackwardMask[i] != -1) {
            g_symbolsOut[i] = g_in[g_compactedBackwardMask[i]];
        }
    }
}

// TODO: currently unused alternative entry point (kept for reference).
std::vector<at::Tensor> rle_cuda_encode_2(at::Tensor input, at::Tensor input_int) {
    const auto n = input.size(1);
    const int threads = 512;
    int blocks = (n + threads - 1) / threads;
    if (blocks > 65535)
        blocks = 65535;
    int *compactedBackwardMask;
    auto g_countsOut = at::ones({1, n}, input_int.type()).to(at::kCUDA);
    auto g_symbolsOut = at::ones({1, n}, input.type()).to(at::kCUDA);

    if (0 != cudaMalloc(&compactedBackwardMask, n * sizeof(int)))
        std::cout << __LINE__ << " cudaMalloc error " << std::endl;
    cudaFree(compactedBackwardMask);
    return {g_countsOut, g_symbolsOut};
}

std::vector<at::Tensor> rle_cuda_encode(at::Tensor input, at::Tensor input_int) {
    int device;
    cudaGetDevice(&device);
    const auto n = input.size(1);

    const int threads = 512;
    int blocks = (n + threads - 1) / threads;
    if (blocks > 65535)
        blocks = 65535;

    int *decodeMask, *scannedBackwardMask;
    if (0 != cudaMalloc(&decodeMask, n * sizeof(int)))
        std::cout << __LINE__ << " cudaMalloc error " << std::endl;
    if (0 != cudaMalloc(&scannedBackwardMask, n * sizeof(int)))
        std::cout << __LINE__ << " cudaMalloc error " << std::endl;

    // Step 1: build the 0/1 mask of positions to keep.
    AT_DISPATCH_INTEGRAL_TYPES(input_int.type(), "rle_encode_cuda", ([&] {
        maskKernel<scalar_t><<<blocks, threads>>>(
            input_int.data<scalar_t>(),
            decodeMask,
            n);
    }));
    CUDA_CHECK(cudaDeviceSynchronize());

    // Step 2: inclusive scan; scannedBackwardMask[i] counts non-zeros in [0..i].
    thrust::inclusive_scan(thrust::device, decodeMask, decodeMask + n, scannedBackwardMask);
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaFree(decodeMask));

    int *totalRuns, *compactedBackwardMask;
    CUDA_CHECK(cudaMalloc(&compactedBackwardMask, (n + 1) * sizeof(int)));
    CUDA_CHECK(cudaMallocManaged(&totalRuns, sizeof(int)));

    // Step 3: collect the index of every non-zero element plus the sentinel n.
    compactKernel<<<blocks, threads>>>(scannedBackwardMask, compactedBackwardMask, totalRuns, n);
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaFree(scannedBackwardMask));

    int k = totalRuns[0] + 1;
    auto g_countsOut = at::ones({1, k}, input_int.type()).to(at::kCUDA);

    // Step 4: turn the indices into zero-run counts (plus the total length).
    AT_DISPATCH_INTEGRAL_TYPES(input_int.type(), "rle_encode_cuda", ([&] {
        scatterKernel<scalar_t><<<blocks, threads>>>(
            compactedBackwardMask,
            totalRuns,
            g_countsOut.data<scalar_t>());
    }));
    CUDA_CHECK(cudaDeviceSynchronize());

    auto g_symbolsOut = at::ones({1, *totalRuns}, input.type()).to(at::kCUDA);
    // Step 5: gather the surviving values.
    AT_DISPATCH_FLOATING_TYPES(input.type(), "rle_encode_cuda", ([&] {
        recordKernel<scalar_t><<<blocks, threads>>>(
            compactedBackwardMask,
            totalRuns,
            input.data<scalar_t>(),
            g_symbolsOut.data<scalar_t>());
    }));
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaFree(compactedBackwardMask));
    CUDA_CHECK(cudaFree(totalRuns));
    CUDA_CHECK(cudaDeviceSynchronize());
    return {g_countsOut, g_symbolsOut};
}
template <typename scalar_t>
__global__ void sumzeroKernel(
    scalar_t* __restrict__ g_countsOut,
    int* result,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // Copy the zero-run counts into an int workspace for the scan.
    for (int i = index; i < n; i += stride) {
        result[i] = g_countsOut[i];
    }
}

__global__ void sumindexKernel(
    int* result,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    // After the inclusive scan, result[i] is the number of zeros before the
    // i-th symbol; adding i gives the symbol's absolute output position.
    for (int i = index; i < n; i += stride) {
        result[i] += i;
    }
}

template <typename scalar_t>
__global__ void decodeKernel(
    int* temp,
    scalar_t* __restrict__ g_symbolsOut,
    scalar_t* __restrict__ g_output,
    size_t n) {
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    // Scatter each symbol to its position; g_output is pre-zeroed by the caller.
    for (int i = index; i < n; i += stride) {
        g_output[temp[i]] = g_symbolsOut[i];
    }
}


at::Tensor rle_cuda_decode(at::Tensor countsOut, at::Tensor symbolsOut, at::Tensor result) {
    const auto n = symbolsOut.size(1);
    const int threads = 256;
    int blocks = (n + threads - 1) / threads;
    if (blocks > 65535)
        blocks = 65535;
    int *temp;
    if (0 != cudaMalloc(&temp, n * sizeof(int)))
        std::cout << __LINE__ << " malloc failed" << std::endl;

    AT_DISPATCH_INTEGRAL_TYPES(countsOut.type(), "rle_decode_cuda", ([&] {
        sumzeroKernel<scalar_t><<<blocks, threads>>>(
            countsOut.data<scalar_t>(),
            temp,
            n);
    }));
    CUDA_CHECK(cudaDeviceSynchronize());

    thrust::inclusive_scan(thrust::device, temp, temp + n, temp);
    CUDA_CHECK(cudaDeviceSynchronize());

    sumindexKernel<<<blocks, threads>>>(temp, n);
    CUDA_CHECK(cudaDeviceSynchronize());

    auto totalSize = symbolsOut.size(1);
    AT_DISPATCH_FLOATING_TYPES(symbolsOut.type(), "rle_decode_cuda", ([&] {
        decodeKernel<scalar_t><<<blocks, threads>>>(
            temp,
            symbolsOut.data<scalar_t>(),
            result.data<scalar_t>(),
            totalSize);
    }));
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaFree(temp));
    CUDA_CHECK(cudaDeviceSynchronize());
    return result;
}
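Decode inverts the encoding with one scan: the i-th symbol lands at cumsum(counts)[i] + i, which is exactly what sumzeroKernel, the thrust inclusive scan, and sumindexKernel compute before decodeKernel scatters. The same arithmetic on the CPU, as a sketch (the function name is ours, not part of the extension):

import torch

def rle_decode_reference(counts, symbols):
    # Mirror of sumzeroKernel + inclusive scan + sumindexKernel + decodeKernel.
    total = int(counts[-1])                 # trailing entry: decoded length
    gaps = counts[:-1].long()
    positions = torch.cumsum(gaps, dim=0) + torch.arange(gaps.numel())
    out = torch.zeros(total, dtype=symbols.dtype)
    out[positions] = symbols                # scatter; output starts all-zero
    return out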
--------------------------------------------------------------------------------
/RLE-raw-cuda/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='rle',
    description="a package for compressing sparse tensors",
    packages=["rle"],
    package_data={"rle": ["rle.py"]},
    ext_modules=[
        CUDAExtension('rle_cuda', [
            'rle_cuda.cpp',
            'rle_cuda_kernel.cu',
        ]),
    ],
    cmdclass={
        'build_ext': BuildExtension
    })

--------------------------------------------------------------------------------
/RLE-raw-cuda/test.py:
--------------------------------------------------------------------------------
import torch, rle


def roundtrip(codec, tensor, mask):
    # Encode one tensor with its mask, then decode and report the shape.
    encoded = codec.encode(tensor, mask)
    print(encoded)
    decoded = codec.decode(encoded, tensor.size())
    print("decode result={},size={}".format(decoded, decoded.size()))


def main():
    codec = rle.RLE()
    tensor1 = torch.tensor([[0.0, 0.2], [0.1, 0.0], [0.0, 0.0], [0.6, 0.0]]).cuda()
    tensor2 = torch.randn(307200, 1).cuda()
    mask1 = torch.tensor([[0, 1], [1, 0], [0, 0], [1, 0]]).cuda()
    mask2 = torch.ones_like(tensor2)
    # RLE.encode takes one tensor and one mask at a time, so iterate;
    # run the round trip twice, as the original test did.
    for _ in range(2):
        for tensor, mask in [(tensor1, mask1), (tensor2, mask2)]:
            print(tensor)
            roundtrip(codec, tensor, mask)


def main2():
    # Encode-only variant.
    codec = rle.RLE()
    tensor = torch.tensor([[0.0, 0.2], [0.1, 0.0], [0.0, 0.0], [0.6, 0.0]]).cuda()
    mask = torch.tensor([[0, 1], [1, 0], [0, 0], [1, 0]]).cuda()
    print(tensor)
    print(codec.encode(tensor, mask))


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/RLE-thrust/rle/__init__.py:
--------------------------------------------------------------------------------
import torch
import rle_cuda


def encode(tensor, lens):
    # `lens` sizes the pre-allocated output buffers; it must be at least the
    # number of runs (tensor.numel() is always safe). Despite the names,
    # countsOut holds the run values and symbolsOut the run lengths, matching
    # the {output, lengths} pair returned by the C++ side.
    countsOut, symbolsOut = rle_cuda.encode(tensor, lens)
    return countsOut, symbolsOut


def decode(countsOut, symbolsOut, grads):
    # Expands the runs into the pre-allocated `grads` tensor. Note that the
    # run-length buffer is overwritten with its inclusive scan in the process.
    rle_cuda.decode(countsOut, symbolsOut, grads)
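A sketch of how these wrappers might be used (our example, assuming a CUDA build of the extension): the output buffer must be pre-sized to the decoded length, and decode overwrites the run-length buffer with its inclusive scan.

import torch
import rle

data = torch.tensor([3.0, 3.0, 3.0, 1.0, 1.0, 2.0]).cuda()
values, run_lens = rle.encode(data, data.numel())  # worst case: one run per element
grads = torch.zeros_like(data)
rle.decode(values, run_lens, grads)                # expands the runs into grads
assert (grads == data).all()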
--------------------------------------------------------------------------------
/RLE-thrust/rle_cuda.cpp:
--------------------------------------------------------------------------------
#include <torch/extension.h>

#include <vector>

// CUDA encode declarations
std::vector<torch::Tensor> rle_cuda_encode(torch::Tensor& input, int len);

int rle_cuda_decode(
    torch::Tensor& countsOut,
    torch::Tensor& symbolsOut,
    torch::Tensor& result);

#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// With k distinct runs, returns two lists:
// the run values (floats) of length k,
// and the run lengths of length k (both padded to `len`).
std::vector<torch::Tensor> rle_encode(torch::Tensor input, int len) {
    CHECK_INPUT(input);
    return rle_cuda_encode(input, len);
}

// The decoded output is written into `result`.
int rle_decode(
    torch::Tensor& countsOut,
    torch::Tensor& symbolsOut,
    torch::Tensor& result) {
    CHECK_INPUT(countsOut);
    CHECK_INPUT(symbolsOut);
    CHECK_INPUT(result);

    return rle_cuda_decode(
        countsOut,
        symbolsOut,
        result);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("encode", &rle_encode, "RLE encode (CUDA)");
    m.def("decode", &rle_decode, "RLE decode (CUDA)");
}
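thrust::reduce_by_key with a constant value sequence of 1 is the classic run-length trick: consecutive equal keys collapse into one output key plus a count. torch.unique_consecutive expresses the same contract and makes a convenient CPU cross-check; this snippet is a sketch, not part of the repo.

import torch

data = torch.tensor([3.0, 3.0, 3.0, 1.0, 1.0, 2.0])
values, counts = torch.unique_consecutive(data, return_counts=True)
# values -> [3., 1., 2.]; counts -> [3, 2, 1]
assert (values.repeat_interleave(counts) == data).all()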
--------------------------------------------------------------------------------
/RLE-thrust/rle_cuda_kernel.cu:
--------------------------------------------------------------------------------
#include <torch/extension.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <vector>

#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/binary_search.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>


std::vector<torch::Tensor> rle_cuda_encode(torch::Tensor& input, int len) {
    const size_t N = input.numel();

    // allocate storage for output data and run lengths
    torch::Tensor output = torch::zeros(len, torch::device(torch::kCUDA).dtype(input.dtype()));
    torch::Tensor lengths = torch::zeros(len, torch::dtype(torch::kInt32).device(torch::kCUDA));

    // compute run lengths: consecutive equal keys collapse into one output
    // key, and the constant 1s sum up to that run's length
    auto len_ptr = (int32_t *)lengths.data_ptr();
    AT_DISPATCH_FLOATING_TYPES(input.type(), "reduce_by_key", (
        [&] {
            auto input_ptr = (scalar_t*)input.data_ptr();
            auto output_ptr = (scalar_t*)output.data_ptr();

            thrust::reduce_by_key(
                thrust::device,
                input_ptr, input_ptr + N,             // input key sequence
                thrust::constant_iterator<int>(1),    // input value sequence
                output_ptr,                           // output key sequence
                len_ptr                               // output value sequence
            );
        }
    ));
    return {output, lengths};
}

int rle_cuda_decode(torch::Tensor& input, torch::Tensor& lengths, torch::Tensor& output) {
    const size_t len = input.numel();
    // scan the lengths (in place)
    auto len_ptr = (int32_t*) lengths.data_ptr();
    thrust::inclusive_scan(thrust::device, len_ptr, len_ptr + len, len_ptr);

    // output size is the sum of the run lengths
    int N = output.numel();

    // compute the input index for each output element: position j belongs to
    // the first run whose cumulative length reaches j+1
    thrust::device_vector<int> indices(N);
    thrust::lower_bound(thrust::device, len_ptr, len_ptr + len,
                        thrust::counting_iterator<int>(1),
                        thrust::counting_iterator<int>(N + 1),
                        indices.begin());

    // gather input elements
    AT_DISPATCH_FLOATING_TYPES(output.type(), "gather", (
        [&] {
            auto input_ptr = (scalar_t*)input.data_ptr();
            auto output_ptr = (scalar_t*)output.data_ptr();
            thrust::gather(thrust::device, indices.begin(), indices.end(),
                           input_ptr, output_ptr);
        }
    ));
    return 0;
}

--------------------------------------------------------------------------------
/RLE-thrust/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup
from torch.utils import cpp_extension

setup(
    name='rle',
    description="a package for compressing sparse tensors",
    packages=["rle"],
    package_data={"rle": ["__init__.py"]},
    ext_modules=[
        cpp_extension.CUDAExtension('rle_cuda', [
            'rle_cuda.cpp',
            'rle_cuda_kernel.cu',
        ]),
    ],
    cmdclass={
        'build_ext': cpp_extension.BuildExtension
    })
--------------------------------------------------------------------------------
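Either variant builds as a standard PyTorch C++/CUDA extension with `python setup.py install` run inside the respective directory. A hypothetical smoke test for the raw-CUDA binding (our example; assumes a CUDA-capable device):

import torch
import rle_cuda

x = torch.tensor([[0.0, 0.0, 5.0, 0.0]], device="cuda")
m = (x != 0).int()
counts, symbols = rle_cuda.encode(x, m)
print(counts)   # expected: [[2, 4]], two zeros before 5.0, total length 4
print(symbols)  # expected: [[5.0]]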