├── include
│   ├── mathutil_cuda.h
│   └── mathutil_cuda_kernel.h
├── src
│   ├── mathutil_cuda.c
│   └── mathutil_cuda_kernel.cu
├── __init__.py
├── Makefile
└── README.md

/include/mathutil_cuda.h:
--------------------------------------------------------------------------------
int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y);
--------------------------------------------------------------------------------
/src/mathutil_cuda.c:
--------------------------------------------------------------------------------
#include <THC/THC.h>
#include "mathutil_cuda_kernel.h"

// THCState is created and managed by the pytorch runtime; the FFI build
// exposes it to this extension as a global symbol.
extern THCState *state;

int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y)
{
    // Raw device pointers backing the pytorch CUDA tensors.
    float *a = THCudaTensor_data(state, a_tensor);
    float *b = THCudaTensor_data(state, b_tensor);
    cudaStream_t stream = THCState_getCurrentStream(state);

    broadcast_sum_cuda(a, b, x, y, stream);

    return 1;
}
--------------------------------------------------------------------------------
/include/mathutil_cuda_kernel.h:
--------------------------------------------------------------------------------
#ifndef _MATHUTIL_CUDA_KERNEL
#define _MATHUTIL_CUDA_KERNEL

// Linear offset of element (i, j) in a row-major matrix with dj columns, and
// of element (i, j, k) in a row-major 3-D array with trailing dims (dj, dk).
#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))

#define BLOCK 512
#define MAX_STREAMS 512

#ifdef __cplusplus
extern "C" {
#endif

void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream);

#ifdef __cplusplus
}
#endif

#endif
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
import numpy as np
import unittest

import torch
from torch.autograd import Function, Variable
import mathutils


class BroadcastAccum(Function):
    """Accumulate x += y using the broadcasting sum kernel."""

    def forward(self, x, y):
        mathutils.broadcast_sum(x, y, *map(int, x.size()))
        return x


class TestBroadcastAccum(unittest.TestCase):

    def test_broadcast_accum(self):
        N, M = 3, 5
        x = torch.rand(N, M).cuda()
        y = torch.rand(N, 1).cuda()

        x_np = x.cpu().numpy()
        y_np = y.cpu().numpy()

        x_np += y_np

        x = BroadcastAccum()(Variable(x), Variable(y))
        self.assertTrue(np.allclose(x_np, x.data.cpu().numpy()))


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Unix commands.
PYTHON := python
NVCC_COMPILE := nvcc -c -o
RM_RF := rm -rf

# Library compilation rules.
NVCC_FLAGS := -x cu -Xcompiler -fPIC -shared

# File structure.
BUILD_DIR := build
INCLUDE_DIRS := include
TORCH_FFI_BUILD := build_ffi.py
MATHUTIL_KERNEL := $(BUILD_DIR)/mathutil_cuda_kernel.so
TORCH_FFI_TARGET := $(BUILD_DIR)/mathutils/_mathutils.so

INCLUDE_FLAGS := $(foreach d, $(INCLUDE_DIRS), -I$d)

all: $(TORCH_FFI_TARGET)

$(TORCH_FFI_TARGET): $(MATHUTIL_KERNEL) $(TORCH_FFI_BUILD)
	$(PYTHON) $(TORCH_FFI_BUILD)

$(BUILD_DIR)/%.so: src/%.cu
	@ mkdir -p $(BUILD_DIR)
	# Separate shared library that is later linked into the extern "C" FFI extension.
	$(NVCC_COMPILE) $@ $? $(NVCC_FLAGS) $(INCLUDE_FLAGS)

clean:
	$(RM_RF) $(BUILD_DIR) $(MATHUTIL_KERNEL)
--------------------------------------------------------------------------------
/src/mathutil_cuda_kernel.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#include "mathutil_cuda_kernel.h"

// Choose a grid that covers n threads with BLOCK threads per block, spilling
// into a second grid dimension when a single dimension would exceed 65535.
dim3 cuda_gridsize(int n)
{
    int k = (n - 1) / BLOCK + 1;
    int x = k;
    int y = 1;
    if(x > 65535) {
        x = ceil(sqrt(k));
        y = (n - 1) / (x * BLOCK) + 1;
    }
    dim3 d(x, y, 1);
    return d;
}

// One thread per element of the x-by-y matrix a: thread i performs
// a[row][col] += b[row], where row = (i / y) % x and col = i % y.
__global__ void broadcast_sum_kernel(float *a, float *b, int x, int y, int size)
{
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if(i >= size) return;
    int j = i % y; i = i / y;
    int k = i % x;
    a[IDX2D(k, j, y)] += b[k];
}

void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream)
{
    int size = x * y;
    cudaError_t err;

    broadcast_sum_kernel<<<cuda_gridsize(size), BLOCK, 0, stream>>>(a, b, x, y, size);

    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PyTorch Custom CUDA Kernel Tutorial

This repository contains tutorial code for writing a custom CUDA function for
PyTorch. The code is based on the PyTorch [C extension
example](https://github.com/pytorch/extension-ffi).

**Disclaimer**

- 2019/01/02: I wrote **[another up-to-date tutorial](https://github.com/chrischoy/MakePytorchPlusPlus)** on how to make a PyTorch C++/CUDA extension with a Makefile. The associated GitHub page is **[https://github.com/chrischoy/MakePytorchPlusPlus](https://github.com/chrischoy/MakePytorchPlusPlus)**.
- 2018/12/09: The PyTorch CFFI extension mechanism is deprecated in favor of the [C++ extension](https://pytorch.org/tutorials/advanced/cpp_extension.html) API as of PyTorch v1.0.

`This tutorial was written when PyTorch did not support a broadcasting sum. Now that it does, you probably won't need your own broadcasting sum function, but you can still follow the tutorial to build a custom layer backed by a custom CUDA kernel.`

In this repository, we will build a simple CUDA-based broadcasting sum
function. At the time of writing, PyTorch did not support [broadcasting
sums](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html), so we
had to expand a tensor manually, e.g. with `expand_as`, which creates a new
tensor and costs additional memory and computation.

For example,

```python
a = torch.randn(3, 5)
b = torch.randn(3, 1)
# The following line will give an error
# a += b

# Expand b to have the same dimension as a
b_like_a = b.expand_as(a)
a += b_like_a
```

In this post, we will build a function that computes `a += b` without
explicitly expanding `b`.

```python
mathutils.broadcast_sum(a, b, *map(int, a.size()))
```

## Make a CUDA kernel

First, let's write a CUDA kernel that adds `b` to `a` without making a copy of the tensor `b`.
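The kernel below treats `a` as a row-major `x`-by-`y` matrix and `b` as a length-`x` vector. It indexes `a` with the `IDX2D` macro defined in `include/mathutil_cuda_kernel.h`, and the launch configuration in the next section uses the `BLOCK` constant from the same header:

```cuda
// From include/mathutil_cuda_kernel.h: linear offset of element (i, j) in a
// row-major matrix with dj columns.
#define IDX2D(i, j, dj) (dj * i + j)

// Number of threads per block used for every kernel launch in this tutorial.
#define BLOCK 512
```

With those definitions in hand, the kernel is: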
```cuda
__global__ void broadcast_sum_kernel(float *a, float *b, int x, int y, int size)
{
    int i = (blockIdx.x + blockIdx.y * gridDim.x) * blockDim.x + threadIdx.x;
    if(i >= size) return;
    int j = i % y; i = i / y;   // j: column index
    int k = i % x;              // k: row index
    a[IDX2D(k, j, y)] += b[k];  // add b[row] to a[row][col]
}
```

## Make a C wrapper

Once you have the CUDA kernel, you have to wrap it in plain C code. We are not touching the PyTorch backend yet; note that the inputs are already device pointers.


```c++
void broadcast_sum_cuda(float *a, float *b, int x, int y, cudaStream_t stream)
{
    int size = x * y;
    cudaError_t err;

    broadcast_sum_kernel<<<cuda_gridsize(size), BLOCK, 0, stream>>>(a, b, x, y, size);

    err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
        exit(-1);
    }
}
```

## Connect the PyTorch backend with the C wrapper

Next, we connect the PyTorch backend to our C wrapper. You can get the underlying device pointer of a tensor with `THCudaTensor_data`; the resulting pointers `a` and `b` live on the GPU.


```c++
extern THCState *state;

int broadcast_sum(THCudaTensor *a_tensor, THCudaTensor *b_tensor, int x, int y)
{
    float *a = THCudaTensor_data(state, a_tensor);
    float *b = THCudaTensor_data(state, b_tensor);
    cudaStream_t stream = THCState_getCurrentStream(state);

    broadcast_sum_cuda(a, b, x, y, stream);

    return 1;
}
```

## Make a Python wrapper

Now that we have the CUDA kernel and the C wrapper around it, we need to expose the function to Python so that we can call it from Python code.

We will first build a shared library using `nvcc`.

```shell
nvcc ... -o build/mathutil_cuda_kernel.so src/mathutil_cuda_kernel.cu
```

Then, we use PyTorch's `torch.utils.ffi.create_extension` function, which automatically adds the appropriate headers and builds a Python-loadable shared library.

```python
from torch.utils.ffi import create_extension

...

ffi = create_extension(
    'mathutils',
    headers=[...],
    sources=[...],
    ...
)

ffi.build()
```


## Test!

Finally, we can build and test the function. This README leaves out a lot of details, but the repository contains a working example.

```shell
git clone https://github.com/chrischoy/pytorch-cffi-tutorial
cd pytorch-cffi-tutorial
make
```

## Note

The function only accepts `THCudaTensor`, which corresponds to `torch.FloatTensor().cuda()` in Python.
--------------------------------------------------------------------------------
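As a closing reference, the unit test in `__init__.py` (shown earlier in this listing) exercises the extension end to end. The core of that usage, assuming `make` succeeded and the generated `mathutils` package is on the Python path, is simply:

```python
import torch
import mathutils  # FFI package generated by build_ffi.py

# Both arguments must be float CUDA tensors (THCudaTensor on the C side).
a = torch.rand(3, 5).cuda()
b = torch.rand(3, 1).cuda()

# In-place broadcasting sum: a[i, j] += b[i] for every column j.
mathutils.broadcast_sum(a, b, *map(int, a.size()))
```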