├── .gitignore ├── README.md ├── add_matrix.cpp ├── add_matrix_kernel.cu ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pytorch_cppcuda_practice
Practice to write cpp/cuda extension for pytorch

This simple example writes a custom matrix addition in cuda, and provides python binding with forward/backward operations.

Sources: [1](https://qiita.com/windfall/items/5031d70c649b06a1534f) [2](https://pytorch.org/tutorials/advanced/cpp_extension.html)

# Installation

1. Install pytorch.
2. From this repo, run `pip install .` (on pip older than 21.3 add `--use-feature=in-tree-build`; newer pip builds in-tree by default and has removed that flag)

# Test

Run `python test.py`.
16 | 17 | # Misc 18 | 19 | For vscode to resolve ``, add the following lines to the include path (change python path and version): 20 | ``` 21 | "/include/python3.8", 22 | "/lib/python3.8/site-packages/torch/include", 23 | "/lib/python3.8/site-packages/torch/include/torch/csrc/api/include" 24 | ``` 25 | -------------------------------------------------------------------------------- /add_matrix.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") 4 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 5 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 6 | 7 | 8 | torch::Tensor add_matrix_cu_forward( 9 | torch::Tensor A, 10 | torch::Tensor B); 11 | 12 | 13 | torch::Tensor add_matrix_forward( 14 | torch::Tensor A, 15 | torch::Tensor B 16 | ){ 17 | CHECK_INPUT(A); 18 | CHECK_INPUT(B); 19 | return add_matrix_cu_forward(A, B); 20 | } 21 | 22 | std::vector add_matrix_backward( 23 | torch::Tensor grad_out 24 | ){ 25 | return {grad_out, grad_out}; 26 | } 27 | 28 | 29 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){ 30 | m.def("forward", &add_matrix_forward); 31 | m.def("backward", &add_matrix_backward); 32 | } -------------------------------------------------------------------------------- /add_matrix_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | template 7 | __device__ __forceinline__ scalar_t identity(scalar_t z) { 8 | return z; 9 | } 10 | 11 | 12 | template 13 | __global__ void add_matrix_kernel_forward( 14 | const torch::PackedTensorAccessor A, 15 | const torch::PackedTensorAccessor B, 16 | torch::PackedTensorAccessor out 17 | ){ 18 | const int n = blockIdx.y; 19 | const int c = blockIdx.x * blockDim.x + threadIdx.x; 20 | 21 | if (c < A.size(1)){ 22 | out[n][c] = identity(A[n][c] + B[n][c]); 23 | } 24 | } 25 
| 26 | 27 | torch::Tensor add_matrix_cu_forward( 28 | torch::Tensor A, 29 | torch::Tensor B 30 | ){ 31 | torch::Tensor out = torch::zeros_like(A); 32 | 33 | const int n_row = A.size(0); 34 | const int n_col = A.size(1); 35 | const int threads = 1024; 36 | const dim3 blocks((n_col + threads - 1) / threads, n_row); // to cover all elements 37 | 38 | // instantiate kernel 39 | AT_DISPATCH_FLOATING_TYPES(A.type(), "add_matrix_cu_forward", 40 | ([&] { 41 | add_matrix_kernel_forward<<>>( 42 | A.packed_accessor(), 43 | B.packed_accessor(), 44 | out.packed_accessor() 45 | ); 46 | }) 47 | ); 48 | return out; 49 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CUDAExtension, BuildExtension 3 | 4 | setup( 5 | name='add_matrix_cuda', 6 | version='1.0', 7 | author='kwea123', 8 | author_email='kwea123@gmail.com', 9 | description='cppcuda example', 10 | long_description='cppcuda example', 11 | ext_modules=[ 12 | CUDAExtension( 13 | name='add_matrix_cuda', 14 | sources=['add_matrix.cpp', 'add_matrix_kernel.cu']) 15 | ], 16 | cmdclass={ 17 | 'build_ext': BuildExtension 18 | } 19 | ) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import add_matrix_cuda 3 | 4 | 5 | class AddMatrixFunction(torch.autograd.Function): 6 | @staticmethod 7 | def forward(ctx, A, B): 8 | output = add_matrix_cuda.forward(A, B) 9 | return output 10 | 11 | @staticmethod 12 | def backward(ctx, grad_out): 13 | dA, dB = add_matrix_cuda.backward(grad_out) 14 | return dA, dB 15 | 16 | 17 | if __name__=='__main__': 18 | f = AddMatrixFunction() 19 | A = torch.rand(100, 100, device='cuda:0').requires_grad_() 20 | B = torch.rand(100, 100, device='cuda:0') 21 | 22 | out = 
f.apply(A, B) 23 | print('model out:', out) 24 | loss = out.sum() 25 | loss.backward() 26 | print("A.grad:", A.grad) --------------------------------------------------------------------------------