├── .gitignore ├── LICENSE ├── README.md ├── config.mk ├── tvm-paper-notes.md └── tvm-tutorials ├── compute-and-reduce-with-tuple-inputs.ipynb ├── external-tensor-functions.ipynb ├── getting-started.ipynb ├── reduction.ipynb └── schedule_primitives.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Anderson Banihirwe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](http://tvmlang.org/images/main/stack_tvmlang.png) (Image Source: http://tvmlang.org/) 2 | 3 | # TVM in Action 4 | 5 | [TVM: End-to-End Optimization Stack for Deep Learning](https://github.com/dmlc/tvm) 6 | 7 | This repo hosts my notes and tutorial materials (source code) for the TVM stack as I explore the incredible explosion of deep-learning frameworks and how to bring them together. 8 | 9 | # [Summary of TVM: End-to-End Optimization Stack for Deep Learning](https://arxiv.org/abs/1802.04799) 10 | 11 | ## Abstract 12 | 13 | - Scalable frameworks, such as TensorFlow, MXNet, Caffe, and PyTorch, are optimized for a narrow range of server-class GPUs. 14 | - Deploying workloads to other platforms such as mobile phones, IoT devices, and specialized accelerators (FPGAs, ASICs) requires laborious manual effort. 15 | - TVM is an end-to-end optimization stack that exposes: 16 | - graph-level 17 | - operator-level optimizations 18 | ---> to provide performance portability to deep learning workloads across diverse hardware back-ends. 19 | 20 | ## Introduction 21 | 22 | - The number and diversity of specialized deep learning (DL) accelerators pose an adoption challenge. 23 | - They introduce new hardware abstractions that modern compilers and frameworks are ill-equipped to deal with. 24 | 25 | - Providing support in various DL frameworks for diverse hardware back-ends in the present ad-hoc fashion is **unsustainable**. 26 | 27 | - Hardware targets diverge significantly in terms of memory organization, compute, etc. 28 | 29 | ![](https://i.imgur.com/XRSZMt0.png) 30 | 31 | - *The Goal*: **easily deploy DL workloads to all kinds of hardware targets, including embedded devices, GPUs, FPGAs, and ASICs (e.g., the TPU).** 32 | 33 | - Current DL frameworks rely on a **computational graph intermediate representation** to implement optimizations such as: 34 | - auto differentiation 35 | - dynamic memory management 36 | 37 | - **Graph-level optimizations** are often too high-level to handle hardware back-end-specific **operator transformations**. 38 | - **Current operator-level libraries** that DL frameworks rely on are: 39 | - too rigid 40 | - too specialized 41 | 42 | ---> to be easily ported **across hardware devices** 43 | 44 | - To address these weaknesses, we need a **compiler framework** that can expose optimization opportunities across both 45 | - graph-level and 46 | - operator-level 47 | 48 | ---> to deliver competitive performance across hardware back-ends. 49 | 50 | ### Four fundamental challenges at the computation graph level and tensor operator level 51 | 52 | 1. **High-level dataflow rewriting:** 53 | - Different hardware devices may have vastly different memory hierarchies. 54 | 55 | - Enabling strategies to fuse operators and optimize data layouts is crucial for optimizing memory access. 56 | 57 | 2. **Memory reuse across threads:** 58 | - Modern GPUs and specialized accelerators have memory that can be shared across compute cores. 59 | - The traditional shared-nothing nested parallel model is no longer optimal. 60 | - Cooperation among threads on shared-memory loads is required for optimized kernels. 61 | 62 | 3. **Tensorized compute intrinsics:** 63 | - The latest hardware provides new instructions that go beyond vector operations, like the GEMM operator in the TPU or the tensor core in NVIDIA's Volta.
64 | - Consequently, the scheduling procedure must break computation into tensor arithmetic intrinsics instead of scalar or vector code. 65 | 66 | 4. **Latency Hiding** 67 | - Traditional architectures with simultaneous multithreading and automatically managed caches implicitly hide latency in modern CPUs/GPUs. 68 | - Specialized accelerator designs favor leaner control and offload most of the scheduling complexity to the compiler stack. 69 | - Still, scheduling must be performed carefully to hide memory access latency. 70 | 71 | 72 | ### TVM: An End-to-End Optimization Stack 73 | 74 | - An end-to-end optimizing compiler stack to lower and fine-tune DL workloads to diverse hardware back-ends. 75 | - Designed to separate: 76 | - the algorithm description 77 | - schedule 78 | - hardware interface 79 | - This separation enables **support for novel specialized accelerators** and **their corresponding new intrinsics**. 80 | - TVM presents **two optimization layers**: 81 | - a computation graph optimization layer to address: 82 | - High-level dataflow rewriting 83 | - a tensor optimization layer with new schedule primitives to address: 84 | - memory reuse across threads 85 | - tensorized compute intrinsics 86 | - latency hiding 87 | 88 | ## Optimizing Computational Graphs 89 | 90 | ### Computational Graph 91 | 92 | - Computational graphs are a common way to represent programs in DL frameworks. 93 | - They provide a global view of computation tasks, yet avoid specifying how each computation task needs to be implemented. 94 | 95 | 96 | 97 | ### Operator Fusion 98 | 99 | - An optimization that can greatly reduce execution time, particularly on GPUs and specialized accelerators. 100 | - The idea is to **combine multiple operators together into a single kernel without saving the intermediate results back into global memory**. 101 | 102 | ![](https://i.imgur.com/mlNhoDT.png) 103 | 104 | **Four categories of graph operators**: 105 | 106 | - Injective (one-to-one map) 107 | - Reduction 108 | - Complex-out-fusable (can fuse element-wise map to output) 109 | - Opaque (cannot be fused) 110 | 111 | ![](https://i.imgur.com/XnhSWVN.png) 112 | 113 | ### Data Layout Transformation 114 | 115 | - Tensor operations are the basic operators of computational graphs. 116 | - They can have divergent layout requirements across different operations. 117 | - Optimizing data layout starts with specifying the preferred data layout of each operator, given the constraints dictating their implementation in hardware. 118 | 119 | ![](https://i.imgur.com/0J5QxGs.png) 120 | 121 | ### Limitations of Graph-Level Optimizations 122 | 123 | - They are only as effective as what the operator library provides. 124 | - Currently, the few DL frameworks that support operator fusion require the operator library to provide an implementation of the fused patterns. 125 | - With more network operators introduced on a regular basis, this approach is no longer sustainable when targeting an increasing number of hardware back-ends. 126 | - It is not feasible to handcraft operator kernels for this massive space of back-end-specific operators. 127 | - TVM provides a code-generation approach that can generate tensor operators. 128 | 129 | ## Optimizing Tensor Operations 130 | 131 | ### Tensor Expression Language 132 | 133 | - TVM introduces a dataflow tensor expression language to support automatic code generation.
134 | - Unlike high-level computation graph languages, where the implementation of tensor operations is opaque, *each operation is described in an index formula expression language*. 135 | 136 | ![](https://i.imgur.com/LG1pguT.png) 137 | 138 | - The TVM tensor expression language supports common arithmetic and math operations found in common languages like C. 139 | - TVM explicitly introduces a **commutative reduction** operator to easily schedule commutative reductions across multiple threads. 140 | - TVM further introduces a **high-order scan operator** that can combine basic compute operators to form recurrent computations over time. 141 | 142 | ### Schedule Space 143 | 144 | - Given a tensor expression, it is challenging to create high-performance implementations for each hardware back-end. 145 | - Each optimized low-level program is the result of different combinations of scheduling strategies, imposing a large burden on the kernel writer. 146 | - TVM adopts the **principle of decoupling compute descriptions from schedule optimizations**. 147 | - Schedules are the specific rules that lower compute descriptions down to back-end-optimized implementations. 148 | 149 | ![](https://i.imgur.com/JUikGQz.png) 150 | 151 | ![](https://i.imgur.com/BCg6gCz.png) 152 | 153 | 154 | ### Nested Parallelism with Cooperation 155 | 156 | - Parallel programming is key to improving the efficiency of compute-intensive kernels in deep learning workloads. 157 | - Modern GPUs offer massive parallelism 158 | 159 | ---> requiring TVM to bake parallel programming models into schedule transformations. 160 | 161 | - Most existing solutions adopt a parallel programming model referred to as [nested parallel programs](https://youtu.be/4lS_WThsFoM), which is a form of [fork-join parallelism](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model). 162 | - TVM uses a parallel schedule primitive to parallelize a data-parallel task. 163 | - Each parallel task can be further recursively subdivided into subtasks to exploit the multi-level thread hierarchy on the target architecture (e.g., thread groups in a GPU). 164 | - This model is called **shared-nothing nested parallelism**: 165 | - One working thread cannot look at the data of its sibling within the same parallel computation stage. 166 | - Interactions between sibling threads happen at the join stage, when the subtasks are done and the next stage can consume the data produced by the previous stage. 167 | - This programming model **does not enable threads to cooperate with each other in order to perform a collective task within the same parallel stage**. 168 | 169 | - A better alternative to the shared-nothing approach is to **fetch data cooperatively across threads**. 170 | - This pattern is well known in GPU programming using languages like CUDA, OpenCL, and Metal. 171 | - **It has not previously been implemented as a schedule primitive.** 172 | - TVM introduces the **concept of memory scopes to the schedule space**, so that a stage can be marked as shared. 173 | - Without memory scopes, automatic scope inference will mark the relevant stage as thread-local. 174 | - Memory scopes are useful for GPUs. 175 | - Memory scopes allow us to tag special memory buffers and create special lowering rules when targeting specialized deep learning accelerators.
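To make the cooperative-fetching idea above concrete, here is a minimal sketch in the older TVM 0.x Python API that the notebooks in this repo use; the toy stencil operator, the tile factor of 64, and the buffer names are illustrative assumptions rather than the paper's exact kernel. The `cache_read` stage lives in the `"shared"` memory scope and its load loop is bound to `threadIdx.x`, so the threads of a block cooperate on filling the tile before the compute stage consumes it.

```python
import tvm

# Assumed toy operator: a 1-D stencil B[i] = A[i] + A[i + 1].
n = 1024
A = tvm.placeholder((n + 1,), name="A")
B = tvm.compute((n,), lambda i: A[i] + A[i + 1], name="B")

s = tvm.create_schedule(B.op)
# Cache stage for A, placed in the "shared" memory scope.
AA = s.cache_read(A, "shared", [B])

# Shared-nothing nested parallelism: blocks over the outer loop, threads inside.
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

# Cooperation: fill the shared tile once per block and spread the load
# across the block's 64 threads.
s[AA].compute_at(s[B], bx)
xo, xi = s[AA].split(s[AA].op.axis[0], nparts=64)
s[AA].bind(xo, tvm.thread_axis("threadIdx.x"))

print(tvm.lower(s, [A, B], simple_mode=True))
```

Without the shared-scope cache stage, scope inference keeps each thread's data thread-local, which is exactly the shared-nothing behavior described in the bullets above.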
176 | 177 | ![](https://i.imgur.com/HHYtujL.png) 178 | 179 | 180 | ### Tensorization: Generalizing the Hardware Interface 181 | 182 | - The **tensorization** problem is analogous to the **vectorization** problem for [SIMD architectures](https://en.wikipedia.org/wiki/SIMD). 183 | - Tensorization differs significantly from vectorization: 184 | - The inputs to the tensor compute primitives are multi-dimensional, with fixed or variable lengths, and dictate different data layouts. 185 | - We cannot resort to a fixed set of primitives, as new DL accelerators are emerging with their own flavors of tensor instructions. 186 | - To solve this challenge, TVM **separates the hardware interface from the schedule**: 187 | - TVM introduces a tensor intrinsic declaration mechanism. 188 | - TVM uses the tensor expression language to declare the behavior of each new hardware intrinsic, as well as the lowering rule associated with it. 189 | - TVM introduces a **tensorize** schedule primitive to replace a unit of computation with the corresponding tensor intrinsics. 190 | - The compiler matches the computation pattern with a hardware declaration and lowers it to the corresponding hardware intrinsic. 191 | 192 | 193 | ### Compiler Support for Latency Hiding 194 | 195 | - **Latency hiding** refers to the process of overlapping memory operations with computation to maximize memory and compute utilization. 196 | - It requires different strategies depending on the hardware back-end that is being targeted. 197 | - On CPUs, memory latency hiding is achieved **implicitly with simultaneous multithreading** or **hardware prefetching techniques**. 198 | - GPUs rely on **rapid context switching of many warps of threads** to maximize the utilization of functional units. 199 | - TVM provides a virtual threading schedule primitive that lets the programmer specify a high-level data-parallel program that TVM automatically lowers to a low-level explicit data dependence program. 200 | 201 | 202 | ## Code Generation and Runtime Support 203 | 204 | ### Code Generation 205 | 206 | - For a specific tuple of data-flow declaration, axis relation hyper-graph, and schedule tree, TVM can generate lowered code by: 207 | - iteratively traversing the schedule tree 208 | - inferring the dependent bounds of the input tensors (using the axis relation hyper-graph) 209 | - generating the loop nest in the low-level code 210 | - The code is lowered to an in-memory representation of an imperative C-style loop program. 211 | - TVM reuses a variant of Halide's loop program data structure in this process. 212 | - TVM reuses passes from Halide for common lowering primitives like storage flattening and unrolling, 213 | - and adds GPU/accelerator-specific transformations such as: 214 | - *synchronization point detection* 215 | - *virtual thread injection* 216 | - *module generation* 217 | - Finally, the loop program is transformed into **LLVM** or **CUDA/Metal/OpenCL** source code. 218 | 219 | ### Runtime Support 220 | 221 | - For GPU programs, TVM builds the host and device modules **separately** and provides a runtime module system that launches kernels using the corresponding driver APIs. 222 | 223 | ### Remote Deployment Profiling 224 | 225 | - TVM includes infrastructure to make profiling and autotuning easier on embedded devices.
226 | - Traditionally, targeting an embedded device for tuning requires: 227 | - cross-compiling on the host side, 228 | - copying to the target device, 229 | - and timing the execution. 230 | 231 | - TVM provides remote function call support. Through the **RPC interface**: 232 | - TVM compiles the program with a host compiler, 233 | - uploads it to the remote embedded device, 234 | - runs the function remotely, 235 | - and accesses the results in the same script on the host. 236 | 237 | ![](https://i.imgur.com/oL0Z9pp.png) 238 | 239 | 240 | ## Conclusion 241 | 242 | - TVM provides an end-to-end stack to solve fundamental optimization challenges across a diverse set of hardware back-ends. 243 | - TVM can encourage more studies of programming languages and compilation, and open new opportunities for hardware co-design techniques for deep learning systems. 244 | 245 | -------------------------------------------------------------------------------- /config.mk: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------- 2 | # Template configuration for compiling 3 | # 4 | # If you want to change the configuration, please use the following 5 | # steps. Assume you are on the root directory. First copy this 6 | # file so that any local changes will be ignored by git 7 | # 8 | # $ cp make/config.mk . 9 | # 10 | # Next modify the corresponding entries, and then compile by 11 | # 12 | # $ make 13 | # 14 | # or build in parallel with 8 threads 15 | # 16 | # $ make -j8 17 | #------------------------------------------------------------------------------- 18 | 19 | # whether compile with debug 20 | DEBUG = 0 21 | 22 | # the additional link flags you want to add 23 | ADD_LDFLAGS = 24 | 25 | # the additional compile flags you want to add 26 | ADD_CFLAGS = 27 | 28 | #--------------------------------------------- 29 | # Backend runtimes. 30 | #--------------------------------------------- 31 | # whether enable CUDA during compile 32 | USE_CUDA = 1 33 | 34 | # add the path to the CUDA library to the link and compile flags 35 | # if you have already added them to the environment variables. 36 | CUDA_PATH = /usr/local/cuda 37 | 38 | # ROCM 39 | USE_ROCM = 0 40 | 41 | # whether enable OpenCL during compile 42 | USE_OPENCL = 1 43 | 44 | # whether enable Metal during compile 45 | USE_METAL = 0 46 | 47 | # whether enable SGX during compile 48 | USE_SGX = 0 49 | SGX_SDK = /opt/sgxsdk 50 | 51 | # Whether enable RPC during compile 52 | USE_RPC = 1 53 | 54 | # Whether enable tiny embedded graph runtime. 55 | USE_GRAPH_RUNTIME = 1 56 | 57 | # Whether enable additional graph debug functions 58 | USE_GRAPH_RUNTIME_DEBUG = 0 59 | 60 | # whether build with LLVM support 61 | # Requires LLVM version >= 4.0 62 | # Set LLVM_CONFIG to your version, uncomment to build with llvm support 63 | # 64 | LLVM_CONFIG = llvm-config-6.0 65 | #--------------------------------------------- 66 | # Contrib optional libraries.
67 | #--------------------------------------------- 68 | # Whether use BLAS, choices: openblas, atlas, blas, apple 69 | USE_BLAS = openblas 70 | 71 | # Whether use contrib.random in runtime 72 | USE_RANDOM = 0 73 | 74 | # Whether use NNPack 75 | USE_NNPACK = 0 76 | # NNPACK_PATH = none 77 | 78 | # Whether use CuDNN 79 | USE_CUDNN = 1 80 | 81 | # Whether use MIOpen 82 | USE_MIOPEN = 0 83 | 84 | # Whether use MPS 85 | USE_MPS = 0 86 | 87 | # Whether use cuBLAS 88 | USE_CUBLAS = 1 89 | 90 | # Whether use rocBlas 91 | USE_ROCBLAS = 0 92 | -------------------------------------------------------------------------------- /tvm-paper-notes.md: -------------------------------------------------------------------------------- 1 | # [Summary of TVM: End-to-End Optimization Stack for Deep Learning](https://arxiv.org/abs/1802.04799) 2 | 3 | ## Abstract 4 | 5 | - Scalable frameworks, such as TensorFlow, MXNet, Caffe, and PyTorch, are optimized for a narrow range of server-class GPUs. 6 | - Deploying workloads to other platforms such as mobile phones, IoT devices, and specialized accelerators (FPGAs, ASICs) requires laborious manual effort. 7 | - TVM is an end-to-end optimization stack that exposes: 8 | - graph-level 9 | - operator-level optimizations 10 | ---> to provide performance portability to deep learning workloads across diverse hardware back-ends. 11 | 12 | ## Introduction 13 | 14 | - The number and diversity of specialized deep learning (DL) accelerators pose an adoption challenge. 15 | - They introduce new hardware abstractions that modern compilers and frameworks are ill-equipped to deal with. 16 | 17 | - Providing support in various DL frameworks for diverse hardware back-ends in the present ad-hoc fashion is **unsustainable**. 18 | 19 | - Hardware targets diverge significantly in terms of memory organization, compute, etc. 20 | 21 | ![](https://i.imgur.com/XRSZMt0.png) 22 | 23 | - *The Goal*: **easily deploy DL workloads to all kinds of hardware targets, including embedded devices, GPUs, FPGAs, and ASICs (e.g., the TPU).** 24 | 25 | - Current DL frameworks rely on a **computational graph intermediate representation** to implement optimizations such as: 26 | - auto differentiation 27 | - dynamic memory management 28 | 29 | - **Graph-level optimizations** are often too high-level to handle hardware back-end-specific **operator transformations**. 30 | - **Current operator-level libraries** that DL frameworks rely on are: 31 | - too rigid 32 | - too specialized 33 | 34 | ---> to be easily ported **across hardware devices** 35 | 36 | - To address these weaknesses, we need a **compiler framework** that can expose optimization opportunities across both 37 | - graph-level and 38 | - operator-level 39 | 40 | ---> to deliver competitive performance across hardware back-ends. 41 | 42 | ### Four fundamental challenges at the computation graph level and tensor operator level 43 | 44 | 1. **High-level dataflow rewriting:** 45 | - Different hardware devices may have vastly different memory hierarchies. 46 | 47 | - Enabling strategies to fuse operators and optimize data layouts is crucial for optimizing memory access. 48 | 49 | 2. **Memory reuse across threads:** 50 | - Modern GPUs and specialized accelerators have memory that can be shared across compute cores. 51 | - The traditional shared-nothing nested parallel model is no longer optimal. 52 | - Cooperation among threads on shared-memory loads is required for optimized kernels. 53 | 54 | 3. **Tensorized compute intrinsics:** 55 | - The latest hardware provides new instructions that go beyond vector operations, like the GEMM operator in the TPU or the tensor core in NVIDIA's Volta. 56 | - Consequently, the scheduling procedure must break computation into tensor arithmetic intrinsics instead of scalar or vector code. 57 | 58 | 4. **Latency Hiding** 59 | - Traditional architectures with simultaneous multithreading and automatically managed caches implicitly hide latency in modern CPUs/GPUs. 60 | - Specialized accelerator designs favor leaner control and offload most of the scheduling complexity to the compiler stack. 61 | - Still, scheduling must be performed carefully to hide memory access latency. 62 | 63 | 64 | ### TVM: An End-to-End Optimization Stack 65 | 66 | - An end-to-end optimizing compiler stack to lower and fine-tune DL workloads to diverse hardware back-ends. 67 | - Designed to separate: 68 | - the algorithm description 69 | - schedule 70 | - hardware interface 71 | - This separation enables **support for novel specialized accelerators** and **their corresponding new intrinsics**. 72 | - TVM presents **two optimization layers**: 73 | - a computation graph optimization layer to address: 74 | - High-level dataflow rewriting 75 | - a tensor optimization layer with new schedule primitives to address: 76 | - memory reuse across threads 77 | - tensorized compute intrinsics 78 | - latency hiding 79 | 80 | ## Optimizing Computational Graphs 81 | 82 | ### Computational Graph 83 | 84 | - Computational graphs are a common way to represent programs in DL frameworks. 85 | - They provide a global view of computation tasks, yet avoid specifying how each computation task needs to be implemented. 86 | 87 | 88 | 89 | ### Operator Fusion 90 | 91 | - An optimization that can greatly reduce execution time, particularly on GPUs and specialized accelerators. 92 | - The idea is to **combine multiple operators together into a single kernel without saving the intermediate results back into global memory**. 93 | 94 | ![](https://i.imgur.com/mlNhoDT.png) 95 | 96 | **Four categories of graph operators**: 97 | 98 | - Injective (one-to-one map) 99 | - Reduction 100 | - Complex-out-fusable (can fuse element-wise map to output) 101 | - Opaque (cannot be fused) 102 | 103 | ![](https://i.imgur.com/XnhSWVN.png) 104 | 105 | ### Data Layout Transformation 106 | 107 | - Tensor operations are the basic operators of computational graphs. 108 | - They can have divergent layout requirements across different operations. 109 | - Optimizing data layout starts with specifying the preferred data layout of each operator, given the constraints dictating their implementation in hardware. 110 | 111 | ![](https://i.imgur.com/0J5QxGs.png) 112 | 113 | ### Limitations of Graph-Level Optimizations 114 | 115 | - They are only as effective as what the operator library provides. 116 | - Currently, the few DL frameworks that support operator fusion require the operator library to provide an implementation of the fused patterns. 117 | - With more network operators introduced on a regular basis, this approach is no longer sustainable when targeting an increasing number of hardware back-ends. 118 | - It is not feasible to handcraft operator kernels for this massive space of back-end-specific operators. 119 | - TVM provides a code-generation approach that can generate tensor operators.
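As a small, hedged illustration of the code-generation idea in the last bullet (not the paper's own example), the old TVM 0.x Python API used by the notebooks in this repo lets one declare a tensor operator once and then derive both the loop nest and the back-end code from that single description; the toy operator and the `llvm` target below are assumptions made for the sketch.

```python
import tvm

# Declare the operator once, at a high level.
n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")

# A default schedule; other back-ends would apply different schedules
# to this same description.
s = tvm.create_schedule(B.op)

# Inspect the loop nest TVM generates for the description ...
print(tvm.lower(s, [A, B], simple_mode=True))

# ... and compile it to target code (LLVM for CPU in this sketch).
f = tvm.build(s, [A, B], target="llvm", name="scale_by_two")
```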
120 | 121 | ## Optimizing Tensor Operations 122 | 123 | ### Tensor Expression Language 124 | 125 | - TVM introduces a dataflow tensor expression language to support automatic code generation. 126 | - Unlike high-level computation graph languages, where the implementation of tensor operations is opaque, *each operation is described in an index formula expression language*. 127 | 128 | ![](https://i.imgur.com/LG1pguT.png) 129 | 130 | - The TVM tensor expression language supports common arithmetic and math operations found in common languages like C. 131 | - TVM explicitly introduces a **commutative reduction** operator to easily schedule commutative reductions across multiple threads. 132 | - TVM further introduces a **high-order scan operator** that can combine basic compute operators to form recurrent computations over time. 133 | 134 | ### Schedule Space 135 | 136 | - Given a tensor expression, it is challenging to create high-performance implementations for each hardware back-end. 137 | - Each optimized low-level program is the result of different combinations of scheduling strategies, imposing a large burden on the kernel writer. 138 | - TVM adopts the **principle of decoupling compute descriptions from schedule optimizations**. 139 | - Schedules are the specific rules that lower compute descriptions down to back-end-optimized implementations. 140 | 141 | ![](https://i.imgur.com/JUikGQz.png) 142 | 143 | ![](https://i.imgur.com/BCg6gCz.png) 144 | 145 | 146 | ### Nested Parallelism with Cooperation 147 | 148 | - Parallel programming is key to improving the efficiency of compute-intensive kernels in deep learning workloads. 149 | - Modern GPUs offer massive parallelism 150 | 151 | ---> requiring TVM to bake parallel programming models into schedule transformations. 152 | 153 | - Most existing solutions adopt a parallel programming model referred to as [nested parallel programs](https://youtu.be/4lS_WThsFoM), which is a form of [fork-join parallelism](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model). 154 | - TVM uses a parallel schedule primitive to parallelize a data-parallel task. 155 | - Each parallel task can be further recursively subdivided into subtasks to exploit the multi-level thread hierarchy on the target architecture (e.g., thread groups in a GPU). 156 | - This model is called **shared-nothing nested parallelism**: 157 | - One working thread cannot look at the data of its sibling within the same parallel computation stage. 158 | - Interactions between sibling threads happen at the join stage, when the subtasks are done and the next stage can consume the data produced by the previous stage. 159 | - This programming model **does not enable threads to cooperate with each other in order to perform a collective task within the same parallel stage**. 160 | 161 | - A better alternative to the shared-nothing approach is to **fetch data cooperatively across threads**. 162 | - This pattern is well known in GPU programming using languages like CUDA, OpenCL, and Metal. 163 | - **It has not previously been implemented as a schedule primitive.** 164 | - TVM introduces the **concept of memory scopes to the schedule space**, so that a stage can be marked as shared. 165 | - Without memory scopes, automatic scope inference will mark the relevant stage as thread-local. 166 | - Memory scopes are useful for GPUs. 167 | - Memory scopes allow us to tag special memory buffers and create special lowering rules when targeting specialized deep learning accelerators.
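A minimal sketch of the shared-nothing pattern described in this section, again using the old TVM 0.x API from the notebooks in this repo (the vector-add operator and the factor of 64 are illustrative assumptions): the data-parallel axis is subdivided and bound to the two-level GPU thread hierarchy, and because no stage is placed in the `"shared"` scope, sibling threads never exchange data within the stage.

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.placeholder((n,), name="B")
C = tvm.compute((n,), lambda i: A[i] + B[i], name="C")

s = tvm.create_schedule(C.op)

# Recursively subdivide the data-parallel axis and map it onto the
# GPU thread hierarchy: thread blocks outside, threads inside.
bx, tx = s[C].split(C.op.axis[0], factor=64)
s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
s[C].bind(tx, tvm.thread_axis("threadIdx.x"))

# Each thread works only on its own slice of C (shared-nothing);
# cooperation would additionally need a cache stage in the "shared" scope.
print(tvm.lower(s, [A, B, C], simple_mode=True))
```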
168 | 169 | ![](https://i.imgur.com/HHYtujL.png) 170 | 171 | 172 | ### Tensorization: Generalizing the Hardware Interface 173 | 174 | - The **tensorization** problem is analogous to the **vectorization** problem for [SIMD architectures](https://en.wikipedia.org/wiki/SIMD). 175 | - Tensorization differs significantly from vectorization: 176 | - The inputs to the tensor compute primitives are multi-dimensional, with fixed or variable lengths, and dictate different data layouts. 177 | - We cannot resort to a fixed set of primitives, as new DL accelerators are emerging with their own flavors of tensor instructions. 178 | - To solve this challenge, TVM **separates the hardware interface from the schedule**: 179 | - TVM introduces a tensor intrinsic declaration mechanism. 180 | - TVM uses the tensor expression language to declare the behavior of each new hardware intrinsic, as well as the lowering rule associated with it. 181 | - TVM introduces a **tensorize** schedule primitive to replace a unit of computation with the corresponding tensor intrinsics. 182 | - The compiler matches the computation pattern with a hardware declaration and lowers it to the corresponding hardware intrinsic. 183 | 184 | 185 | ### Compiler Support for Latency Hiding 186 | 187 | - **Latency hiding** refers to the process of overlapping memory operations with computation to maximize memory and compute utilization. 188 | - It requires different strategies depending on the hardware back-end that is being targeted. 189 | - On CPUs, memory latency hiding is achieved **implicitly with simultaneous multithreading** or **hardware prefetching techniques**. 190 | - GPUs rely on **rapid context switching of many warps of threads** to maximize the utilization of functional units. 191 | - TVM provides a virtual threading schedule primitive that lets the programmer specify a high-level data-parallel program that TVM automatically lowers to a low-level explicit data dependence program. 192 | 193 | 194 | ## Code Generation and Runtime Support 195 | 196 | ### Code Generation 197 | 198 | - For a specific tuple of data-flow declaration, axis relation hyper-graph, and schedule tree, TVM can generate lowered code by: 199 | - iteratively traversing the schedule tree 200 | - inferring the dependent bounds of the input tensors (using the axis relation hyper-graph) 201 | - generating the loop nest in the low-level code 202 | - The code is lowered to an in-memory representation of an imperative C-style loop program. 203 | - TVM reuses a variant of Halide's loop program data structure in this process. 204 | - TVM reuses passes from Halide for common lowering primitives like storage flattening and unrolling, 205 | - and adds GPU/accelerator-specific transformations such as: 206 | - *synchronization point detection* 207 | - *virtual thread injection* 208 | - *module generation* 209 | - Finally, the loop program is transformed into **LLVM** or **CUDA/Metal/OpenCL** source code. 210 | 211 | ### Runtime Support 212 | 213 | - For GPU programs, TVM builds the host and device modules **separately** and provides a runtime module system that launches kernels using the corresponding driver APIs. 214 | 215 | ### Remote Deployment Profiling 216 | 217 | - TVM includes infrastructure to make profiling and autotuning easier on embedded devices.
218 | - Traditionally, targeting an embedded device for tuning requires: 219 | - cross-compiling on the host side, 220 | - copying to the target device, 221 | - and timing the execution. 222 | 223 | - TVM provides remote function call support. Through the **RPC interface**: 224 | - TVM compiles the program with a host compiler, 225 | - uploads it to the remote embedded device, 226 | - runs the function remotely, 227 | - and accesses the results in the same script on the host. 228 | 229 | ![](https://i.imgur.com/oL0Z9pp.png) 230 | 231 | 232 | ## Conclusion 233 | 234 | - TVM provides an end-to-end stack to solve fundamental optimization challenges across a diverse set of hardware back-ends. 235 | - TVM can encourage more studies of programming languages and compilation, and open new opportunities for hardware co-design techniques for deep learning systems. 236 | 237 | -------------------------------------------------------------------------------- /tvm-tutorials/compute-and-reduce-with-tuple-inputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Describe Batchwise Computation" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "- For operators that have the same shape, we can put them together as the inputs of `tvm.compute` if we want them to be scheduled together in the subsequent schedule procedure." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "n = tvm.var(name=\"n\")\n", 34 | "m = tvm.var(name=\"m\")\n", 35 | "A0 = tvm.placeholder(shape=(m, n), name=\"A0\")\n", 36 | "A1 = tvm.placeholder(shape=(m, n), name=\"A1\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "B0, B1 = tvm.compute(shape=(m, n), fcompute=lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), \n", 46 | " name=\"B\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "produce B {\n", 59 | " for (i, 0, m) {\n", 60 | " for (j, 0, n) {\n", 61 | " B.v0[((i*n) + j)] = (A0[((i*n) + j)] + 2.000000f)\n", 62 | " B.v1[((i*n) + j)] = (A1[((i*n) + j)]*3.000000f)\n", 63 | " }\n", 64 | " }\n", 65 | "}\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "# The generated IR code would be:\n", 72 | "s = tvm.create_schedule(B0.op)\n", 73 | "print(tvm.lower(sch=s, args=[A0, A1, B0, B1], simple_mode=True))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Describe Reduction with Collaborative Inputs" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "- Sometimes, we require multiple inputs to express some reduction operators, and the inputs collaborate together, e.g. `argmax`\n", 88 | "- In the reduction procedure, `argmax` needs to compare the values of the operands and also needs to keep the index of the operand. 
This can be expressed with `comm_reducer` " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# x and y are the operands of reduction, both of them are a tuple of index and value\n", 98 | "def fcombine(x, y):\n", 99 | " lhs = tvm.select(cond=(x[1] >= y[1]), t=x[0], f=y[0])\n", 100 | " rhs = tvm.select(cond=(x[1] >= y[1]), t=x[1], f=y[1])\n", 101 | " return lhs, rhs" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# our identity element also need to be a tuple, so `fidentity` accepts \n", 111 | "# two types as inputs\n", 112 | "def fidentity(t0, t1):\n", 113 | " return tvm.const(value=-1, dtype=t0), tvm.min_value(dtype=t1)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "argmax = tvm.comm_reducer(fcombine, fidentity, name=\"argmax\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# describe the reduction computation\n", 132 | "m = tvm.var(\"m\")\n", 133 | "n = tvm.var(\"n\")\n", 134 | "idx = tvm.placeholder(shape=(m, n), name=\"idx\", dtype=\"int32\")\n", 135 | "val = tvm.placeholder(shape=(m, n), name=\"val\", dtype=\"int32\")\n", 136 | "k = tvm.reduce_axis(dom=(0, n), name=\"k\")\n", 137 | "T0, T1 = tvm.compute(shape=(m, ), fcompute=lambda i: argmax((idx[i, k], val[i, k]), axis=k),\n", 138 | " name=\"T\")\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 11, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "produce T {\n", 151 | " for (i, 0, m) {\n", 152 | " T.v0[i] = -1\n", 153 | " T.v1[i] = -2147483648\n", 154 | " for (k, 0, n) {\n", 155 | " T.v0[i] = tvm_if_then_else((T.v1[i] < val[((i*n) + k)]), idx[((i*n) + k)], T.v0[i])\n", 156 | " T.v1[i] = tvm_if_then_else((T.v1[i] < val[((i*n) + k)]), val[((i*n) + k)], T.v1[i])\n", 157 | " }\n", 158 | " }\n", 159 | "}\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# The generated IR code would be:\n", 166 | "s = tvm.create_schedule(T0.op)\n", 167 | "print(tvm.lower(sch=s, args=[idx, val, T0, T1], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Schedule Operation with Tuple Inputs\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "- Although you will get multiple outputs with one batch operation, but they can only be scheduled together in terms of operation." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 13, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "n = tvm.var(\"n\")\n", 191 | "m = tvm.var(\"m\")\n", 192 | "A0 = tvm.placeholder((m, n), name='A0')\n", 193 | "B0, B1 = tvm.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')\n", 194 | "A1 = tvm.placeholder((m, n), name='A1')\n", 195 | "C = tvm.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 14, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "s = tvm.create_schedule(C.op)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "s[B0].compute_at(s[C], C.op.axis[0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "// attr [B.v0] storage_scope = \"global\"\n", 226 | "allocate B.v0[float32 * 1 * n]\n", 227 | "// attr [B.v1] storage_scope = \"global\"\n", 228 | "allocate B.v1[float32 * 1 * n]\n", 229 | "produce C {\n", 230 | " for (i, 0, m) {\n", 231 | " produce B {\n", 232 | " for (j, 0, n) {\n", 233 | " B.v0[j] = (A0[((i*n) + j)] + 2.000000f)\n", 234 | " B.v1[j] = (A0[((i*n) + j)]*3.000000f)\n", 235 | " }\n", 236 | " }\n", 237 | " for (j, 0, n) {\n", 238 | " C[((i*n) + j)] = (A1[((i*n) + j)] + B.v0[j])\n", 239 | " }\n", 240 | " }\n", 241 | "}\n", 242 | "\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "print(tvm.lower(sch=s, args=[A0, A1, C], simple_mode=True))" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 3", 254 | "language": "python", 255 | "name": "python3" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 3 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython3", 267 | "version": "3.6.5" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 2 272 | } 273 | -------------------------------------------------------------------------------- /tvm-tutorials/external-tensor-functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## External Tensor Functions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- TVM supports transparent code generation.\n", 15 | "- TVM supports black box function calls natively as well. 
\n", 16 | "- Specifically, TVM supports all tensor functions that are [DLPack](https://github.com/dmlc/dlpack) compatible.\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import tvm \n", 26 | "import numpy as np \n", 27 | "from tvm.contrib import cblas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Use Extern Tensor Function\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "n = 1024\n", 44 | "l = 128\n", 45 | "m = 235" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "bias = tvm.var(name='bias', dtype=tvm.float32)\n", 55 | "A = tvm.placeholder(shape=(n, l), name='A')\n", 56 | "B = tvm.placeholder(shape=(l, m), name='B')\n", 57 | "# Compute several tensor via extern function.\n", 58 | "C = tvm.extern(shape=(n, m), inputs=[A, B], \n", 59 | " fcompute=lambda ins, outs: tvm.call_packed(\"tvm.contrib.cblas.matmul\",\n", 60 | " ins[0], ins[1], outs[0], False, False),\n", 61 | " name=\"C\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "D = tvm.compute(shape=C.shape, fcompute=lambda i, j: C[i, j] + bias, name=\"D\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "s = tvm.create_schedule(D.op)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Verify the Result" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "ctx = tvm.cpu(dev_id=0)\n", 96 | "f = tvm.build(sch=s, args=[A, B, D, bias], target=\"llvm\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)\n", 106 | "b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)\n", 107 | "d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)\n", 108 | "bb = 10.0" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "f(a, b, d, bb)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "np.testing.assert_allclose(\n", 127 | " d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Extern Contrib Wrappers" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "- TVM also provides extern contrib wrappers to useful extern calls, the following line is equivalent to the previous example." 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "from tvm.contrib import cblas" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 11, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "C = cblas.matmul(lhs=A, rhs=B)\n", 160 | "D = tvm.compute(shape=C.shape, fcompute=lambda i, j: C[i, j] + bias, name=\"D\")\n", 161 | "s = tvm.create_schedule(D.op)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Hook Python Function as Extern" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "- Since we can call into any PackedFunc in TVM. We can use the extern function to callback into python.\n", 176 | "\n", 177 | "- The following example registers a python function into tvm runtime system and use it to complete one stage of the computation. \n", 178 | "- This makes TVM much more flexible. For example, we can insert front-end callbacks to inspect the intermediate results or mix customized code with TVM." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "@tvm.register_func(\"tvm.contrib.my_tvm_addone\")\n", 188 | "def my_tvm_addone(x, y):\n", 189 | " print(\"my_tvm_addone signatures: %s, %s\" % (type(x), type(y)))\n", 190 | " tvm.nd.array(x.asnumpy() + 1).copyto(y)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 13, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "A = tvm.placeholder((n,), name=\"A\")\n", 200 | "B = tvm.extern(A.shape, inputs=[A], \n", 201 | " fcompute=lambda ins, outs: tvm.call_packed(\"tvm.contrib.my_tvm_addone\", \n", 202 | " ins[0], outs[0]), \n", 203 | " name=\"C\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 14, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "my_tvm_addone signatures: , \n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "s = tvm.create_schedule(B.op)\n", 221 | "f = tvm.build(s, [A, B], \"llvm\")\n", 222 | "a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)\n", 223 | "b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)\n", 224 | "f(a, b)\n", 225 | "np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.6.5" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 2 257 | } 258 | -------------------------------------------------------------------------------- /tvm-tutorials/getting-started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm\n", 10 | 
"import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- Global declarations of environment" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "tgt_host=\"llvm\"\n", 27 | "# Change it to respective GPU if gpu is enabled Ex: cuda, opencl\n", 28 | "tgt=\"cuda\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Describe the Computation" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "n = tvm.var(\"n\")\n", 45 | "A = tvm.placeholder((n,), name='A')\n", 46 | "B = tvm.placeholder((n,), name='B')\n", 47 | "C = tvm.compute(A.shape, lambda i: A[i] + B[i], name=\"C\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "print(type(C))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Schedule the Computation" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "- A schedule is a set of transformation of computation that transforms the loop of computations in the program\n", 79 | "\n", 80 | "- After we construct the schedule, by default the schedule computes C in a serial manner in a row-major order." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "s = tvm.create_schedule(C.op)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "tvm.schedule.Schedule" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "type(s)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "\u001b[0;31mType:\u001b[0m Schedule\n", 121 | "\u001b[0;31mString form:\u001b[0m schedule(0x1d46a90)\n", 122 | "\u001b[0;31mFile:\u001b[0m ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/schedule.py\n", 123 | "\u001b[0;31mDocstring:\u001b[0m Schedule for all the stages.\n" 124 | ] 125 | }, 126 | "metadata": {}, 127 | "output_type": "display_data" 128 | } 129 | ], 130 | "source": [ 131 | "s?" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "- Use the split construct to split the first axis of C\n", 139 | " - this will split the original iteration axis into product of two iterations." 
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "bx, tx = s[C].split(C.op.axis[0], factor=64)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "iter_var(i.outer, )" 160 | ] 161 | }, 162 | "execution_count": 9, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "bx" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "iter_var(i.inner, )" 180 | ] 181 | }, 182 | "execution_count": 10, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "tx" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "- Finally bind the iteration axis `bx` and `tx` to threads in the GPU compute grid. \n", 196 | "- These are GPU specific constructs that allows us to generate code that runs on GPU. " 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "if tgt == \"cuda\":\n", 206 | " s[C].bind(bx, tvm.thread_axis(\"blockIdx.x\"))\n", 207 | " s[C].bind(tx, tvm.thread_axis(\"threadIdx.x\"))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Compilation" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "- After finishing to specify the schedule, we can compile it into a TVM function. \n", 222 | "- By default TVM compiles into a type-erased function that can be directly called from python side" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 12, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "fadd = tvm.build(sch=s, args=[A, B, C], target=tgt, target_host=tgt_host, name=\"myadd\")" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "Module(llvm, 21b49b0)" 243 | ] 244 | }, 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "fadd" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 14, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "tvm.module.Module" 263 | ] 264 | }, 265 | "execution_count": 14, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "type(fadd)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 15, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "\u001b[0;31mSignature:\u001b[0m \u001b[0mfadd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 283 | "\u001b[0;31mType:\u001b[0m Module\n", 284 | "\u001b[0;31mString form:\u001b[0m Module(llvm, 21b49b0)\n", 285 | "\u001b[0;31mFile:\u001b[0m ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/module.py\n", 286 | "\u001b[0;31mDocstring:\u001b[0m Module container of all TVM generated functions\n" 287 | ] 288 | }, 289 | "metadata": 
{}, 290 | "output_type": "display_data" 291 | } 292 | ], 293 | "source": [ 294 | "fadd?" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Run the function" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "- Create a gpu context\n", 309 | "- Use `tvm.nd.array` to copy data to gpu\n", 310 | "- fadd runs the actual computation\n", 311 | "- Use `asnumpy()` to copy the gpu array to cpu so that we can use this to verify correctness. " 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 16, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "ctx = tvm.context(dev_type=tgt, dev_id=0)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 17, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "n = 1024\n", 330 | "a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx=ctx)\n", 331 | "b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx=ctx)\n", 332 | "c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx=ctx)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 18, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "CPU times: user 17 ms, sys: 274 µs, total: 17.2 ms\n", 345 | "Wall time: 17.1 ms\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "%%time\n", 351 | "fadd(a, b, c)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 19, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 20, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "array([1.0944474 , 1.3696101 , 1.4258708 , ..., 1.0076509 , 1.4810429 ,\n", 372 | " 0.42075178], dtype=float32)" 373 | ] 374 | }, 375 | "execution_count": 20, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "c.asnumpy()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 21, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "(array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,\n", 393 | " 0.11985361], dtype=float32),\n", 394 | " array([0.43806314, 0.8535795 , 0.45817864, ..., 0.7924853 , 0.9049368 ,\n", 395 | " 0.30089816], dtype=float32))" 396 | ] 397 | }, 398 | "execution_count": 21, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "a.asnumpy(), b.asnumpy()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 22, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "\n", 416 | "array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,\n", 417 | " 0.11985361], dtype=float32)" 418 | ] 419 | }, 420 | "execution_count": 22, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "a" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "## Inspect the Generated Code " 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 23, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | 
"-----------------------GPU code--------------------------\n", 446 | "extern \"C\" __global__ void myadd__kernel0( float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n) {\n", 447 | " if (((int)blockIdx.x) < (n / 64)) {\n", 448 | " C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);\n", 449 | " } else {\n", 450 | " if ((((int)blockIdx.x) * 64) < (n - ((int)threadIdx.x))) {\n", 451 | " C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);\n", 452 | " }\n", 453 | " }\n", 454 | "}\n", 455 | "\n", 456 | "\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "if tgt == \"cuda\":\n", 462 | " dev_module = fadd.imported_modules[0]\n", 463 | " print(\"-----------------------GPU code--------------------------\")\n", 464 | " print(dev_module.get_source())\n", 465 | " \n", 466 | "else:\n", 467 | " print(fadd.get_source())" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## Save Compiled Module" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "- Besides runtime compilation, we can save the compiled modules into file and load them back later. This is called ahead of time compilation\n", 482 | "\n", 483 | "- The following code first does the following step:\n", 484 | " - It saves the compiled host module into an object file.\n", 485 | " - Then it saves the device module into a ptx file.\n", 486 | " - cc.create_shared calls a env compiler(GCC) to create a shared library." 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 24, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "from tvm.contrib import cc\n", 496 | "from tvm.contrib import util" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 25, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "temp = util.tempdir()\n", 506 | "fadd.save(file_name=temp.relpath(\"myadd.o\"))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 26, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "if tgt == \"cuda\":\n", 516 | " fadd.imported_modules[0].save(temp.relpath(\"myadd.ptx\"))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 27, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "cc.create_shared(output=temp.relpath(\"myadd.so\"), objects=[temp.relpath(\"myadd.o\")])" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 28, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "['myadd.tvm_meta.json', 'myadd.o', 'myadd.ptx', 'myadd.so']\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "print(temp.listdir())" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "## Load Compiled Module" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 29, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "fadd1 = tvm.module.load(temp.relpath(\"myadd.so\"))\n", 559 | "if tgt == \"cuda\":\n", 560 | " fadd1_dev = tvm.module.load(temp.relpath(\"myadd.ptx\"))\n", 561 | " fadd1.import_module(fadd1_dev)\n", 562 | "fadd1(a, b, c)\n", 563 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() 
+ b.asnumpy())\n" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "## Pack Everything into One Library" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "In the above example, we store the device and host code seperatedly. TVM also supports export everything as one shared library. Under the hood, we pack the device modules into binary blobs and link them together with the host code. " 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 30, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "fadd.export_library(temp.relpath(\"mypack.so\"))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 31, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "fadd2 = tvm.module.load(temp.relpath(\"mypack.so\"))" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 32, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "fadd2(a, b, c)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 33, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "**NOTE**: Runtime API and Thread-Safety\n", 621 | "\n", 622 | "The compiled modules of TVM do not depend on the TVM compiler. Instead, it only depends on a minimum runtime library. TVM runtime library wraps the device drivers and provides thread-safe and device agnostic call into the compiled functions.\n", 623 | "\n", 624 | "This means you can call the compiled TVM function from any thread, on any GPUs." 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 35, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stdout", 634 | "output_type": "stream", 635 | "text": [ 636 | "The version_information extension is already loaded. To reload it, use:\n", 637 | " %reload_ext version_information\n" 638 | ] 639 | }, 640 | { 641 | "data": { 642 | "application/json": { 643 | "Software versions": [ 644 | { 645 | "module": "Python", 646 | "version": "3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]" 647 | }, 648 | { 649 | "module": "IPython", 650 | "version": "6.4.0" 651 | }, 652 | { 653 | "module": "OS", 654 | "version": "Linux 4.13.0 41 generic x86_64 with debian stretch sid" 655 | }, 656 | { 657 | "module": "numpy", 658 | "version": "1.14.3" 659 | }, 660 | { 661 | "module": "tvm", 662 | "version": "0.2.0" 663 | } 664 | ] 665 | }, 666 | "text/html": [ 667 | "
<table>
<tr><th>Software</th><th>Version</th></tr>
<tr><td>Python</td><td>3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]</td></tr>
<tr><td>IPython</td><td>6.4.0</td></tr>
<tr><td>OS</td><td>Linux 4.13.0 41 generic x86_64 with debian stretch sid</td></tr>
<tr><td>numpy</td><td>1.14.3</td></tr>
<tr><td>tvm</td><td>0.2.0</td></tr>
<tr><td colspan='2'>Thu May 17 17:10:38 2018 CDT</td></tr>
</table>
" 668 | ], 669 | "text/latex": [ 670 | "\\begin{tabular}{|l|l|}\\hline\n", 671 | "{\\bf Software} & {\\bf Version} \\\\ \\hline\\hline\n", 672 | "Python & 3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] \\\\ \\hline\n", 673 | "IPython & 6.4.0 \\\\ \\hline\n", 674 | "OS & Linux 4.13.0 41 generic x86\\_64 with debian stretch sid \\\\ \\hline\n", 675 | "numpy & 1.14.3 \\\\ \\hline\n", 676 | "tvm & 0.2.0 \\\\ \\hline\n", 677 | "\\hline \\multicolumn{2}{|l|}{Thu May 17 17:10:38 2018 CDT} \\\\ \\hline\n", 678 | "\\end{tabular}\n" 679 | ], 680 | "text/plain": [ 681 | "Software versions\n", 682 | "Python 3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]\n", 683 | "IPython 6.4.0\n", 684 | "OS Linux 4.13.0 41 generic x86_64 with debian stretch sid\n", 685 | "numpy 1.14.3\n", 686 | "tvm 0.2.0\n", 687 | "Thu May 17 17:10:38 2018 CDT" 688 | ] 689 | }, 690 | "execution_count": 35, 691 | "metadata": {}, 692 | "output_type": "execute_result" 693 | } 694 | ], 695 | "source": [ 696 | "%load_ext version_information\n", 697 | "%version_information numpy, tvm" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.6.5" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 2 736 | } 737 | -------------------------------------------------------------------------------- /tvm-tutorials/reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Describe Sum of Rows\n", 18 | "\n", 19 | "`B = numpy.sum(A, axis=1)`" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "n = tvm.var(\"n\")\n", 29 | "m = tvm.var(\"m\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "A = tvm.placeholder((n, m), name=\"A\")\n", 39 | "k = tvm.reduce_axis(dom=(0, m), name=\"k\")\n", 40 | "B = tvm.compute(shape=(n,), fcompute=lambda i: tvm.sum(A[i, k], axis=k), name=\"B\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Schedule the Reduction" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "produce B {\n", 60 | " for (i, 0, n) {\n", 61 | " B[i] = 0.000000f\n", 62 | " for (k, 0, m) {\n", 63 | " B[i] = (B[i] + A[((i*m) + k)])\n", 64 | " }\n", 65 | " }\n", 66 | "}\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "s = tvm.create_schedule(B.op)\n", 73 | "print(tvm.lower(sch=s, args=[A, 
B], simple_mode=True))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Let's split both the row axis of B as well axis by different factors. The result is a nested reduction. " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "ko, ki = s[B].split(parent=B.op.reduce_axis[0], factor=16)\n", 90 | "xo, xi = s[B].split(parent=B.op.axis[0], factor=32)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "produce B {\n", 103 | " for (i.outer, 0, ((n + 31)/32)) {\n", 104 | " for (i.inner, 0, 32) {\n", 105 | " if (likely(((i.outer*32) < (n - i.inner)))) {\n", 106 | " B[((i.outer*32) + i.inner)] = 0.000000f\n", 107 | " }\n", 108 | " for (k.outer, 0, ((m + 15)/16)) {\n", 109 | " for (k.inner, 0, 16) {\n", 110 | " if (likely(((i.outer*32) < (n - i.inner)))) {\n", 111 | " if (likely(((k.outer*16) < (m - k.inner)))) {\n", 112 | " B[((i.outer*32) + i.inner)] = (B[((i.outer*32) + i.inner)] + A[(((((i.outer*32) + i.inner)*m) + (k.outer*16)) + k.inner)])\n", 113 | " }\n", 114 | " }\n", 115 | " }\n", 116 | " }\n", 117 | " }\n", 118 | " }\n", 119 | "}\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "If we are building a GPU kernel, we can bind the rows of B to GPU threads" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "produce B {\n", 145 | " // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = ((n + 31)/32)\n", 146 | " // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32\n", 147 | " if (likely(((blockIdx.x*32) < (n - threadIdx.x)))) {\n", 148 | " B[((blockIdx.x*32) + threadIdx.x)] = 0.000000f\n", 149 | " }\n", 150 | " for (k.outer, 0, ((m + 15)/16)) {\n", 151 | " for (k.inner, 0, 16) {\n", 152 | " if (likely(((blockIdx.x*32) < (n - threadIdx.x)))) {\n", 153 | " if (likely(((k.outer*16) < (m - k.inner)))) {\n", 154 | " B[((blockIdx.x*32) + threadIdx.x)] = (B[((blockIdx.x*32) + threadIdx.x)] + A[(((((blockIdx.x*32) + threadIdx.x)*m) + (k.outer*16)) + k.inner)])\n", 155 | " }\n", 156 | " }\n", 157 | " }\n", 158 | " }\n", 159 | "}\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "s[B].bind(ivar=xo, thread_ivar=tvm.thread_axis(\"blockIdx.x\"))\n", 166 | "s[B].bind(ivar=xi, thread_ivar=tvm.thread_axis(\"threadIdx.x\"))\n", 167 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Reduction Factoring and Parallelization" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "s = tvm.create_schedule(ops=B.op)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 10, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "ko, ki = s[B].split(parent=B.op.reduce_axis[0], factor=16)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 11, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "BF = 
s.rfactor(tensor=B, axis=ki, factor_axis=0)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 12, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "// attr [B.rf] storage_scope = \"global\"\n", 214 | "allocate B.rf[float32 * 16 * n]\n", 215 | "produce B.rf {\n", 216 | " for (k.inner, 0, 16) {\n", 217 | " for (i, 0, n) {\n", 218 | " B.rf[((k.inner*n) + i)] = 0.000000f\n", 219 | " for (k.outer, 0, ((m + 15)/16)) {\n", 220 | " if ((k.inner < (m - (k.outer*16)))) {\n", 221 | " B.rf[((k.inner*n) + i)] = (B.rf[((k.inner*n) + i)] + A[((k.inner + (i*m)) + (k.outer*16))])\n", 222 | " }\n", 223 | " }\n", 224 | " }\n", 225 | " }\n", 226 | "}\n", 227 | "produce B {\n", 228 | " for (ax0, 0, n) {\n", 229 | " B[ax0] = 0.000000f\n", 230 | " for (k.inner.v, 0, 16) {\n", 231 | " B[ax0] = (B[ax0] + B.rf[(ax0 + (k.inner.v*n))])\n", 232 | " }\n", 233 | " }\n", 234 | "}\n", 235 | "\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[B.rf(k.inner.v, ax0)], axis=[iter_var(k.inner.v, Range(min=0, extent=16))], where=(uint1)1, value_index=0)]\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "print(s[B].op.body)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Cross Thread Reduction " 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "- We can now parallelize over the factored axis. \n", 272 | "- Here the reduction axis of B is marked to be a thread.\n", 273 | "- TVM allows reduction axis to be marked as thread if it is the only axis in reduction and cross thread reduction is possible in the device. " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 15, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "extern \"C\" __global__ void default_function__kernel0( float* __restrict__ A, float* __restrict__ B, int m, int n) {\n", 286 | " float B_rf[1];\n", 287 | " __shared__ float red_buf0[512];\n", 288 | " B_rf[0] = 0.000000e+00f;\n", 289 | " for (int k_outer = 0; k_outer < ((15 + m) / 16); ++k_outer) {\n", 290 | " if ((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) {\n", 291 | " if (((int)threadIdx.x) < (m - (k_outer * 16))) {\n", 292 | " B_rf[0] = (B_rf[0] + A[(((((((int)blockIdx.x) * 32) + ((int)threadIdx.y)) * m) + ((int)threadIdx.x)) + (k_outer * 16))]);\n", 293 | " }\n", 294 | " }\n", 295 | " }\n", 296 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) ? 
B_rf[0] : 0.000000e+00f);\n", 297 | " __syncthreads();\n", 298 | " if (((int)threadIdx.x) < 8) {\n", 299 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((8 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 300 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((4 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 301 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((2 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 302 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((1 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 303 | " }\n", 304 | " __syncthreads();\n", 305 | " if ((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) {\n", 306 | " if (((int)threadIdx.x) == 0) {\n", 307 | " B[((((int)blockIdx.x) * 32) + ((int)threadIdx.y))] = ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.y) * 16)];\n", 308 | " }\n", 309 | " }\n", 310 | "}\n", 311 | "\n", 312 | "\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "xo, xi = s[B].split(s[B].op.axis[0], factor=32)\n", 318 | "s[B].bind(xo, tvm.thread_axis(\"blockIdx.x\"))\n", 319 | "s[B].bind(xi, tvm.thread_axis(\"threadIdx.y\"))\n", 320 | "tx = tvm.thread_axis(\"threadIdx.x\")\n", 321 | "s[B].bind(s[B].op.reduce_axis[0], tx)\n", 322 | "s[BF].compute_at(s[B], s[B].op.reduce_axis[0])\n", 323 | "s[B].set_store_predicate(tx.var.equal(0))\n", 324 | "fcuda = tvm.build(s, [A, B], \"cuda\")\n", 325 | "print(fcuda.imported_modules[0].get_source())" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 16, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "nn = 128\n", 335 | "ctx = tvm.gpu(0)\n", 336 | "a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)\n", 337 | "b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)\n", 338 | "fcuda(a, b)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 17, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "np.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Describe Convolution via 2D Reduction " 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 18, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "n = tvm.var(\"n\")\n", 364 | "Input = tvm.placeholder(shape=(n, n), name=\"Input\")\n", 365 | "Filter = tvm.placeholder(shape=(3, 3), name=\"Filter\")" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 19, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "di = tvm.reduce_axis(dom=(0, 3), name=\"di\")\n", 375 | "dj = tvm.reduce_axis(dom=(0, 3), name=\"dj\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 20, 381 | "metadata": {}, 382 | 
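To make the `rfactor` plus cross-thread reduction above concrete, here is a numpy-only illustration (not TVM code, and not part of the notebook) of what the factored intermediate `B.rf` holds under the `factor=16` split: one partial row sum per lane `k.inner`, followed by a reduction across the lanes.

```python
# NumPy-only illustration of the rfactor decomposition above (factor = 16):
# B_rf[lane, i] is the partial sum of A[i, k] over all k with k % 16 == lane,
# and the final stage reduces B_rf across the lane axis.
import numpy as np

n_rows, m_cols, lanes = 8, 40, 16
A_np = np.random.uniform(size=(n_rows, m_cols)).astype("float32")

B_rf = np.zeros((lanes, n_rows), dtype="float32")
for lane in range(lanes):
    B_rf[lane] = A_np[:, lane::lanes].sum(axis=1)   # per-lane partial sums

B_np = B_rf.sum(axis=0)                              # cross-lane reduction
np.testing.assert_allclose(B_np, A_np.sum(axis=1), rtol=1e-5)
```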
"outputs": [], 383 | "source": [ 384 | "Output = tvm.compute(shape=(n-2, n-2),\n", 385 | " fcompute=lambda i, j: tvm.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),\n", 386 | " name=\"Output\")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 21, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "s = tvm.create_schedule(ops=Output.op)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 22, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "produce Output {\n", 408 | " for (i, 0, (n + -2)) {\n", 409 | " for (j, 0, (n + -2)) {\n", 410 | " Output[((i*(n + -2)) + j)] = 0.000000f\n", 411 | " for (di, 0, 3) {\n", 412 | " for (dj, 0, 3) {\n", 413 | " Output[((i*(n + -2)) + j)] = (Output[((i*(n + -2)) + j)] + (Input[((j + ((i + di)*n)) + dj)]*Filter[((di*3) + dj)]))\n", 414 | " }\n", 415 | " }\n", 416 | " }\n", 417 | " }\n", 418 | "}\n", 419 | "\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "print(tvm.lower(sch=s, args=[Input, Filter, Output], simple_mode=True))" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "## Define General Commutative Reduction Operation " 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 23, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "n = tvm.var('n')\n", 441 | "m = tvm.var('m')\n", 442 | "product = tvm.comm_reducer(lambda x, y: x*y,\n", 443 | " lambda t: tvm.const(1, dtype=t), name=\"product\")\n", 444 | "A = tvm.placeholder((n, m), name='A')\n", 445 | "k = tvm.reduce_axis((0, m), name='k')\n", 446 | "B = tvm.compute((n,), lambda i: product(A[i, k], axis=k), name='B')" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 24, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "s = tvm.create_schedule(ops=B.op)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 26, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "produce B {\n", 468 | " for (i, 0, n) {\n", 469 | " B[i] = 1.000000f\n", 470 | " for (k, 0, m) {\n", 471 | " B[i] = (B[i]*A[((i*m) + k)])\n", 472 | " }\n", 473 | " }\n", 474 | "}\n", 475 | "\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [] 489 | } 490 | ], 491 | "metadata": { 492 | "kernelspec": { 493 | "display_name": "Python 3", 494 | "language": "python", 495 | "name": "python3" 496 | }, 497 | "language_info": { 498 | "codemirror_mode": { 499 | "name": "ipython", 500 | "version": 3 501 | }, 502 | "file_extension": ".py", 503 | "mimetype": "text/x-python", 504 | "name": "python", 505 | "nbconvert_exporter": "python", 506 | "pygments_lexer": "ipython3", 507 | "version": "3.6.5" 508 | } 509 | }, 510 | "nbformat": 4, 511 | "nbformat_minor": 2 512 | } 513 | -------------------------------------------------------------------------------- /tvm-tutorials/schedule_primitives.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 
| "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- There often exist several methods to compute the same result, however, different methods will result in different locality and performance. So TVM asks user to provide how to execute the computation called Schedule.\n", 18 | "\n", 19 | "- A Schedule is a set of transformation of computation that transforms the loop of computations in the program." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# declare some variables for use later\n", 29 | "n = tvm.var(\"n\")\n", 30 | "m = tvm.var(\"m\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "- A schedule can be created from a list of ops, by default the schedule computes tensor in a serial manner in a row-major order. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# declare a matrix element-wise multiply \n", 47 | "A = tvm.placeholder(shape=(m, n), name=\"A\")\n", 48 | "B = tvm.placeholder(shape=(m, n), name=\"B\")\n", 49 | "C = tvm.compute(shape=(m, n), \n", 50 | " fcompute=lambda i, j: A[i, j] * B[i, j], name=\"C\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "s = tvm.create_schedule(ops=[C.op])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "lower will transform the computation from definition to the real callable function. With argument `simple_mode=True`, it will return you a readable C like statement" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "produce C {\n", 79 | " for (i, 0, m) {\n", 80 | " for (j, 0, n) {\n", 81 | " C[((i*n) + j)] = (A[((i*n) + j)]*B[((i*n) + j)])\n", 82 | " }\n", 83 | " }\n", 84 | "}\n", 85 | "\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "print(tvm.lower(sch=s, args=[A, B, C], simple_mode=True))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "- One schedule is composed by multiple stages, and one stage represents schedule for one operation. TVM provides various methods to schedule every stage. " 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Split" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "`split` can split a specified axis into two axises by `factor`. 
" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "A = tvm.placeholder(shape=(m, ), name=\"A\")\n", 121 | "B = tvm.compute(shape=(m, ),\n", 122 | " fcompute=lambda i: A[i] * 2,\n", 123 | " name=\"B\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "s = tvm.create_schedule(B.op)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "xo, xi = s[B].split(B.op.axis[0], factor=32)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "produce B {\n", 154 | " for (i.outer, 0, ((m + 31)/32)) {\n", 155 | " for (i.inner, 0, 32) {\n", 156 | " if (likely(((i.outer*32) < (m - i.inner)))) {\n", 157 | " B[((i.outer*32) + i.inner)] = (A[((i.outer*32) + i.inner)]*2.000000f)\n", 158 | " }\n", 159 | " }\n", 160 | " }\n", 161 | "}\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "You can also split a axis by nparts, which splits the axis contrary with factor" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "produce B {\n", 187 | " for (i.outer, 0, 32) {\n", 188 | " for (i.inner, 0, ((m + 31)/32)) {\n", 189 | " if (likely(((i.outer*((m + 31)/32)) < (m - i.inner)))) {\n", 190 | " if (likely(((0 - i.inner) <= (i.outer*((m + 31)/32))))) {\n", 191 | " B[((i.outer*((m + 31)/32)) + i.inner)] = (A[((i.outer*((m + 31)/32)) + i.inner)]*2.000000f)\n", 192 | " }\n", 193 | " }\n", 194 | " }\n", 195 | " }\n", 196 | "}\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "A = tvm.placeholder(shape=(m, ), name=\"A\")\n", 203 | "B = tvm.compute(shape=(m, ),\n", 204 | " fcompute=lambda i: A[i] * 2,\n", 205 | " name=\"B\")\n", 206 | "s = tvm.create_schedule(ops=B.op)\n", 207 | "bx, tx = s[B].split(B.op.axis[0], nparts=32)\n", 208 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## tile \n", 216 | "\n", 217 | "`tile` helps execute the computation tile by tile over two axises. 
\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 12, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "produce B {\n", 230 | " for (i.outer, 0, ((m + 9)/10)) {\n", 231 | " for (j.outer, 0, ((n + 4)/5)) {\n", 232 | " for (i.inner, 0, 10) {\n", 233 | " for (j.inner, 0, 5) {\n", 234 | " if (likely(((i.outer*10) < (m - i.inner)))) {\n", 235 | " if (likely(((j.outer*5) < (n - j.inner)))) {\n", 236 | " B[(((j.outer*5) + (((i.outer*10) + i.inner)*n)) + j.inner)] = A[(((j.outer*5) + (((i.outer*10) + i.inner)*n)) + j.inner)]\n", 237 | " }\n", 238 | " }\n", 239 | " }\n", 240 | " }\n", 241 | " }\n", 242 | " }\n", 243 | "}\n", 244 | "\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "A = tvm.placeholder(shape=(m, n), name=\"A\")\n", 250 | "B = tvm.compute(shape=(m, n), \n", 251 | " fcompute=lambda i, j: A[i, j], name=\"B\")\n", 252 | "s = tvm.create_schedule(B.op)\n", 253 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n", 254 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## fuse\n", 262 | "\n", 263 | "`fuse` can fuse two consecutive axises of one computation. " 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "A = tvm.placeholder((m, n), name='A')\n", 273 | "B = tvm.compute((m, n), lambda i, j: A[i, j], name='B')\n", 274 | "\n", 275 | "s = tvm.create_schedule(B.op)\n", 276 | "# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\n", 277 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "produce B {\n", 290 | " for (i.outer, 0, ((m + 9)/10)) {\n", 291 | " for (j.outer, 0, ((n + 4)/5)) {\n", 292 | " for (i.inner.j.inner.fused, 0, 50) {\n", 293 | " if (likely(((i.outer*10) < (m - (i.inner.j.inner.fused/5))))) {\n", 294 | " if (likely(((j.outer*5) < (n - (i.inner.j.inner.fused % 5))))) {\n", 295 | " B[(((j.outer*5) + (i.inner.j.inner.fused % 5)) + (((i.outer*10) + (i.inner.j.inner.fused/5))*n))] = A[(((j.outer*5) + (i.inner.j.inner.fused % 5)) + (((i.outer*10) + (i.inner.j.inner.fused/5))*n))]\n", 296 | " }\n", 297 | " }\n", 298 | " }\n", 299 | " }\n", 300 | " }\n", 301 | "}\n", 302 | "\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\n", 308 | "fused = s[B].fuse(xi, yi)\n", 309 | "print(tvm.lower(s, [A, B], simple_mode=True))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## reorder \n", 317 | "\n", 318 | "`reorder` can reorder the axises in the specified order." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 16, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "produce B {\n", 331 | " for (i.inner, 0, 10) {\n", 332 | " for (j.outer, 0, ((n + 4)/5)) {\n", 333 | " for (i.outer, 0, ((m + 9)/10)) {\n", 334 | " for (j.inner, 0, 5) {\n", 335 | " if (likely((i.inner < (m - (i.outer*10))))) {\n", 336 | " if (likely(((j.outer*5) < (n - j.inner)))) {\n", 337 | " B[(((j.outer*5) + ((i.inner + (i.outer*10))*n)) + j.inner)] = A[(((j.outer*5) + ((i.inner + (i.outer*10))*n)) + j.inner)]\n", 338 | " }\n", 339 | " }\n", 340 | " }\n", 341 | " }\n", 342 | " }\n", 343 | " }\n", 344 | "}\n", 345 | "\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "A = tvm.placeholder((m, n), name='A')\n", 351 | "B = tvm.compute((m, n), lambda i, j: A[i, j], name='B')\n", 352 | "\n", 353 | "s = tvm.create_schedule(B.op)\n", 354 | "# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\n", 355 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n", 356 | "# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\n", 357 | "s[B].reorder(xi, yo, xo, yi)\n", 358 | "print(tvm.lower(s, [A, B], simple_mode=True))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## bind\n", 366 | "`bind` can bind a specified axis with a thread axis, often used in gpu programming." 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 17, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "produce B {\n", 379 | " // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = ((n + 63)/64)\n", 380 | " // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 64\n", 381 | " if (likely(((blockIdx.x*64) < (n - threadIdx.x)))) {\n", 382 | " B[((blockIdx.x*64) + threadIdx.x)] = (A[((blockIdx.x*64) + threadIdx.x)]*2.000000f)\n", 383 | " }\n", 384 | "}\n", 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "A = tvm.placeholder((n,), name='A')\n", 391 | "B = tvm.compute(A.shape, lambda i: A[i] * 2, name='B')\n", 392 | "\n", 393 | "s = tvm.create_schedule(B.op)\n", 394 | "bx, tx = s[B].split(B.op.axis[0], factor=64)\n", 395 | "s[B].bind(bx, tvm.thread_axis(\"blockIdx.x\"))\n", 396 | "s[B].bind(tx, tvm.thread_axis(\"threadIdx.x\"))\n", 397 | "print(tvm.lower(s, [A, B], simple_mode=True))\n" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "## compute_at\n", 405 | "\n", 406 | "For a schedule consists of multiple operators, tvm will compute tensors at the root separately by default." 
407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 18, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "produce B {\n", 419 | " for (i, 0, m) {\n", 420 | " B[i] = (A[i] + 1.000000f)\n", 421 | " }\n", 422 | "}\n", 423 | "produce C {\n", 424 | " for (i, 0, m) {\n", 425 | " C[i] = (B[i]*2.000000f)\n", 426 | " }\n", 427 | "}\n", 428 | "\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "A = tvm.placeholder((m,), name='A')\n", 434 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 435 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 436 | "\n", 437 | "s = tvm.create_schedule(C.op)\n", 438 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "compute_at can move computation of B into the first axis of computation of C." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 19, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "produce C {\n", 458 | " for (i, 0, m) {\n", 459 | " produce B {\n", 460 | " B[i] = (A[i] + 1.000000f)\n", 461 | " }\n", 462 | " C[i] = (B[i]*2.000000f)\n", 463 | " }\n", 464 | "}\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "A = tvm.placeholder((m,), name='A')\n", 471 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 472 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 473 | "\n", 474 | "s = tvm.create_schedule(C.op)\n", 475 | "s[B].compute_at(s[C], C.op.axis[0])\n", 476 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "## compute_inline\n", 484 | "\n", 485 | "`compute_inline` can mark one stage as inline, then the body of computation will be expanded and inserted at the address where the tensor is required." 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 20, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "produce C {\n", 498 | " for (i, 0, m) {\n", 499 | " C[i] = ((A[i]*2.000000f) + 2.000000f)\n", 500 | " }\n", 501 | "}\n", 502 | "\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "A = tvm.placeholder((m,), name='A')\n", 508 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 509 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 510 | "\n", 511 | "s = tvm.create_schedule(C.op)\n", 512 | "s[B].compute_inline()\n", 513 | "print(tvm.lower(s, [A, B, C], simple_mode=True))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "## compute_root\n", 521 | "\n", 522 | "`compute_root` can move computation of one stage to the root." 
523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 21, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "produce B {\n", 535 | " for (i, 0, m) {\n", 536 | " B[i] = (A[i] + 1.000000f)\n", 537 | " }\n", 538 | "}\n", 539 | "produce C {\n", 540 | " for (i, 0, m) {\n", 541 | " C[i] = (B[i]*2.000000f)\n", 542 | " }\n", 543 | "}\n", 544 | "\n" 545 | ] 546 | } 547 | ], 548 | "source": [ 549 | "A = tvm.placeholder((m,), name='A')\n", 550 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 551 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 552 | "\n", 553 | "s = tvm.create_schedule(C.op)\n", 554 | "s[B].compute_at(s[C], C.op.axis[0])\n", 555 | "s[B].compute_root()\n", 556 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 557 | ] 558 | } 559 | ], 560 | "metadata": { 561 | "kernelspec": { 562 | "display_name": "Python 3", 563 | "language": "python", 564 | "name": "python3" 565 | }, 566 | "language_info": { 567 | "codemirror_mode": { 568 | "name": "ipython", 569 | "version": 3 570 | }, 571 | "file_extension": ".py", 572 | "mimetype": "text/x-python", 573 | "name": "python", 574 | "nbconvert_exporter": "python", 575 | "pygments_lexer": "ipython3", 576 | "version": "3.6.5" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 2 581 | } 582 | --------------------------------------------------------------------------------
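As a closing sketch (not part of the notebooks), the primitives shown in `schedule_primitives.ipynb` compose freely on a single stage: the example below tiles the 2-D copy, fuses the inner pair of axes, splits the fused axis again, and prints the lowered IR so the resulting loop nest can be inspected.

```python
# Sketch: composing tile, fuse, and split on the 2-D copy from the notebook above.
import tvm

m = tvm.var("m")
n = tvm.var("n")
A = tvm.placeholder((m, n), name="A")
B = tvm.compute((m, n), lambda i, j: A[i, j], name="B")

s = tvm.create_schedule(B.op)
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
fused = s[B].fuse(xi, yi)               # (i.inner, j.inner) -> one 50-iteration axis
fo, fi = s[B].split(fused, factor=8)    # split the fused axis once more
print(tvm.lower(s, [A, B], simple_mode=True))
```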