├── .gitignore ├── LICENSE ├── README.md ├── config.mk ├── tvm-paper-notes.md └── tvm-tutorials ├── compute-and-reduce-with-tuple-inputs.ipynb ├── external-tensor-functions.ipynb ├── getting-started.ipynb ├── reduction.ipynb └── schedule_primitives.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Anderson Banihirwe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](http://tvmlang.org/images/main/stack_tvmlang.png) (Image Source: http://tvmlang.org/) 2 | 3 | # TVM in Action 4 | 5 | [TVM: End-to-End Optimization Stack for Deep Learning](https://github.com/dmlc/tvm) 6 | 7 | This repo hosts my notes and tutorial materials (source code) for the TVM stack as I explore the incredible explosion of deep-learning frameworks and how to bring them together. 8 | 9 | # [Summary of TVM: End-to-End Optimization Stack for Deep Learning](https://arxiv.org/abs/1802.04799) 10 | 11 | ## Abstract 12 | 13 | - Scalable frameworks, such as TensorFlow, MXNet, Caffe, and PyTorch, are optimized for a narrow range of server-class GPUs. 14 | - Deploying workloads to other platforms such as mobile phones, IoT devices, and specialized accelerators (FPGAs, ASICs) requires laborious manual effort. 15 | - TVM is an end-to-end optimization stack that exposes: 16 | - graph-level 17 | - operator-level optimizations 18 | ---> to provide performance portability to deep learning workloads across diverse hardware back-ends. 19 | 20 | ## Introduction 21 | 22 | - The number and diversity of specialized deep learning (DL) accelerators pose an adoption challenge. 23 | - They introduce new hardware abstractions that modern compilers and frameworks are ill-equipped to deal with. 24 | 25 | - Providing support in various DL frameworks for diverse hardware back-ends in the present ad-hoc fashion is **unsustainable**. 26 | 27 | - Hardware targets diverge significantly in terms of memory organization, compute, etc. 28 | 29 | ![](https://i.imgur.com/XRSZMt0.png) 30 | 31 | - *The Goal*: **easily deploy DL workloads to all kinds of hardware targets, including embedded devices, GPUs, FPGAs, and ASICs (e.g., the TPU).** 32 | 33 | - Current DL frameworks rely on a **computational graph intermediate representation** to implement optimizations such as: 34 | - auto differentiation 35 | - dynamic memory management 36 | 37 | - **Graph-level optimizations** are often too high-level to handle hardware back-end-specific **operator transformations**. 38 | - **Current operator-level libraries** that DL frameworks rely on are: 39 | - too rigid 40 | - too specialized 41 | 42 | ---> to be easily ported **across hardware devices** 43 | 44 | - To address these weaknesses, we need a **compiler framework** that can expose optimization opportunities across both 45 | - graph-level and 46 | - operator-level 47 | 48 | ---> to deliver competitive performance across hardware back-ends. 49 | 50 | ### Four fundamental challenges at the computation graph level and tensor operator level 51 | 52 | 1. **High-level dataflow rewriting:** 53 | - Different hardware devices may have vastly different memory hierarchies. 54 | 55 | - Enabling strategies to fuse operators and optimize data layouts is crucial for optimizing memory access. 56 | 57 | 2. **Memory reuse across threads:** 58 | - Modern GPUs and specialized accelerators have memory that can be shared across compute cores. 59 | - The traditional shared-nothing nested parallel model is no longer optimal. 60 | - Cooperation among threads on shared-memory loads is required for optimized kernels. 61 | 62 | 3. **Tensorized compute intrinsics:** 63 | - The latest hardware provides new instructions that go beyond vector operations, like the GEMM operator in the TPU or the tensor core in NVIDIA's Volta.
64 | - Consequently, the scheduling procedure must break computation into tensor arithmetic intrinsics instead of scalar or vector code. 65 | 66 | 4. **Latency Hiding** 67 | - Traditional architectures with simultaneous multithreading and automatically managed caches implicitly hide latency in modern CPUs/GPUs. 68 | - Specialized accelerator designs favor leaner control and offload most of the scheduling complexity to the compiler stack. 69 | - Still, scheduling must be performed carefully to hide memory access latency. 70 | 71 | 72 | ### TVM: An End-to-End Optimization Stack 73 | 74 | - An end-to-end optimizing compiler stack to lower and fine-tune DL workloads to diverse hardware back-ends. 75 | - Designed to separate: 76 | - the algorithm description 77 | - schedule 78 | - hardware interface 79 | - This separation enables **support for novel specialized accelerators** and **their corresponding new intrinsics**. 80 | - TVM presents **two optimization layers**: 81 | - a computation graph optimization layer to address: 82 | - High-level dataflow rewriting 83 | - a tensor optimization layer with new schedule primitives to address: 84 | - memory reuse across threads 85 | - tensorized compute intrinsics 86 | - latency hiding 87 | 88 | ## Optimizing Computational Graphs 89 | 90 | ### Computational Graph 91 | 92 | - Computational graphs are a common way to represent programs in DL frameworks. 93 | - They provide a global view of computation tasks, yet avoid specifying how each computation task needs to be implemented. 94 | 95 | 96 | 97 | ### Operator Fusion 98 | 99 | - An optimization that can greatly reduce execution time, particularly on GPUs and specialized accelerators. 100 | - The idea is to **combine multiple operators together into a single kernel without saving the intermediate results back into global memory**. 101 | 102 | ![](https://i.imgur.com/mlNhoDT.png) 103 | 104 | **Four categories of graph operators**: 105 | 106 | - Injective (one-to-one map) 107 | - Reduction 108 | - Complex-out-fusable (can fuse element-wise map to output) 109 | - Opaque (cannot be fused) 110 | 111 | ![](https://i.imgur.com/XnhSWVN.png) 112 | 113 | ### Data Layout Transformation 114 | 115 | - Tensor operations are the basic operators of computational graphs. 116 | - They can have divergent layout requirements across different operations. 117 | - Optimizing data layout starts with specifying the preferred data layout of each operator, given the constraints dictating their implementation in hardware. 118 | 119 | ![](https://i.imgur.com/0J5QxGs.png) 120 | 121 | ### Limitations of Graph-Level Optimizations 122 | 123 | - They are only as effective as what the operator library provides. 124 | - Currently, the few DL frameworks that support operator fusion require the operator library to provide an implementation of the fused patterns. 125 | - With more network operators introduced on a regular basis, this approach is no longer sustainable when targeting an increasing number of hardware back-ends. 126 | - It is not feasible to handcraft operator kernels for this massive space of back-end-specific operators. 127 | - TVM provides a code-generation approach that can generate tensor operators. 128 | 129 | ## Optimizing Tensor Operations 130 | 131 | ### Tensor Expression Language 132 | 133 | - TVM introduces a dataflow tensor expression language to support automatic code generation.
134 | - Unlike high-level computation graph languages, where the implementation of tensor operations is opaque, *each operation is described in an index formula expression language*. 135 | 136 | ![](https://i.imgur.com/LG1pguT.png) 137 | 138 | - The TVM tensor expression language supports common arithmetic and math operations found in common languages like C. 139 | - TVM explicitly introduces a **commutative reduction** operator to easily schedule commutative reductions across multiple threads. 140 | - TVM further introduces a **high-order scan operator** that can combine basic compute operators to form recurrent computations over time. 141 | 142 | ### Schedule Space 143 | 144 | - Given a tensor expression, it is challenging to create high-performance implementations for each hardware back-end. 145 | - Each optimized low-level program is the result of different combinations of scheduling strategies, imposing a large burden on the kernel writer. 146 | - TVM adopts the **principle of decoupling compute descriptions from schedule optimizations**. 147 | - Schedules are the specific rules that lower compute descriptions down to back-end-optimized implementations. 148 | 149 | ![](https://i.imgur.com/JUikGQz.png) 150 | 151 | ![](https://i.imgur.com/BCg6gCz.png) 152 | 153 | 154 | ### Nested Parallelism with Cooperation 155 | 156 | - Parallel programming is key to improving the efficiency of compute-intensive kernels in deep learning workloads. 157 | - Modern GPUs offer massive parallelism 158 | 159 | ---> requiring TVM to bake parallel programming models into schedule transformations. 160 | 161 | - Most existing solutions adopt a parallel programming model referred to as [nested parallel programs](https://youtu.be/4lS_WThsFoM), which is a form of [fork-join parallelism](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model). 162 | - TVM uses a parallel schedule primitive to parallelize a data-parallel task. 163 | - Each parallel task can be further recursively subdivided into subtasks to exploit the multi-level thread hierarchy on the target architecture (e.g., thread groups in a GPU). 164 | - This model is called **shared-nothing nested parallelism**: 165 | - One working thread cannot look at the data of its sibling within the same parallel computation stage. 166 | - Interactions between sibling threads happen at the join stage, when the subtasks are done and the next stage can consume the data produced by the previous stage. 167 | - This programming model **does not enable threads to cooperate with each other in order to perform a collective task within the same parallel stage**. 168 | 169 | - A better alternative to the shared-nothing approach is to **fetch data cooperatively across threads**. 170 | - This pattern is well known in GPU programming using languages like CUDA, OpenCL, and Metal. 171 | - **It has not previously been implemented as a schedule primitive.** 172 | - TVM introduces the **concept of memory scopes to the schedule space**, so that a stage can be marked as shared. 173 | - Without memory scopes, automatic scope inference will mark the relevant stage as thread-local. 174 | - Memory scopes are useful for GPUs. 175 | - Memory scopes allow us to tag special memory buffers and create special lowering rules when targeting specialized deep learning accelerators.
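To make the cooperative-fetching idea above concrete, here is a minimal sketch in the older TVM 0.x Python API that the notebooks in this repo use; the toy stencil operator, the tile factor of 64, and the buffer names are illustrative assumptions rather than the paper's exact kernel. The `cache_read` stage lives in the `"shared"` memory scope and its load loop is bound to `threadIdx.x`, so the threads of a block cooperate on filling the tile before the compute stage consumes it.

```python
import tvm

# Assumed toy operator: a 1-D stencil B[i] = A[i] + A[i + 1].
n = 1024
A = tvm.placeholder((n + 1,), name="A")
B = tvm.compute((n,), lambda i: A[i] + A[i + 1], name="B")

s = tvm.create_schedule(B.op)
# Cache stage for A, placed in the "shared" memory scope.
AA = s.cache_read(A, "shared", [B])

# Shared-nothing nested parallelism: blocks over the outer loop, threads inside.
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))

# Cooperation: fill the shared tile once per block and spread the load
# across the block's 64 threads.
s[AA].compute_at(s[B], bx)
xo, xi = s[AA].split(s[AA].op.axis[0], nparts=64)
s[AA].bind(xo, tvm.thread_axis("threadIdx.x"))

print(tvm.lower(s, [A, B], simple_mode=True))
```

Without the shared-scope cache stage, scope inference keeps each thread's data thread-local, which is exactly the shared-nothing behavior described in the bullets above.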
176 | 177 | ![](https://i.imgur.com/HHYtujL.png) 178 | 179 | 180 | ### Tensorization: Generalizing the Hardware Interface 181 | 182 | - The **tensorization** problem is analogous to the **vectorization** problem for [SIMD architectures](https://en.wikipedia.org/wiki/SIMD). 183 | - Tensorization differs significantly from vectorization: 184 | - The inputs to the tensor compute primitives are multi-dimensional, with fixed or variable lengths, and dictate different data layouts. 185 | - We cannot resort to a fixed set of primitives, as new DL accelerators are emerging with their own flavors of tensor instructions. 186 | - To solve this challenge, TVM **separates the hardware interface from the schedule**: 187 | - TVM introduces a tensor intrinsic declaration mechanism. 188 | - TVM uses the tensor expression language to declare the behavior of each new hardware intrinsic, as well as the lowering rule associated with it. 189 | - TVM introduces a **tensorize** schedule primitive to replace a unit of computation with the corresponding tensor intrinsics. 190 | - The compiler matches the computation pattern with a hardware declaration and lowers it to the corresponding hardware intrinsic. 191 | 192 | 193 | ### Compiler Support for Latency Hiding 194 | 195 | - **Latency hiding** refers to the process of overlapping memory operations with computation to maximize memory and compute utilization. 196 | - It requires different strategies depending on the hardware back-end that is being targeted. 197 | - On CPUs, memory latency hiding is achieved **implicitly with simultaneous multithreading** or **hardware prefetching techniques**. 198 | - GPUs rely on **rapid context switching of many warps of threads** to maximize the utilization of functional units. 199 | - TVM provides a virtual threading schedule primitive that lets the programmer specify a high-level data-parallel program that TVM automatically lowers to a low-level explicit data dependence program. 200 | 201 | 202 | ## Code Generation and Runtime Support 203 | 204 | ### Code Generation 205 | 206 | - For a specific tuple of data-flow declaration, axis relation hyper-graph, and schedule tree, TVM can generate lowered code by: 207 | - iteratively traversing the schedule tree 208 | - inferring the dependent bounds of the input tensors (using the axis relation hyper-graph) 209 | - generating the loop nest in the low-level code 210 | - The code is lowered to an in-memory representation of an imperative C-style loop program. 211 | - TVM reuses a variant of Halide's loop program data structure in this process. 212 | - TVM reuses passes from Halide for common lowering primitives like storage flattening and unrolling, 213 | - and adds GPU/accelerator-specific transformations such as: 214 | - *synchronization point detection* 215 | - *virtual thread injection* 216 | - *module generation* 217 | - Finally, the loop program is transformed into **LLVM** or **CUDA/Metal/OpenCL** source code. 218 | 219 | ### Runtime Support 220 | 221 | - For GPU programs, TVM builds the host and device modules **separately** and provides a runtime module system that launches kernels using the corresponding driver APIs. 222 | 223 | ### Remote Deployment Profiling 224 | 225 | - TVM includes infrastructure to make profiling and autotuning easier on embedded devices.
226 | - Traditionally, targeting an embedded device for tuning requires: 227 | - cross-compiling on the host side, 228 | - copying to the target device, 229 | - and timing the execution. 230 | 231 | - TVM provides remote function call support. Through the **RPC interface**: 232 | - TVM compiles the program with a host compiler, 233 | - uploads it to the remote embedded device, 234 | - runs the function remotely, 235 | - and accesses the results in the same script on the host. 236 | 237 | ![](https://i.imgur.com/oL0Z9pp.png) 238 | 239 | 240 | ## Conclusion 241 | 242 | - TVM provides an end-to-end stack to solve fundamental optimization challenges across a diverse set of hardware back-ends. 243 | - TVM can encourage more studies of programming languages and compilation, and open new opportunities for hardware co-design techniques for deep learning systems. 244 | 245 | -------------------------------------------------------------------------------- /config.mk: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------- 2 | # Template configuration for compiling 3 | # 4 | # If you want to change the configuration, please use the following 5 | # steps. Assume you are on the root directory. First copy this 6 | # file so that any local changes will be ignored by git 7 | # 8 | # $ cp make/config.mk . 9 | # 10 | # Next modify the corresponding entries, and then compile by 11 | # 12 | # $ make 13 | # 14 | # or build in parallel with 8 threads 15 | # 16 | # $ make -j8 17 | #------------------------------------------------------------------------------- 18 | 19 | # whether compile with debug 20 | DEBUG = 0 21 | 22 | # the additional link flags you want to add 23 | ADD_LDFLAGS = 24 | 25 | # the additional compile flags you want to add 26 | ADD_CFLAGS = 27 | 28 | #--------------------------------------------- 29 | # Backend runtimes. 30 | #--------------------------------------------- 31 | # whether enable CUDA during compile 32 | USE_CUDA = 1 33 | 34 | # add the path to the CUDA library to the link and compile flags 35 | # if you have already added them to the environment variables. 36 | CUDA_PATH = /usr/local/cuda 37 | 38 | # ROCM 39 | USE_ROCM = 0 40 | 41 | # whether enable OpenCL during compile 42 | USE_OPENCL = 1 43 | 44 | # whether enable Metal during compile 45 | USE_METAL = 0 46 | 47 | # whether enable SGX during compile 48 | USE_SGX = 0 49 | SGX_SDK = /opt/sgxsdk 50 | 51 | # Whether enable RPC during compile 52 | USE_RPC = 1 53 | 54 | # Whether enable tiny embedded graph runtime. 55 | USE_GRAPH_RUNTIME = 1 56 | 57 | # Whether enable additional graph debug functions 58 | USE_GRAPH_RUNTIME_DEBUG = 0 59 | 60 | # whether build with LLVM support 61 | # Requires LLVM version >= 4.0 62 | # Set LLVM_CONFIG to your version, uncomment to build with llvm support 63 | # 64 | LLVM_CONFIG = llvm-config-6.0 65 | #--------------------------------------------- 66 | # Contrib optional libraries.
67 | #--------------------------------------------- 68 | # Whether use BLAS, choices: openblas, atlas, blas, apple 69 | USE_BLAS = openblas 70 | 71 | # Whether use contrib.random in runtime 72 | USE_RANDOM = 0 73 | 74 | # Whether use NNPack 75 | USE_NNPACK = 0 76 | # NNPACK_PATH = none 77 | 78 | # Whether use CuDNN 79 | USE_CUDNN = 1 80 | 81 | # Whether use MIOpen 82 | USE_MIOPEN = 0 83 | 84 | # Whether use MPS 85 | USE_MPS = 0 86 | 87 | # Whether use cuBLAS 88 | USE_CUBLAS = 1 89 | 90 | # Whether use rocBlas 91 | USE_ROCBLAS = 0 92 | -------------------------------------------------------------------------------- /tvm-paper-notes.md: -------------------------------------------------------------------------------- 1 | # [Summary of TVM: End-to-End Optimization Stack for Deep Learning](https://arxiv.org/abs/1802.04799) 2 | 3 | ## Abstract 4 | 5 | - Scalable frameworks, such as TensorFlow, MXNet, Caffe, and PyTorch, are optimized for a narrow range of server-class GPUs. 6 | - Deploying workloads to other platforms such as mobile phones, IoT devices, and specialized accelerators (FPGAs, ASICs) requires laborious manual effort. 7 | - TVM is an end-to-end optimization stack that exposes: 8 | - graph-level 9 | - operator-level optimizations 10 | ---> to provide performance portability to deep learning workloads across diverse hardware back-ends. 11 | 12 | ## Introduction 13 | 14 | - The number and diversity of specialized deep learning (DL) accelerators pose an adoption challenge. 15 | - They introduce new hardware abstractions that modern compilers and frameworks are ill-equipped to deal with. 16 | 17 | - Providing support in various DL frameworks for diverse hardware back-ends in the present ad-hoc fashion is **unsustainable**. 18 | 19 | - Hardware targets diverge significantly in terms of memory organization, compute, etc. 20 | 21 | ![](https://i.imgur.com/XRSZMt0.png) 22 | 23 | - *The Goal*: **easily deploy DL workloads to all kinds of hardware targets, including embedded devices, GPUs, FPGAs, and ASICs (e.g., the TPU).** 24 | 25 | - Current DL frameworks rely on a **computational graph intermediate representation** to implement optimizations such as: 26 | - auto differentiation 27 | - dynamic memory management 28 | 29 | - **Graph-level optimizations** are often too high-level to handle hardware back-end-specific **operator transformations**. 30 | - **Current operator-level libraries** that DL frameworks rely on are: 31 | - too rigid 32 | - too specialized 33 | 34 | ---> to be easily ported **across hardware devices** 35 | 36 | - To address these weaknesses, we need a **compiler framework** that can expose optimization opportunities across both 37 | - graph-level and 38 | - operator-level 39 | 40 | ---> to deliver competitive performance across hardware back-ends. 41 | 42 | ### Four fundamental challenges at the computation graph level and tensor operator level 43 | 44 | 1. **High-level dataflow rewriting:** 45 | - Different hardware devices may have vastly different memory hierarchies. 46 | 47 | - Enabling strategies to fuse operators and optimize data layouts is crucial for optimizing memory access. 48 | 49 | 2. **Memory reuse across threads:** 50 | - Modern GPUs and specialized accelerators have memory that can be shared across compute cores. 51 | - The traditional shared-nothing nested parallel model is no longer optimal. 52 | - Cooperation among threads on shared-memory loads is required for optimized kernels. 53 | 54 | 3. **Tensorized compute intrinsics:** 55 | - The latest hardware provides new instructions that go beyond vector operations, like the GEMM operator in the TPU or the tensor core in NVIDIA's Volta. 56 | - Consequently, the scheduling procedure must break computation into tensor arithmetic intrinsics instead of scalar or vector code. 57 | 58 | 4. **Latency Hiding** 59 | - Traditional architectures with simultaneous multithreading and automatically managed caches implicitly hide latency in modern CPUs/GPUs. 60 | - Specialized accelerator designs favor leaner control and offload most of the scheduling complexity to the compiler stack. 61 | - Still, scheduling must be performed carefully to hide memory access latency. 62 | 63 | 64 | ### TVM: An End-to-End Optimization Stack 65 | 66 | - An end-to-end optimizing compiler stack to lower and fine-tune DL workloads to diverse hardware back-ends. 67 | - Designed to separate: 68 | - the algorithm description 69 | - schedule 70 | - hardware interface 71 | - This separation enables **support for novel specialized accelerators** and **their corresponding new intrinsics**. 72 | - TVM presents **two optimization layers**: 73 | - a computation graph optimization layer to address: 74 | - High-level dataflow rewriting 75 | - a tensor optimization layer with new schedule primitives to address: 76 | - memory reuse across threads 77 | - tensorized compute intrinsics 78 | - latency hiding 79 | 80 | ## Optimizing Computational Graphs 81 | 82 | ### Computational Graph 83 | 84 | - Computational graphs are a common way to represent programs in DL frameworks. 85 | - They provide a global view of computation tasks, yet avoid specifying how each computation task needs to be implemented. 86 | 87 | 88 | 89 | ### Operator Fusion 90 | 91 | - An optimization that can greatly reduce execution time, particularly on GPUs and specialized accelerators. 92 | - The idea is to **combine multiple operators together into a single kernel without saving the intermediate results back into global memory**. 93 | 94 | ![](https://i.imgur.com/mlNhoDT.png) 95 | 96 | **Four categories of graph operators**: 97 | 98 | - Injective (one-to-one map) 99 | - Reduction 100 | - Complex-out-fusable (can fuse element-wise map to output) 101 | - Opaque (cannot be fused) 102 | 103 | ![](https://i.imgur.com/XnhSWVN.png) 104 | 105 | ### Data Layout Transformation 106 | 107 | - Tensor operations are the basic operators of computational graphs. 108 | - They can have divergent layout requirements across different operations. 109 | - Optimizing data layout starts with specifying the preferred data layout of each operator, given the constraints dictating their implementation in hardware. 110 | 111 | ![](https://i.imgur.com/0J5QxGs.png) 112 | 113 | ### Limitations of Graph-Level Optimizations 114 | 115 | - They are only as effective as what the operator library provides. 116 | - Currently, the few DL frameworks that support operator fusion require the operator library to provide an implementation of the fused patterns. 117 | - With more network operators introduced on a regular basis, this approach is no longer sustainable when targeting an increasing number of hardware back-ends. 118 | - It is not feasible to handcraft operator kernels for this massive space of back-end-specific operators. 119 | - TVM provides a code-generation approach that can generate tensor operators.
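As a small, hedged illustration of the code-generation idea in the last bullet (not the paper's own example), the old TVM 0.x Python API used by the notebooks in this repo lets one declare a tensor operator once and then derive both the loop nest and the back-end code from that single description; the toy operator and the `llvm` target below are assumptions made for the sketch.

```python
import tvm

# Declare the operator once, at a high level.
n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute((n,), lambda i: A[i] * 2.0, name="B")

# A default schedule; other back-ends would apply different schedules
# to this same description.
s = tvm.create_schedule(B.op)

# Inspect the loop nest TVM generates for the description ...
print(tvm.lower(s, [A, B], simple_mode=True))

# ... and compile it to target code (LLVM for CPU in this sketch).
f = tvm.build(s, [A, B], target="llvm", name="scale_by_two")
```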
120 | 121 | ## Optimizing Tensor Operations 122 | 123 | ### Tensor Expression Language 124 | 125 | - TVM introduces a dataflow tensor expression language to support automatic code generation. 126 | - Unlike high-level computation graph languages, where the implementation of tensor operations is opaque, *each operation is described in an index formula expression language*. 127 | 128 | ![](https://i.imgur.com/LG1pguT.png) 129 | 130 | - The TVM tensor expression language supports common arithmetic and math operations found in common languages like C. 131 | - TVM explicitly introduces a **commutative reduction** operator to easily schedule commutative reductions across multiple threads. 132 | - TVM further introduces a **high-order scan operator** that can combine basic compute operators to form recurrent computations over time. 133 | 134 | ### Schedule Space 135 | 136 | - Given a tensor expression, it is challenging to create high-performance implementations for each hardware back-end. 137 | - Each optimized low-level program is the result of different combinations of scheduling strategies, imposing a large burden on the kernel writer. 138 | - TVM adopts the **principle of decoupling compute descriptions from schedule optimizations**. 139 | - Schedules are the specific rules that lower compute descriptions down to back-end-optimized implementations. 140 | 141 | ![](https://i.imgur.com/JUikGQz.png) 142 | 143 | ![](https://i.imgur.com/BCg6gCz.png) 144 | 145 | 146 | ### Nested Parallelism with Cooperation 147 | 148 | - Parallel programming is key to improving the efficiency of compute-intensive kernels in deep learning workloads. 149 | - Modern GPUs offer massive parallelism 150 | 151 | ---> requiring TVM to bake parallel programming models into schedule transformations. 152 | 153 | - Most existing solutions adopt a parallel programming model referred to as [nested parallel programs](https://youtu.be/4lS_WThsFoM), which is a form of [fork-join parallelism](https://en.wikipedia.org/wiki/Fork%E2%80%93join_model). 154 | - TVM uses a parallel schedule primitive to parallelize a data-parallel task. 155 | - Each parallel task can be further recursively subdivided into subtasks to exploit the multi-level thread hierarchy on the target architecture (e.g., thread groups in a GPU). 156 | - This model is called **shared-nothing nested parallelism**: 157 | - One working thread cannot look at the data of its sibling within the same parallel computation stage. 158 | - Interactions between sibling threads happen at the join stage, when the subtasks are done and the next stage can consume the data produced by the previous stage. 159 | - This programming model **does not enable threads to cooperate with each other in order to perform a collective task within the same parallel stage**. 160 | 161 | - A better alternative to the shared-nothing approach is to **fetch data cooperatively across threads**. 162 | - This pattern is well known in GPU programming using languages like CUDA, OpenCL, and Metal. 163 | - **It has not previously been implemented as a schedule primitive.** 164 | - TVM introduces the **concept of memory scopes to the schedule space**, so that a stage can be marked as shared. 165 | - Without memory scopes, automatic scope inference will mark the relevant stage as thread-local. 166 | - Memory scopes are useful for GPUs. 167 | - Memory scopes allow us to tag special memory buffers and create special lowering rules when targeting specialized deep learning accelerators.
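A minimal sketch of the shared-nothing pattern described in this section, again using the old TVM 0.x API from the notebooks in this repo (the vector-add operator and the factor of 64 are illustrative assumptions): the data-parallel axis is subdivided and bound to the two-level GPU thread hierarchy, and because no stage is placed in the `"shared"` scope, sibling threads never exchange data within the stage.

```python
import tvm

n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.placeholder((n,), name="B")
C = tvm.compute((n,), lambda i: A[i] + B[i], name="C")

s = tvm.create_schedule(C.op)

# Recursively subdivide the data-parallel axis and map it onto the
# GPU thread hierarchy: thread blocks outside, threads inside.
bx, tx = s[C].split(C.op.axis[0], factor=64)
s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
s[C].bind(tx, tvm.thread_axis("threadIdx.x"))

# Each thread works only on its own slice of C (shared-nothing);
# cooperation would additionally need a cache stage in the "shared" scope.
print(tvm.lower(s, [A, B, C], simple_mode=True))
```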
168 | 169 | ![](https://i.imgur.com/HHYtujL.png) 170 | 171 | 172 | ### Tensorization: Generalizing the Hardware Interface 173 | 174 | - The **tensorization** problem is analogous to the **vectorization** problem for [SIMD architectures](https://en.wikipedia.org/wiki/SIMD). 175 | - Tensorization differs significantly from vectorization: 176 | - The inputs to the tensor compute primitives are multi-dimensional, with fixed or variable lengths, and dictate different data layouts. 177 | - We cannot resort to a fixed set of primitives, as new DL accelerators are emerging with their own flavors of tensor instructions. 178 | - To solve this challenge, TVM **separates the hardware interface from the schedule**: 179 | - TVM introduces a tensor intrinsic declaration mechanism. 180 | - TVM uses the tensor expression language to declare the behavior of each new hardware intrinsic, as well as the lowering rule associated with it. 181 | - TVM introduces a **tensorize** schedule primitive to replace a unit of computation with the corresponding tensor intrinsics. 182 | - The compiler matches the computation pattern with a hardware declaration and lowers it to the corresponding hardware intrinsic. 183 | 184 | 185 | ### Compiler Support for Latency Hiding 186 | 187 | - **Latency hiding** refers to the process of overlapping memory operations with computation to maximize memory and compute utilization. 188 | - It requires different strategies depending on the hardware back-end that is being targeted. 189 | - On CPUs, memory latency hiding is achieved **implicitly with simultaneous multithreading** or **hardware prefetching techniques**. 190 | - GPUs rely on **rapid context switching of many warps of threads** to maximize the utilization of functional units. 191 | - TVM provides a virtual threading schedule primitive that lets the programmer specify a high-level data-parallel program that TVM automatically lowers to a low-level explicit data dependence program. 192 | 193 | 194 | ## Code Generation and Runtime Support 195 | 196 | ### Code Generation 197 | 198 | - For a specific tuple of data-flow declaration, axis relation hyper-graph, and schedule tree, TVM can generate lowered code by: 199 | - iteratively traversing the schedule tree 200 | - inferring the dependent bounds of the input tensors (using the axis relation hyper-graph) 201 | - generating the loop nest in the low-level code 202 | - The code is lowered to an in-memory representation of an imperative C-style loop program. 203 | - TVM reuses a variant of Halide's loop program data structure in this process. 204 | - TVM reuses passes from Halide for common lowering primitives like storage flattening and unrolling, 205 | - and adds GPU/accelerator-specific transformations such as: 206 | - *synchronization point detection* 207 | - *virtual thread injection* 208 | - *module generation* 209 | - Finally, the loop program is transformed into **LLVM** or **CUDA/Metal/OpenCL** source code. 210 | 211 | ### Runtime Support 212 | 213 | - For GPU programs, TVM builds the host and device modules **separately** and provides a runtime module system that launches kernels using the corresponding driver APIs. 214 | 215 | ### Remote Deployment Profiling 216 | 217 | - TVM includes infrastructure to make profiling and autotuning easier on embedded devices.
218 | - Traditionally, targeting an embedded device for tuning requires: 219 | - cross-compiling on the host side, 220 | - copying to the target device, 221 | - and timing the execution. 222 | 223 | - TVM provides remote function call support. Through the **RPC interface**: 224 | - TVM compiles the program with a host compiler, 225 | - uploads it to the remote embedded device, 226 | - runs the function remotely, 227 | - and accesses the results in the same script on the host. 228 | 229 | ![](https://i.imgur.com/oL0Z9pp.png) 230 | 231 | 232 | ## Conclusion 233 | 234 | - TVM provides an end-to-end stack to solve fundamental optimization challenges across a diverse set of hardware back-ends. 235 | - TVM can encourage more studies of programming languages and compilation, and open new opportunities for hardware co-design techniques for deep learning systems. 236 | 237 | -------------------------------------------------------------------------------- /tvm-tutorials/compute-and-reduce-with-tuple-inputs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Describe Batchwise Computation" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "- For operators that have the same shape, we can put them together as the inputs of `tvm.compute` if we want them to be scheduled together in the subsequent schedule procedure." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "n = tvm.var(name=\"n\")\n", 34 | "m = tvm.var(name=\"m\")\n", 35 | "A0 = tvm.placeholder(shape=(m, n), name=\"A0\")\n", 36 | "A1 = tvm.placeholder(shape=(m, n), name=\"A1\")" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "B0, B1 = tvm.compute(shape=(m, n), fcompute=lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), \n", 46 | " name=\"B\")" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "produce B {\n", 59 | " for (i, 0, m) {\n", 60 | " for (j, 0, n) {\n", 61 | " B.v0[((i*n) + j)] = (A0[((i*n) + j)] + 2.000000f)\n", 62 | " B.v1[((i*n) + j)] = (A1[((i*n) + j)]*3.000000f)\n", 63 | " }\n", 64 | " }\n", 65 | "}\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "# The generated IR code would be:\n", 72 | "s = tvm.create_schedule(B0.op)\n", 73 | "print(tvm.lower(sch=s, args=[A0, A1, B0, B1], simple_mode=True))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Describe Reduction with Collaborative Inputs" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "- Sometimes, we require multiple inputs to express some reduction operators, and the inputs collaborate together, e.g. `argmax`\n", 88 | "- In the reduction procedure, `argmax` needs to compare the values of the operands and also needs to keep the index of the operand. 
This can be expressed with `comm_reducer` " 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# x and y are the operands of reduction, both of them are a tuple of index and value\n", 98 | "def fcombine(x, y):\n", 99 | " lhs = tvm.select(cond=(x[1] >= y[1]), t=x[0], f=y[0])\n", 100 | " rhs = tvm.select(cond=(x[1] >= y[1]), t=x[1], f=y[1])\n", 101 | " return lhs, rhs" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 8, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# our identity element also need to be a tuple, so `fidentity` accepts \n", 111 | "# two types as inputs\n", 112 | "def fidentity(t0, t1):\n", 113 | " return tvm.const(value=-1, dtype=t0), tvm.min_value(dtype=t1)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "argmax = tvm.comm_reducer(fcombine, fidentity, name=\"argmax\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 10, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# describe the reduction computation\n", 132 | "m = tvm.var(\"m\")\n", 133 | "n = tvm.var(\"n\")\n", 134 | "idx = tvm.placeholder(shape=(m, n), name=\"idx\", dtype=\"int32\")\n", 135 | "val = tvm.placeholder(shape=(m, n), name=\"val\", dtype=\"int32\")\n", 136 | "k = tvm.reduce_axis(dom=(0, n), name=\"k\")\n", 137 | "T0, T1 = tvm.compute(shape=(m, ), fcompute=lambda i: argmax((idx[i, k], val[i, k]), axis=k),\n", 138 | " name=\"T\")\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 11, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "produce T {\n", 151 | " for (i, 0, m) {\n", 152 | " T.v0[i] = -1\n", 153 | " T.v1[i] = -2147483648\n", 154 | " for (k, 0, n) {\n", 155 | " T.v0[i] = tvm_if_then_else((T.v1[i] < val[((i*n) + k)]), idx[((i*n) + k)], T.v0[i])\n", 156 | " T.v1[i] = tvm_if_then_else((T.v1[i] < val[((i*n) + k)]), val[((i*n) + k)], T.v1[i])\n", 157 | " }\n", 158 | " }\n", 159 | "}\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# The generated IR code would be:\n", 166 | "s = tvm.create_schedule(T0.op)\n", 167 | "print(tvm.lower(sch=s, args=[idx, val, T0, T1], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Schedule Operation with Tuple Inputs\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "- Although you will get multiple outputs with one batch operation, but they can only be scheduled together in terms of operation." 
182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 13, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "n = tvm.var(\"n\")\n", 191 | "m = tvm.var(\"m\")\n", 192 | "A0 = tvm.placeholder((m, n), name='A0')\n", 193 | "B0, B1 = tvm.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name='B')\n", 194 | "A1 = tvm.placeholder((m, n), name='A1')\n", 195 | "C = tvm.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name='C')" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 14, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "s = tvm.create_schedule(C.op)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "s[B0].compute_at(s[C], C.op.axis[0])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "// attr [B.v0] storage_scope = \"global\"\n", 226 | "allocate B.v0[float32 * 1 * n]\n", 227 | "// attr [B.v1] storage_scope = \"global\"\n", 228 | "allocate B.v1[float32 * 1 * n]\n", 229 | "produce C {\n", 230 | " for (i, 0, m) {\n", 231 | " produce B {\n", 232 | " for (j, 0, n) {\n", 233 | " B.v0[j] = (A0[((i*n) + j)] + 2.000000f)\n", 234 | " B.v1[j] = (A0[((i*n) + j)]*3.000000f)\n", 235 | " }\n", 236 | " }\n", 237 | " for (j, 0, n) {\n", 238 | " C[((i*n) + j)] = (A1[((i*n) + j)] + B.v0[j])\n", 239 | " }\n", 240 | " }\n", 241 | "}\n", 242 | "\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "print(tvm.lower(sch=s, args=[A0, A1, C], simple_mode=True))" 248 | ] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 3", 254 | "language": "python", 255 | "name": "python3" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 3 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython3", 267 | "version": "3.6.5" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 2 272 | } 273 | -------------------------------------------------------------------------------- /tvm-tutorials/external-tensor-functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## External Tensor Functions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- TVM supports transparent code generation.\n", 15 | "- TVM supports black box function calls natively as well. 
\n", 16 | "- Specifically, TVM supports all tensor functions that are [DLPack](https://github.com/dmlc/dlpack) compatible.\n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import tvm \n", 26 | "import numpy as np \n", 27 | "from tvm.contrib import cblas" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Use Extern Tensor Function\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "n = 1024\n", 44 | "l = 128\n", 45 | "m = 235" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "bias = tvm.var(name='bias', dtype=tvm.float32)\n", 55 | "A = tvm.placeholder(shape=(n, l), name='A')\n", 56 | "B = tvm.placeholder(shape=(l, m), name='B')\n", 57 | "# Compute several tensor via extern function.\n", 58 | "C = tvm.extern(shape=(n, m), inputs=[A, B], \n", 59 | " fcompute=lambda ins, outs: tvm.call_packed(\"tvm.contrib.cblas.matmul\",\n", 60 | " ins[0], ins[1], outs[0], False, False),\n", 61 | " name=\"C\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "D = tvm.compute(shape=C.shape, fcompute=lambda i, j: C[i, j] + bias, name=\"D\")" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "s = tvm.create_schedule(D.op)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "## Verify the Result" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "ctx = tvm.cpu(dev_id=0)\n", 96 | "f = tvm.build(sch=s, args=[A, B, D, bias], target=\"llvm\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 7, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)\n", 106 | "b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)\n", 107 | "d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), ctx)\n", 108 | "bb = 10.0" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 8, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "f(a, b, d, bb)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "np.testing.assert_allclose(\n", 127 | " d.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()) + 10, rtol=1e-5)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Extern Contrib Wrappers" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "- TVM also provides extern contrib wrappers to useful extern calls, the following line is equivalent to the previous example." 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "from tvm.contrib import cblas" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 11, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "C = cblas.matmul(lhs=A, rhs=B)\n", 160 | "D = tvm.compute(shape=C.shape, fcompute=lambda i, j: C[i, j] + bias, name=\"D\")\n", 161 | "s = tvm.create_schedule(D.op)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Hook Python Function as Extern" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "- Since we can call into any PackedFunc in TVM. We can use the extern function to callback into python.\n", 176 | "\n", 177 | "- The following example registers a python function into tvm runtime system and use it to complete one stage of the computation. \n", 178 | "- This makes TVM much more flexible. For example, we can insert front-end callbacks to inspect the intermediate results or mix customized code with TVM." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 12, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "@tvm.register_func(\"tvm.contrib.my_tvm_addone\")\n", 188 | "def my_tvm_addone(x, y):\n", 189 | " print(\"my_tvm_addone signatures: %s, %s\" % (type(x), type(y)))\n", 190 | " tvm.nd.array(x.asnumpy() + 1).copyto(y)\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 13, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "A = tvm.placeholder((n,), name=\"A\")\n", 200 | "B = tvm.extern(A.shape, inputs=[A], \n", 201 | " fcompute=lambda ins, outs: tvm.call_packed(\"tvm.contrib.my_tvm_addone\", \n", 202 | " ins[0], outs[0]), \n", 203 | " name=\"C\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 14, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "my_tvm_addone signatures: , \n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "s = tvm.create_schedule(B.op)\n", 221 | "f = tvm.build(s, [A, B], \"llvm\")\n", 222 | "a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx)\n", 223 | "b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), ctx)\n", 224 | "f(a, b)\n", 225 | "np.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1, rtol=1e-5)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.6.5" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 2 257 | } 258 | -------------------------------------------------------------------------------- /tvm-tutorials/getting-started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm\n", 10 | 
"import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- Global declarations of environment" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "tgt_host=\"llvm\"\n", 27 | "# Change it to respective GPU if gpu is enabled Ex: cuda, opencl\n", 28 | "tgt=\"cuda\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "## Describe the Computation" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "n = tvm.var(\"n\")\n", 45 | "A = tvm.placeholder((n,), name='A')\n", 46 | "B = tvm.placeholder((n,), name='B')\n", 47 | "C = tvm.compute(A.shape, lambda i: A[i] + B[i], name=\"C\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "print(type(C))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Schedule the Computation" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "- A schedule is a set of transformation of computation that transforms the loop of computations in the program\n", 79 | "\n", 80 | "- After we construct the schedule, by default the schedule computes C in a serial manner in a row-major order." 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "s = tvm.create_schedule(C.op)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "tvm.schedule.Schedule" 101 | ] 102 | }, 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "type(s)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "\u001b[0;31mType:\u001b[0m Schedule\n", 121 | "\u001b[0;31mString form:\u001b[0m schedule(0x1d46a90)\n", 122 | "\u001b[0;31mFile:\u001b[0m ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/schedule.py\n", 123 | "\u001b[0;31mDocstring:\u001b[0m Schedule for all the stages.\n" 124 | ] 125 | }, 126 | "metadata": {}, 127 | "output_type": "display_data" 128 | } 129 | ], 130 | "source": [ 131 | "s?" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "- Use the split construct to split the first axis of C\n", 139 | " - this will split the original iteration axis into product of two iterations." 
140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 8, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "bx, tx = s[C].split(C.op.axis[0], factor=64)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 9, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "iter_var(i.outer, )" 160 | ] 161 | }, 162 | "execution_count": 9, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "bx" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 10, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "iter_var(i.inner, )" 180 | ] 181 | }, 182 | "execution_count": 10, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "tx" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "- Finally bind the iteration axis `bx` and `tx` to threads in the GPU compute grid. \n", 196 | "- These are GPU specific constructs that allows us to generate code that runs on GPU. " 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 11, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "if tgt == \"cuda\":\n", 206 | " s[C].bind(bx, tvm.thread_axis(\"blockIdx.x\"))\n", 207 | " s[C].bind(tx, tvm.thread_axis(\"threadIdx.x\"))" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Compilation" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "- After finishing to specify the schedule, we can compile it into a TVM function. \n", 222 | "- By default TVM compiles into a type-erased function that can be directly called from python side" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 12, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "fadd = tvm.build(sch=s, args=[A, B, C], target=tgt, target_host=tgt_host, name=\"myadd\")" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 13, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "Module(llvm, 21b49b0)" 243 | ] 244 | }, 245 | "execution_count": 13, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "fadd" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 14, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "tvm.module.Module" 263 | ] 264 | }, 265 | "execution_count": 14, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "type(fadd)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 15, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/plain": [ 282 | "\u001b[0;31mSignature:\u001b[0m \u001b[0mfadd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 283 | "\u001b[0;31mType:\u001b[0m Module\n", 284 | "\u001b[0;31mString form:\u001b[0m Module(llvm, 21b49b0)\n", 285 | "\u001b[0;31mFile:\u001b[0m ~/opt/miniconda3/envs/tvm/lib/python3.6/site-packages/tvm-0.2.0-py3.6-linux-x86_64.egg/tvm/module.py\n", 286 | "\u001b[0;31mDocstring:\u001b[0m Module container of all TVM generated functions\n" 287 | ] 288 | }, 289 | "metadata": 
{}, 290 | "output_type": "display_data" 291 | } 292 | ], 293 | "source": [ 294 | "fadd?" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "## Run the function" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "- Create a gpu context\n", 309 | "- Use `tvm.nd.array` to copy data to gpu\n", 310 | "- fadd runs the actual computation\n", 311 | "- Use `asnumpy()` to copy the gpu array to cpu so that we can use this to verify correctness. " 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 16, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "ctx = tvm.context(dev_type=tgt, dev_id=0)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 17, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "n = 1024\n", 330 | "a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx=ctx)\n", 331 | "b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx=ctx)\n", 332 | "c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx=ctx)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 18, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "CPU times: user 17 ms, sys: 274 µs, total: 17.2 ms\n", 345 | "Wall time: 17.1 ms\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "%%time\n", 351 | "fadd(a, b, c)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 19, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 20, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "array([1.0944474 , 1.3696101 , 1.4258708 , ..., 1.0076509 , 1.4810429 ,\n", 372 | " 0.42075178], dtype=float32)" 373 | ] 374 | }, 375 | "execution_count": 20, 376 | "metadata": {}, 377 | "output_type": "execute_result" 378 | } 379 | ], 380 | "source": [ 381 | "c.asnumpy()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 21, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "(array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,\n", 393 | " 0.11985361], dtype=float32),\n", 394 | " array([0.43806314, 0.8535795 , 0.45817864, ..., 0.7924853 , 0.9049368 ,\n", 395 | " 0.30089816], dtype=float32))" 396 | ] 397 | }, 398 | "execution_count": 21, 399 | "metadata": {}, 400 | "output_type": "execute_result" 401 | } 402 | ], 403 | "source": [ 404 | "a.asnumpy(), b.asnumpy()" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 22, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "\n", 416 | "array([0.6563842 , 0.5160306 , 0.96769214, ..., 0.21516554, 0.57610613,\n", 417 | " 0.11985361], dtype=float32)" 418 | ] 419 | }, 420 | "execution_count": 22, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "a" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "## Inspect the Generated Code " 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 23, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | 
"-----------------------GPU code--------------------------\n", 446 | "extern \"C\" __global__ void myadd__kernel0( float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n) {\n", 447 | " if (((int)blockIdx.x) < (n / 64)) {\n", 448 | " C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);\n", 449 | " } else {\n", 450 | " if ((((int)blockIdx.x) * 64) < (n - ((int)threadIdx.x))) {\n", 451 | " C[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] = (A[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))] + B[((((int)blockIdx.x) * 64) + ((int)threadIdx.x))]);\n", 452 | " }\n", 453 | " }\n", 454 | "}\n", 455 | "\n", 456 | "\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "if tgt == \"cuda\":\n", 462 | " dev_module = fadd.imported_modules[0]\n", 463 | " print(\"-----------------------GPU code--------------------------\")\n", 464 | " print(dev_module.get_source())\n", 465 | " \n", 466 | "else:\n", 467 | " print(fadd.get_source())" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## Save Compiled Module" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "- Besides runtime compilation, we can save the compiled modules into file and load them back later. This is called ahead of time compilation\n", 482 | "\n", 483 | "- The following code first does the following step:\n", 484 | " - It saves the compiled host module into an object file.\n", 485 | " - Then it saves the device module into a ptx file.\n", 486 | " - cc.create_shared calls a env compiler(GCC) to create a shared library." 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 24, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "from tvm.contrib import cc\n", 496 | "from tvm.contrib import util" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 25, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "temp = util.tempdir()\n", 506 | "fadd.save(file_name=temp.relpath(\"myadd.o\"))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 26, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "if tgt == \"cuda\":\n", 516 | " fadd.imported_modules[0].save(temp.relpath(\"myadd.ptx\"))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 27, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "cc.create_shared(output=temp.relpath(\"myadd.so\"), objects=[temp.relpath(\"myadd.o\")])" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 28, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "['myadd.tvm_meta.json', 'myadd.o', 'myadd.ptx', 'myadd.so']\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "print(temp.listdir())" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "## Load Compiled Module" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 29, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "fadd1 = tvm.module.load(temp.relpath(\"myadd.so\"))\n", 559 | "if tgt == \"cuda\":\n", 560 | " fadd1_dev = tvm.module.load(temp.relpath(\"myadd.ptx\"))\n", 561 | " fadd1.import_module(fadd1_dev)\n", 562 | "fadd1(a, b, c)\n", 563 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() 
+ b.asnumpy())\n" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "## Pack Everything into One Library" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "metadata": {}, 576 | "source": [ 577 | "In the above example, we store the device and host code seperatedly. TVM also supports export everything as one shared library. Under the hood, we pack the device modules into binary blobs and link them together with the host code. " 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 30, 583 | "metadata": {}, 584 | "outputs": [], 585 | "source": [ 586 | "fadd.export_library(temp.relpath(\"mypack.so\"))" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 31, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "fadd2 = tvm.module.load(temp.relpath(\"mypack.so\"))" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 32, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "fadd2(a, b, c)" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 33, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "np.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "**NOTE**: Runtime API and Thread-Safety\n", 621 | "\n", 622 | "The compiled modules of TVM do not depend on the TVM compiler. Instead, it only depends on a minimum runtime library. TVM runtime library wraps the device drivers and provides thread-safe and device agnostic call into the compiled functions.\n", 623 | "\n", 624 | "This means you can call the compiled TVM function from any thread, on any GPUs." 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 35, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stdout", 634 | "output_type": "stream", 635 | "text": [ 636 | "The version_information extension is already loaded. To reload it, use:\n", 637 | " %reload_ext version_information\n" 638 | ] 639 | }, 640 | { 641 | "data": { 642 | "application/json": { 643 | "Software versions": [ 644 | { 645 | "module": "Python", 646 | "version": "3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]" 647 | }, 648 | { 649 | "module": "IPython", 650 | "version": "6.4.0" 651 | }, 652 | { 653 | "module": "OS", 654 | "version": "Linux 4.13.0 41 generic x86_64 with debian stretch sid" 655 | }, 656 | { 657 | "module": "numpy", 658 | "version": "1.14.3" 659 | }, 660 | { 661 | "module": "tvm", 662 | "version": "0.2.0" 663 | } 664 | ] 665 | }, 666 | "text/html": [ 667 | "
<table>
<tr><th>Software</th><th>Version</th></tr>
<tr><td>Python</td><td>3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]</td></tr>
<tr><td>IPython</td><td>6.4.0</td></tr>
<tr><td>OS</td><td>Linux 4.13.0 41 generic x86_64 with debian stretch sid</td></tr>
<tr><td>numpy</td><td>1.14.3</td></tr>
<tr><td>tvm</td><td>0.2.0</td></tr>
<tr><td colspan='2'>Thu May 17 17:10:38 2018 CDT</td></tr>
</table>
" 668 | ], 669 | "text/latex": [ 670 | "\\begin{tabular}{|l|l|}\\hline\n", 671 | "{\\bf Software} & {\\bf Version} \\\\ \\hline\\hline\n", 672 | "Python & 3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] \\\\ \\hline\n", 673 | "IPython & 6.4.0 \\\\ \\hline\n", 674 | "OS & Linux 4.13.0 41 generic x86\\_64 with debian stretch sid \\\\ \\hline\n", 675 | "numpy & 1.14.3 \\\\ \\hline\n", 676 | "tvm & 0.2.0 \\\\ \\hline\n", 677 | "\\hline \\multicolumn{2}{|l|}{Thu May 17 17:10:38 2018 CDT} \\\\ \\hline\n", 678 | "\\end{tabular}\n" 679 | ], 680 | "text/plain": [ 681 | "Software versions\n", 682 | "Python 3.6.5 64bit [GCC 4.8.2 20140120 (Red Hat 4.8.2-15)]\n", 683 | "IPython 6.4.0\n", 684 | "OS Linux 4.13.0 41 generic x86_64 with debian stretch sid\n", 685 | "numpy 1.14.3\n", 686 | "tvm 0.2.0\n", 687 | "Thu May 17 17:10:38 2018 CDT" 688 | ] 689 | }, 690 | "execution_count": 35, 691 | "metadata": {}, 692 | "output_type": "execute_result" 693 | } 694 | ], 695 | "source": [ 696 | "%load_ext version_information\n", 697 | "%version_information numpy, tvm" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.6.5" 732 | } 733 | }, 734 | "nbformat": 4, 735 | "nbformat_minor": 2 736 | } 737 | -------------------------------------------------------------------------------- /tvm-tutorials/reduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Describe Sum of Rows\n", 18 | "\n", 19 | "`B = numpy.sum(A, axis=1)`" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "n = tvm.var(\"n\")\n", 29 | "m = tvm.var(\"m\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "A = tvm.placeholder((n, m), name=\"A\")\n", 39 | "k = tvm.reduce_axis(dom=(0, m), name=\"k\")\n", 40 | "B = tvm.compute(shape=(n,), fcompute=lambda i: tvm.sum(A[i, k], axis=k), name=\"B\")" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Schedule the Reduction" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "produce B {\n", 60 | " for (i, 0, n) {\n", 61 | " B[i] = 0.000000f\n", 62 | " for (k, 0, m) {\n", 63 | " B[i] = (B[i] + A[((i*m) + k)])\n", 64 | " }\n", 65 | " }\n", 66 | "}\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "s = tvm.create_schedule(B.op)\n", 73 | "print(tvm.lower(sch=s, args=[A, 
B], simple_mode=True))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Let's split both the row axis of B as well axis by different factors. The result is a nested reduction. " 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "ko, ki = s[B].split(parent=B.op.reduce_axis[0], factor=16)\n", 90 | "xo, xi = s[B].split(parent=B.op.axis[0], factor=32)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "produce B {\n", 103 | " for (i.outer, 0, ((n + 31)/32)) {\n", 104 | " for (i.inner, 0, 32) {\n", 105 | " if (likely(((i.outer*32) < (n - i.inner)))) {\n", 106 | " B[((i.outer*32) + i.inner)] = 0.000000f\n", 107 | " }\n", 108 | " for (k.outer, 0, ((m + 15)/16)) {\n", 109 | " for (k.inner, 0, 16) {\n", 110 | " if (likely(((i.outer*32) < (n - i.inner)))) {\n", 111 | " if (likely(((k.outer*16) < (m - k.inner)))) {\n", 112 | " B[((i.outer*32) + i.inner)] = (B[((i.outer*32) + i.inner)] + A[(((((i.outer*32) + i.inner)*m) + (k.outer*16)) + k.inner)])\n", 113 | " }\n", 114 | " }\n", 115 | " }\n", 116 | " }\n", 117 | " }\n", 118 | " }\n", 119 | "}\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "If we are building a GPU kernel, we can bind the rows of B to GPU threads" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "produce B {\n", 145 | " // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = ((n + 31)/32)\n", 146 | " // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 32\n", 147 | " if (likely(((blockIdx.x*32) < (n - threadIdx.x)))) {\n", 148 | " B[((blockIdx.x*32) + threadIdx.x)] = 0.000000f\n", 149 | " }\n", 150 | " for (k.outer, 0, ((m + 15)/16)) {\n", 151 | " for (k.inner, 0, 16) {\n", 152 | " if (likely(((blockIdx.x*32) < (n - threadIdx.x)))) {\n", 153 | " if (likely(((k.outer*16) < (m - k.inner)))) {\n", 154 | " B[((blockIdx.x*32) + threadIdx.x)] = (B[((blockIdx.x*32) + threadIdx.x)] + A[(((((blockIdx.x*32) + threadIdx.x)*m) + (k.outer*16)) + k.inner)])\n", 155 | " }\n", 156 | " }\n", 157 | " }\n", 158 | " }\n", 159 | "}\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "s[B].bind(ivar=xo, thread_ivar=tvm.thread_axis(\"blockIdx.x\"))\n", 166 | "s[B].bind(ivar=xi, thread_ivar=tvm.thread_axis(\"threadIdx.x\"))\n", 167 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "## Reduction Factoring and Parallelization" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 9, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "s = tvm.create_schedule(ops=B.op)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 10, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "ko, ki = s[B].split(parent=B.op.reduce_axis[0], factor=16)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 11, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "BF = 
s.rfactor(tensor=B, axis=ki, factor_axis=0)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 12, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "// attr [B.rf] storage_scope = \"global\"\n", 214 | "allocate B.rf[float32 * 16 * n]\n", 215 | "produce B.rf {\n", 216 | " for (k.inner, 0, 16) {\n", 217 | " for (i, 0, n) {\n", 218 | " B.rf[((k.inner*n) + i)] = 0.000000f\n", 219 | " for (k.outer, 0, ((m + 15)/16)) {\n", 220 | " if ((k.inner < (m - (k.outer*16)))) {\n", 221 | " B.rf[((k.inner*n) + i)] = (B.rf[((k.inner*n) + i)] + A[((k.inner + (i*m)) + (k.outer*16))])\n", 222 | " }\n", 223 | " }\n", 224 | " }\n", 225 | " }\n", 226 | "}\n", 227 | "produce B {\n", 228 | " for (ax0, 0, n) {\n", 229 | " B[ax0] = 0.000000f\n", 230 | " for (k.inner.v, 0, 16) {\n", 231 | " B[ax0] = (B[ax0] + B.rf[(ax0 + (k.inner.v*n))])\n", 232 | " }\n", 233 | " }\n", 234 | "}\n", 235 | "\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "[reduce(combiner=comm_reducer(result=[(x + y)], lhs=[x], rhs=[y], identity_element=[0.000000f]), source=[B.rf(k.inner.v, ax0)], axis=[iter_var(k.inner.v, Range(min=0, extent=16))], where=(uint1)1, value_index=0)]\n" 253 | ] 254 | } 255 | ], 256 | "source": [ 257 | "print(s[B].op.body)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Cross Thread Reduction " 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "- We can now parallelize over the factored axis. \n", 272 | "- Here the reduction axis of B is marked to be a thread.\n", 273 | "- TVM allows reduction axis to be marked as thread if it is the only axis in reduction and cross thread reduction is possible in the device. " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 15, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "extern \"C\" __global__ void default_function__kernel0( float* __restrict__ A, float* __restrict__ B, int m, int n) {\n", 286 | " float B_rf[1];\n", 287 | " __shared__ float red_buf0[512];\n", 288 | " B_rf[0] = 0.000000e+00f;\n", 289 | " for (int k_outer = 0; k_outer < ((15 + m) / 16); ++k_outer) {\n", 290 | " if ((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) {\n", 291 | " if (((int)threadIdx.x) < (m - (k_outer * 16))) {\n", 292 | " B_rf[0] = (B_rf[0] + A[(((((((int)blockIdx.x) * 32) + ((int)threadIdx.y)) * m) + ((int)threadIdx.x)) + (k_outer * 16))]);\n", 293 | " }\n", 294 | " }\n", 295 | " }\n", 296 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) ? 
B_rf[0] : 0.000000e+00f);\n", 297 | " __syncthreads();\n", 298 | " if (((int)threadIdx.x) < 8) {\n", 299 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((8 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 300 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((4 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 301 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((2 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 302 | " ((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] = (((volatile __shared__ float*)red_buf0)[((((int)threadIdx.y) * 16) + ((int)threadIdx.x))] + ((volatile __shared__ float*)red_buf0)[((1 + (((int)threadIdx.y) * 16)) + ((int)threadIdx.x))]);\n", 303 | " }\n", 304 | " __syncthreads();\n", 305 | " if ((((int)blockIdx.x) * 32) < (n - ((int)threadIdx.y))) {\n", 306 | " if (((int)threadIdx.x) == 0) {\n", 307 | " B[((((int)blockIdx.x) * 32) + ((int)threadIdx.y))] = ((volatile __shared__ float*)red_buf0)[(((int)threadIdx.y) * 16)];\n", 308 | " }\n", 309 | " }\n", 310 | "}\n", 311 | "\n", 312 | "\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "xo, xi = s[B].split(s[B].op.axis[0], factor=32)\n", 318 | "s[B].bind(xo, tvm.thread_axis(\"blockIdx.x\"))\n", 319 | "s[B].bind(xi, tvm.thread_axis(\"threadIdx.y\"))\n", 320 | "tx = tvm.thread_axis(\"threadIdx.x\")\n", 321 | "s[B].bind(s[B].op.reduce_axis[0], tx)\n", 322 | "s[BF].compute_at(s[B], s[B].op.reduce_axis[0])\n", 323 | "s[B].set_store_predicate(tx.var.equal(0))\n", 324 | "fcuda = tvm.build(s, [A, B], \"cuda\")\n", 325 | "print(fcuda.imported_modules[0].get_source())" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 16, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "nn = 128\n", 335 | "ctx = tvm.gpu(0)\n", 336 | "a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)\n", 337 | "b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)\n", 338 | "fcuda(a, b)" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 17, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "np.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "## Describe Convolution via 2D Reduction " 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 18, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "n = tvm.var(\"n\")\n", 364 | "Input = tvm.placeholder(shape=(n, n), name=\"Input\")\n", 365 | "Filter = tvm.placeholder(shape=(3, 3), name=\"Filter\")" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 19, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "di = tvm.reduce_axis(dom=(0, 3), name=\"di\")\n", 375 | "dj = tvm.reduce_axis(dom=(0, 3), name=\"dj\")" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 20, 381 | "metadata": {}, 382 | 
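To make the `rfactor` plus cross-thread reduction above concrete, here is a numpy-only illustration (not TVM code, and not part of the notebook) of what the factored intermediate `B.rf` holds under the `factor=16` split: one partial row sum per lane `k.inner`, followed by a reduction across the lanes.

```python
# NumPy-only illustration of the rfactor decomposition above (factor = 16):
# B_rf[lane, i] is the partial sum of A[i, k] over all k with k % 16 == lane,
# and the final stage reduces B_rf across the lane axis.
import numpy as np

n_rows, m_cols, lanes = 8, 40, 16
A_np = np.random.uniform(size=(n_rows, m_cols)).astype("float32")

B_rf = np.zeros((lanes, n_rows), dtype="float32")
for lane in range(lanes):
    B_rf[lane] = A_np[:, lane::lanes].sum(axis=1)   # per-lane partial sums

B_np = B_rf.sum(axis=0)                              # cross-lane reduction
np.testing.assert_allclose(B_np, A_np.sum(axis=1), rtol=1e-5)
```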
"outputs": [], 383 | "source": [ 384 | "Output = tvm.compute(shape=(n-2, n-2),\n", 385 | " fcompute=lambda i, j: tvm.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),\n", 386 | " name=\"Output\")" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 21, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "s = tvm.create_schedule(ops=Output.op)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 22, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "produce Output {\n", 408 | " for (i, 0, (n + -2)) {\n", 409 | " for (j, 0, (n + -2)) {\n", 410 | " Output[((i*(n + -2)) + j)] = 0.000000f\n", 411 | " for (di, 0, 3) {\n", 412 | " for (dj, 0, 3) {\n", 413 | " Output[((i*(n + -2)) + j)] = (Output[((i*(n + -2)) + j)] + (Input[((j + ((i + di)*n)) + dj)]*Filter[((di*3) + dj)]))\n", 414 | " }\n", 415 | " }\n", 416 | " }\n", 417 | " }\n", 418 | "}\n", 419 | "\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "print(tvm.lower(sch=s, args=[Input, Filter, Output], simple_mode=True))" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "## Define General Commutative Reduction Operation " 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 23, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "n = tvm.var('n')\n", 441 | "m = tvm.var('m')\n", 442 | "product = tvm.comm_reducer(lambda x, y: x*y,\n", 443 | " lambda t: tvm.const(1, dtype=t), name=\"product\")\n", 444 | "A = tvm.placeholder((n, m), name='A')\n", 445 | "k = tvm.reduce_axis((0, m), name='k')\n", 446 | "B = tvm.compute((n,), lambda i: product(A[i, k], axis=k), name='B')" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 24, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "s = tvm.create_schedule(ops=B.op)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 26, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "produce B {\n", 468 | " for (i, 0, n) {\n", 469 | " B[i] = 1.000000f\n", 470 | " for (k, 0, m) {\n", 471 | " B[i] = (B[i]*A[((i*m) + k)])\n", 472 | " }\n", 473 | " }\n", 474 | "}\n", 475 | "\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [] 489 | } 490 | ], 491 | "metadata": { 492 | "kernelspec": { 493 | "display_name": "Python 3", 494 | "language": "python", 495 | "name": "python3" 496 | }, 497 | "language_info": { 498 | "codemirror_mode": { 499 | "name": "ipython", 500 | "version": 3 501 | }, 502 | "file_extension": ".py", 503 | "mimetype": "text/x-python", 504 | "name": "python", 505 | "nbconvert_exporter": "python", 506 | "pygments_lexer": "ipython3", 507 | "version": "3.6.5" 508 | } 509 | }, 510 | "nbformat": 4, 511 | "nbformat_minor": 2 512 | } 513 | -------------------------------------------------------------------------------- /tvm-tutorials/schedule_primitives.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import tvm \n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 
| "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "- There often exist several methods to compute the same result, however, different methods will result in different locality and performance. So TVM asks user to provide how to execute the computation called Schedule.\n", 18 | "\n", 19 | "- A Schedule is a set of transformation of computation that transforms the loop of computations in the program." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# declare some variables for use later\n", 29 | "n = tvm.var(\"n\")\n", 30 | "m = tvm.var(\"m\")" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "- A schedule can be created from a list of ops, by default the schedule computes tensor in a serial manner in a row-major order. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# declare a matrix element-wise multiply \n", 47 | "A = tvm.placeholder(shape=(m, n), name=\"A\")\n", 48 | "B = tvm.placeholder(shape=(m, n), name=\"B\")\n", 49 | "C = tvm.compute(shape=(m, n), \n", 50 | " fcompute=lambda i, j: A[i, j] * B[i, j], name=\"C\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "s = tvm.create_schedule(ops=[C.op])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "lower will transform the computation from definition to the real callable function. With argument `simple_mode=True`, it will return you a readable C like statement" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "produce C {\n", 79 | " for (i, 0, m) {\n", 80 | " for (j, 0, n) {\n", 81 | " C[((i*n) + j)] = (A[((i*n) + j)]*B[((i*n) + j)])\n", 82 | " }\n", 83 | " }\n", 84 | "}\n", 85 | "\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "print(tvm.lower(sch=s, args=[A, B, C], simple_mode=True))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "- One schedule is composed by multiple stages, and one stage represents schedule for one operation. TVM provides various methods to schedule every stage. " 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Split" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "`split` can split a specified axis into two axises by `factor`. 
" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "A = tvm.placeholder(shape=(m, ), name=\"A\")\n", 121 | "B = tvm.compute(shape=(m, ),\n", 122 | " fcompute=lambda i: A[i] * 2,\n", 123 | " name=\"B\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "s = tvm.create_schedule(B.op)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "xo, xi = s[B].split(B.op.axis[0], factor=32)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 9, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "produce B {\n", 154 | " for (i.outer, 0, ((m + 31)/32)) {\n", 155 | " for (i.inner, 0, 32) {\n", 156 | " if (likely(((i.outer*32) < (m - i.inner)))) {\n", 157 | " B[((i.outer*32) + i.inner)] = (A[((i.outer*32) + i.inner)]*2.000000f)\n", 158 | " }\n", 159 | " }\n", 160 | " }\n", 161 | "}\n", 162 | "\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "You can also split a axis by nparts, which splits the axis contrary with factor" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 10, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "produce B {\n", 187 | " for (i.outer, 0, 32) {\n", 188 | " for (i.inner, 0, ((m + 31)/32)) {\n", 189 | " if (likely(((i.outer*((m + 31)/32)) < (m - i.inner)))) {\n", 190 | " if (likely(((0 - i.inner) <= (i.outer*((m + 31)/32))))) {\n", 191 | " B[((i.outer*((m + 31)/32)) + i.inner)] = (A[((i.outer*((m + 31)/32)) + i.inner)]*2.000000f)\n", 192 | " }\n", 193 | " }\n", 194 | " }\n", 195 | " }\n", 196 | "}\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "A = tvm.placeholder(shape=(m, ), name=\"A\")\n", 203 | "B = tvm.compute(shape=(m, ),\n", 204 | " fcompute=lambda i: A[i] * 2,\n", 205 | " name=\"B\")\n", 206 | "s = tvm.create_schedule(ops=B.op)\n", 207 | "bx, tx = s[B].split(B.op.axis[0], nparts=32)\n", 208 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "## tile \n", 216 | "\n", 217 | "`tile` helps execute the computation tile by tile over two axises. 
\n" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 12, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "produce B {\n", 230 | " for (i.outer, 0, ((m + 9)/10)) {\n", 231 | " for (j.outer, 0, ((n + 4)/5)) {\n", 232 | " for (i.inner, 0, 10) {\n", 233 | " for (j.inner, 0, 5) {\n", 234 | " if (likely(((i.outer*10) < (m - i.inner)))) {\n", 235 | " if (likely(((j.outer*5) < (n - j.inner)))) {\n", 236 | " B[(((j.outer*5) + (((i.outer*10) + i.inner)*n)) + j.inner)] = A[(((j.outer*5) + (((i.outer*10) + i.inner)*n)) + j.inner)]\n", 237 | " }\n", 238 | " }\n", 239 | " }\n", 240 | " }\n", 241 | " }\n", 242 | " }\n", 243 | "}\n", 244 | "\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "A = tvm.placeholder(shape=(m, n), name=\"A\")\n", 250 | "B = tvm.compute(shape=(m, n), \n", 251 | " fcompute=lambda i, j: A[i, j], name=\"B\")\n", 252 | "s = tvm.create_schedule(B.op)\n", 253 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n", 254 | "print(tvm.lower(sch=s, args=[A, B], simple_mode=True))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## fuse\n", 262 | "\n", 263 | "`fuse` can fuse two consecutive axises of one computation. " 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "A = tvm.placeholder((m, n), name='A')\n", 273 | "B = tvm.compute((m, n), lambda i, j: A[i, j], name='B')\n", 274 | "\n", 275 | "s = tvm.create_schedule(B.op)\n", 276 | "# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\n", 277 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "produce B {\n", 290 | " for (i.outer, 0, ((m + 9)/10)) {\n", 291 | " for (j.outer, 0, ((n + 4)/5)) {\n", 292 | " for (i.inner.j.inner.fused, 0, 50) {\n", 293 | " if (likely(((i.outer*10) < (m - (i.inner.j.inner.fused/5))))) {\n", 294 | " if (likely(((j.outer*5) < (n - (i.inner.j.inner.fused % 5))))) {\n", 295 | " B[(((j.outer*5) + (i.inner.j.inner.fused % 5)) + (((i.outer*10) + (i.inner.j.inner.fused/5))*n))] = A[(((j.outer*5) + (i.inner.j.inner.fused % 5)) + (((i.outer*10) + (i.inner.j.inner.fused/5))*n))]\n", 296 | " }\n", 297 | " }\n", 298 | " }\n", 299 | " }\n", 300 | " }\n", 301 | "}\n", 302 | "\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused)\n", 308 | "fused = s[B].fuse(xi, yi)\n", 309 | "print(tvm.lower(s, [A, B], simple_mode=True))" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "## reorder \n", 317 | "\n", 318 | "`reorder` can reorder the axises in the specified order." 
319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 16, 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "name": "stdout", 328 | "output_type": "stream", 329 | "text": [ 330 | "produce B {\n", 331 | " for (i.inner, 0, 10) {\n", 332 | " for (j.outer, 0, ((n + 4)/5)) {\n", 333 | " for (i.outer, 0, ((m + 9)/10)) {\n", 334 | " for (j.inner, 0, 5) {\n", 335 | " if (likely((i.inner < (m - (i.outer*10))))) {\n", 336 | " if (likely(((j.outer*5) < (n - j.inner)))) {\n", 337 | " B[(((j.outer*5) + ((i.inner + (i.outer*10))*n)) + j.inner)] = A[(((j.outer*5) + ((i.inner + (i.outer*10))*n)) + j.inner)]\n", 338 | " }\n", 339 | " }\n", 340 | " }\n", 341 | " }\n", 342 | " }\n", 343 | " }\n", 344 | "}\n", 345 | "\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "A = tvm.placeholder((m, n), name='A')\n", 351 | "B = tvm.compute((m, n), lambda i, j: A[i, j], name='B')\n", 352 | "\n", 353 | "s = tvm.create_schedule(B.op)\n", 354 | "# tile to four axises first: (i.outer, j.outer, i.inner, j.inner)\n", 355 | "xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)\n", 356 | "# then reorder the axises: (i.inner, j.outer, i.outer, j.inner)\n", 357 | "s[B].reorder(xi, yo, xo, yi)\n", 358 | "print(tvm.lower(s, [A, B], simple_mode=True))" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "## bind\n", 366 | "`bind` can bind a specified axis with a thread axis, often used in gpu programming." 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 17, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "produce B {\n", 379 | " // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = ((n + 63)/64)\n", 380 | " // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 64\n", 381 | " if (likely(((blockIdx.x*64) < (n - threadIdx.x)))) {\n", 382 | " B[((blockIdx.x*64) + threadIdx.x)] = (A[((blockIdx.x*64) + threadIdx.x)]*2.000000f)\n", 383 | " }\n", 384 | "}\n", 385 | "\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "A = tvm.placeholder((n,), name='A')\n", 391 | "B = tvm.compute(A.shape, lambda i: A[i] * 2, name='B')\n", 392 | "\n", 393 | "s = tvm.create_schedule(B.op)\n", 394 | "bx, tx = s[B].split(B.op.axis[0], factor=64)\n", 395 | "s[B].bind(bx, tvm.thread_axis(\"blockIdx.x\"))\n", 396 | "s[B].bind(tx, tvm.thread_axis(\"threadIdx.x\"))\n", 397 | "print(tvm.lower(s, [A, B], simple_mode=True))\n" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "## compute_at\n", 405 | "\n", 406 | "For a schedule consists of multiple operators, tvm will compute tensors at the root separately by default." 
407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 18, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "name": "stdout", 416 | "output_type": "stream", 417 | "text": [ 418 | "produce B {\n", 419 | " for (i, 0, m) {\n", 420 | " B[i] = (A[i] + 1.000000f)\n", 421 | " }\n", 422 | "}\n", 423 | "produce C {\n", 424 | " for (i, 0, m) {\n", 425 | " C[i] = (B[i]*2.000000f)\n", 426 | " }\n", 427 | "}\n", 428 | "\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "A = tvm.placeholder((m,), name='A')\n", 434 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 435 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 436 | "\n", 437 | "s = tvm.create_schedule(C.op)\n", 438 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "compute_at can move computation of B into the first axis of computation of C." 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 19, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "produce C {\n", 458 | " for (i, 0, m) {\n", 459 | " produce B {\n", 460 | " B[i] = (A[i] + 1.000000f)\n", 461 | " }\n", 462 | " C[i] = (B[i]*2.000000f)\n", 463 | " }\n", 464 | "}\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "A = tvm.placeholder((m,), name='A')\n", 471 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 472 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 473 | "\n", 474 | "s = tvm.create_schedule(C.op)\n", 475 | "s[B].compute_at(s[C], C.op.axis[0])\n", 476 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "## compute_inline\n", 484 | "\n", 485 | "`compute_inline` can mark one stage as inline, then the body of computation will be expanded and inserted at the address where the tensor is required." 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 20, 491 | "metadata": {}, 492 | "outputs": [ 493 | { 494 | "name": "stdout", 495 | "output_type": "stream", 496 | "text": [ 497 | "produce C {\n", 498 | " for (i, 0, m) {\n", 499 | " C[i] = ((A[i]*2.000000f) + 2.000000f)\n", 500 | " }\n", 501 | "}\n", 502 | "\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "A = tvm.placeholder((m,), name='A')\n", 508 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 509 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 510 | "\n", 511 | "s = tvm.create_schedule(C.op)\n", 512 | "s[B].compute_inline()\n", 513 | "print(tvm.lower(s, [A, B, C], simple_mode=True))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "## compute_root\n", 521 | "\n", 522 | "`compute_root` can move computation of one stage to the root." 
523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 21, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "produce B {\n", 535 | " for (i, 0, m) {\n", 536 | " B[i] = (A[i] + 1.000000f)\n", 537 | " }\n", 538 | "}\n", 539 | "produce C {\n", 540 | " for (i, 0, m) {\n", 541 | " C[i] = (B[i]*2.000000f)\n", 542 | " }\n", 543 | "}\n", 544 | "\n" 545 | ] 546 | } 547 | ], 548 | "source": [ 549 | "A = tvm.placeholder((m,), name='A')\n", 550 | "B = tvm.compute((m,), lambda i: A[i]+1, name='B')\n", 551 | "C = tvm.compute((m,), lambda i: B[i]*2, name='C')\n", 552 | "\n", 553 | "s = tvm.create_schedule(C.op)\n", 554 | "s[B].compute_at(s[C], C.op.axis[0])\n", 555 | "s[B].compute_root()\n", 556 | "print(tvm.lower(s, [A, B, C], simple_mode=True))\n" 557 | ] 558 | } 559 | ], 560 | "metadata": { 561 | "kernelspec": { 562 | "display_name": "Python 3", 563 | "language": "python", 564 | "name": "python3" 565 | }, 566 | "language_info": { 567 | "codemirror_mode": { 568 | "name": "ipython", 569 | "version": 3 570 | }, 571 | "file_extension": ".py", 572 | "mimetype": "text/x-python", 573 | "name": "python", 574 | "nbconvert_exporter": "python", 575 | "pygments_lexer": "ipython3", 576 | "version": "3.6.5" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 2 581 | } 582 | --------------------------------------------------------------------------------
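As a closing sketch (not part of the notebooks), the primitives shown in `schedule_primitives.ipynb` compose freely on a single stage: the example below tiles the 2-D copy, fuses the inner pair of axes, splits the fused axis again, and prints the lowered IR so the resulting loop nest can be inspected.

```python
# Sketch: composing tile, fuse, and split on the 2-D copy from the notebook above.
import tvm

m = tvm.var("m")
n = tvm.var("n")
A = tvm.placeholder((m, n), name="A")
B = tvm.compute((m, n), lambda i, j: A[i, j], name="B")

s = tvm.create_schedule(B.op)
xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5)
fused = s[B].fuse(xi, yi)               # (i.inner, j.inner) -> one 50-iteration axis
fo, fi = s[B].split(fused, factor=8)    # split the fused axis once more
print(tvm.lower(s, [A, B], simple_mode=True))
```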