├── .gitignore ├── LICENSE.md ├── README.md ├── aurora ├── __init__.py ├── autodiff │ ├── __init__.py │ ├── autodiff.py │ ├── executor.py │ ├── gradients.py │ ├── math.py │ ├── numerical_gradient.py │ └── utils.py ├── datasets │ ├── __init__.py │ ├── data │ │ └── mnist.pkl.gz │ ├── mnist.py │ └── synthetic.py ├── ndarray │ ├── __init__.py │ ├── _base.py │ ├── gpu_op.py │ └── ndarray.py ├── nn │ ├── __init__.py │ ├── activations.py │ ├── conv.py │ ├── loss_functions.py │ ├── pooling.py │ ├── pyx │ │ ├── __init__.py │ │ ├── fast_pooling.pyx │ │ └── im2col.pyx │ └── utils.py └── optim │ ├── __init__.py │ ├── adam.py │ ├── base.py │ └── sgd.py ├── cuda ├── Makefile └── src │ ├── CMakeLists.txt │ ├── c_runtime_api.cc │ ├── c_runtime_api.h │ ├── cpu_device_api.cc │ ├── cpu_device_api.h │ ├── cuda_device_api.cc │ ├── cuda_device_api.h │ ├── cudnn_operations.cu │ ├── device_api.h │ ├── dlarray.h │ ├── gpu_op.cu │ └── runtime_base.h ├── examples ├── __init__.py ├── mnist.py ├── mnist_cnn.py └── notebooks │ └── mnist_cnn.ipynb ├── requirements.txt ├── resources └── logo.png ├── setup.py └── tests ├── __init__.py ├── nn_primitives ├── __init__.py └── test_cython.py ├── test_autodiff_cpu.py ├── test_autodiff_gpu.py ├── test_gpu_operations.py └── utils ├── __init__.py └── gradient_check.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.pyd 4 | *~ 5 | 6 | build/ 7 | dist/ 8 | .idea 9 | indi.egg-info/ 10 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aurora: Minimal Deep Learning Library
2 | 
3 | Aurora is a minimal deep learning library written in Python, Cython, and C++ with the help of NumPy, CUDA, and cuDNN. Though it is simple, Aurora comes with some of the advanced design concepts found in a typical deep learning library.
4 | 
5 | * Automatic differentiation using static computational graphs.
6 | * Shape and type inference.
7 | * Static memory allocation for efficient training and inference.
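A minimal sketch of these ideas in action, using the `aurora.autodiff` API that appears later in this repository (`Variable`, `gradients`, and `Executor` are all exported by `aurora/autodiff/__init__.py`; the CPU/NumPy execution path is assumed):

```python
import numpy as np
from aurora.autodiff import Variable, Executor, gradients

# build a static graph: y = x^2 + 3x + 1
x = Variable(name='x')
y = x * x + 3.0 * x + 1.0

# reverse-mode autodiff adds gradient nodes to the same static graph
grad_x, = gradients(y, [x])

# the executor infers shapes and then evaluates the requested nodes
executor = Executor([y, grad_x])
y_val, grad_val = executor.run(feed_shapes={x: np.array([2.0, 5.0])})
# y_val -> [11., 41.]; grad_val = 2x + 3 -> [7., 13.]
```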
8 | 
9 | 
10 | ### Installation
11 | 
12 | Aurora relies on several external libraries, including `CUDA`, `cuDNN`, and `NumPy`. For `CUDA` and `cuDNN` installation instructions, please refer to the official documentation. Python dependencies can be installed from the `requirements.txt` file.
13 | 
14 | ##### Environment setup
15 | 
16 | To utilize the GPU capabilities of the Aurora library, you need an NVIDIA GPU. If the `CUDA` toolkit is not already installed, first install the latest version of the `CUDA` toolkit as well as the `cuDNN` library. Next, set the following environment variables.
17 | 
18 | ```bash
19 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
20 | export PATH=/usr/local/cuda/bin:$PATH
21 | ```
22 | 
23 | ##### Cloning the Repository
24 | 
25 | You can clone the Aurora repository using the following command.
26 | 
27 | `git clone https://github.com/upul/Aurora.git`
28 | 
29 | 
30 | ##### Building the GPU Backend
31 | 
32 | Next, you need to build the GPU backend. Go to the `cuda` directory and run the `make` command as shown below.
33 | 
34 | 1. Go to the `cuda` directory (`cd cuda`)
35 | 2. Run `make`
36 | 
37 | ##### Installing the Library
38 | 
39 | Go to the `Aurora` directory and run:
40 | 
41 | 1. `pip install -r requirements.txt`
42 | 2. `pip install .`
43 | 
44 | 
45 | ### Examples
46 | 
47 | The following are some notable examples. For the complete list of examples, please refer to the [`examples`](https://github.com/upul/Aurora/tree/master/examples) directory. Also, for Jupyter notebooks, please refer to the [`examples/notebooks`](https://github.com/upul/Aurora/tree/master/examples/notebooks) folder.
48 | 
49 | 1. [mnist](https://github.com/upul/Aurora/blob/master/examples/mnist.py)
50 | 2. [mnist_cnn](https://github.com/upul/Aurora/blob/master/examples/mnist_cnn.py)
51 | 
52 | 
53 | ### Future Work
54 | 
55 | The following features will be added in upcoming releases.
56 | 
57 | * Dropout and Batch Normalization.
58 | * High-level API similar to Keras.
59 | * Ability to load pre-trained models.
60 | * Model checkpointing.
61 | 
62 | 
63 | ### Acknowledgement
64 | 
65 | It all started with the [CSE 599G1: Deep Learning System Design](http://dlsys.cs.washington.edu/) course. This course really helped me to understand the fundamentals of deep learning system design. My answers to the two programming assignments of [CSE 599G1](http://dlsys.cs.washington.edu/) were the foundation of the Aurora library. So I would like to acknowledge with much appreciation the instructors and teaching assistants of the [CSE 599G1](http://dlsys.cs.washington.edu/) course.
66 | 
67 | 
68 | ### References
69 | 
70 | 1. [CSE 599G1: Deep Learning System Design](http://dlsys.cs.washington.edu/)
71 | 2. [MXNet Architecture](https://mxnet.incubator.apache.org/architecture/index.html)
72 | 3. [Parallel Programming With CUDA | Udacity](https://www.udacity.com/course/intro-to-parallel-programming--cs344)
73 | 4. [Programming Massively Parallel Processors, Third Edition: A Hands-on Approach 3rd Edition](https://www.amazon.com/Programming-Massively-Parallel-Processors-Hands/dp/0128119861/ref=pd_sim_14_3?_encoding=UTF8&psc=1&refRID=1Z3KFKEPTFQJE7MZQ40G)
74 | 
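Once the backend is built and the library installed, you can check whether the GPU runtime was picked up. This relies on the optional import in `aurora/__init__.py` (shown next), which appends `'ndarray'` to `__all__` only when the compiled `libc_runtime_api.so` loads cleanly:

```python
import aurora

# True when the CUDA backend was built and loaded; False on CPU-only installs
print('GPU backend available:', 'ndarray' in aurora.__all__)
```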
--------------------------------------------------------------------------------
/aurora/__init__.py:
--------------------------------------------------------------------------------
1 | import aurora.nn
2 | import aurora.optim
3 | import aurora.datasets
4 | 
5 | __all__ = ['nn', 'optim', 'datasets']
6 | 
7 | try:
8 |     from aurora.ndarray import gpu_op
9 | 
10 |     __all__.append("ndarray")
11 | except ImportError:
12 |     pass
13 | 
--------------------------------------------------------------------------------
/aurora/autodiff/__init__.py:
--------------------------------------------------------------------------------
1 | from .autodiff import Node
2 | from .autodiff import Parameter
3 | from .autodiff import Variable
4 | from .autodiff import broadcast_to
5 | from .autodiff import matmul
6 | from .autodiff import reduce_sum
7 | from .autodiff import reshape
8 | from .executor import Executor
9 | from .gradients import gradients
10 | from .math import tanh
11 | from .numerical_gradient import eval_numerical_grad
12 | 
13 | __all__ = ["Variable", "Parameter", "gradients", "Node", "Executor",
14 |            "reduce_sum", "broadcast_to", "matmul",
15 |            "tanh", 'eval_numerical_grad', 'reshape']
16 | 
--------------------------------------------------------------------------------
/aurora/autodiff/autodiff.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | try:
3 |     from aurora.ndarray import gpu_op, ndarray
4 | except ImportError:
5 |     pass
6 | 
7 | 
8 | class Node(object):
9 |     """ Node object represents a node in the computational graph"""
10 | 
11 |     def __init__(self):
12 |         """ New node will be created by an Op object's __call__ method"""
13 |         # list of inputs to this node
14 |         self.inputs = []
15 |         # operator
16 |         self.op = None
17 |         # constants
18 |         self.const = None
19 |         # name of the node, mainly used for debugging
20 |         self.name = ""
21 | 
22 |     def __add__(self, other):
23 |         """ Adds two nodes and returns a new node"""
24 |         if isinstance(other, Node):
25 |             return add(self, other)
26 |         else:
27 |             return add_const(self, other)
28 | 
29 |     def __sub__(self, other):
30 |         if isinstance(other, Node):
31 |             return sub(self, other)
32 |         else:
33 |             return sub_const(self, other)
34 | 
35 |     def __rsub__(self, other):
36 |         return ref_sub_const(self, other)
37 | 
38 |     def __mul__(self, other):
39 |         if isinstance(other, Node):
40 |             return mul(self, other)
41 |         else:
42 |             return mul_const(self, other)
43 | 
44 |     def __truediv__(self, other):
45 |         if isinstance(other, Node):
46 |             return div(self, other)
47 |         else:
48 |             return div_const(self, other)
49 | 
50 |     # Allow left-hand-side add and multiply.
51 |     __radd__ = __add__
52 |     __rmul__ = __mul__
53 |     __rdiv__ = __truediv__
54 | 
55 | 
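A quick illustration of the dispatch above: two `Node` operands route to the binary ops, while a `Node`/constant mix routes to the `*_const` singletons defined at the bottom of this module (node names follow the `format` strings in each op):

```python
from aurora.autodiff import Variable

x = Variable(name='x')
y = Variable(name='y')

z = x + y    # Node + Node  -> AddOp,        z.name == '(x+y)'
w = x + 2.0  # Node + const -> AddByConstOp, w.name == '(x+2.000000)'
v = 3.0 * x  # __rmul__ = __mul__, so scalars may appear on the left
```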
56 | class Op(object):
57 |     """ Op class represents operations performed on nodes"""
58 | 
59 |     def __call__(self):
60 |         """
61 |         Create a new node which represents an operation performed on the graph
62 | 
63 |         Parameters
64 |         ----------
65 |         None
66 | 
67 |         Returns
68 |         -------
69 |         Node
70 |             The new node object
71 |         """
72 |         new_node = Node()
73 |         new_node.op = self
74 |         return new_node
75 | 
76 |     def compute(self, node, input_vals, output_val, use_numpy=True):
77 |         """
78 |         Given the values of input nodes, compute the output value
79 | 
80 |         Parameters
81 |         ----------
82 |         :param node: Node that performs the computation
83 |         :param input_vals: Values of the input nodes
84 |         :param output_val: Output buffer; the result is written into it in-place
85 |         :param use_numpy: If True, compute with NumPy; otherwise use the GPU backend
86 | 
87 |         Returns
88 |         -------
89 |         :return: The output value of the node
90 |         """
91 |         raise NotImplementedError
92 | 
93 |     def gradient(self, node, output_grads):
94 |         """
95 |         Given the value of output gradients this operation calculates the
96 |         gradient contribution of each input node
97 | 
98 |         Parameters
99 |         ----------
100 |         :param node:
101 |         :param output_grads:
102 | 
103 |         Returns
104 |         -------
105 |         :return: A list of gradient contributions to each input node respectively
106 |         """
107 |         raise NotImplementedError
108 | 
109 |     def infer_shape(self, node, input_shapes):
110 |         raise NotImplementedError
111 | 
112 | 
113 | class AddOp(Op):
114 |     """
115 |     Op to element-wise add two nodes
116 |     """
117 | 
118 |     def __call__(self, nodeA, nodeB):
119 |         """
120 |         This operator adds two nodes element-wise
121 | 
122 |         Parameters
123 |         ----------
124 |         :param nodeA: LHS operand
125 |         :param nodeB: RHS operand
126 | 
127 |         Returns
128 |         -------
129 |         :return: A new Node which represents the element-wise plus operation
130 |         """
131 |         new_node = Op.__call__(self)
132 |         new_node.inputs = [nodeA, nodeB]
133 |         new_node.name = '({}+{})'.format(nodeA.name, nodeB.name)
134 |         return new_node
135 | 
136 |     def compute(self, node, input_vals, output_val, use_numpy=True):
137 |         """
138 |         Given values of two input nodes, return result of element-wise addition.
139 |         Parameters
140 |         ----------
141 |         :param node:
142 |         :param input_vals: List of two input values
143 | 
144 |         Returns
145 |         --------
146 |         :return: The result of the element-wise addition operation
147 |         """
148 |         assert len(input_vals) == 2
149 |         # return input_vals[0] + input_vals[1]
150 |         if use_numpy:
151 |             output_val[:] = input_vals[0] + input_vals[1]
152 |         else:
153 |             if input_vals[0].shape == input_vals[1].shape:
154 |                 gpu_op.matrix_elementwise_add(input_vals[0], input_vals[1], output_val)
155 |             elif input_vals[0].shape == (1,):
156 |                 const = input_vals[0].asnumpy()[0]  # TODO: (upul) do we need this ? check it?
157 |                 gpu_op.matrix_elementwise_add_by_const(input_vals[1], const, output_val)
158 |             elif input_vals[1].shape == (1,):
159 |                 const = input_vals[1].asnumpy()[0]  # TODO: (upul) do we need this ? check it?
160 | gpu_op.matrix_elementwise_add_by_const(input_vals[0], const, output_val) 161 | else: 162 | pass # TODO: (upul) handle input[0] and input[1] in different shapes 163 | 164 | def gradient(self, node, output_grads): 165 | """ 166 | Given the values of output gradients, calculate the gradients of input nodes 167 | 168 | Parameters 169 | ---------- 170 | :param node: 171 | :param output_grads: Gradient contribution of output nodes 172 | 173 | Returns 174 | ------- 175 | :return: A list of gradient contribution of output nodes 176 | """ 177 | return [output_grads, output_grads] 178 | 179 | def infer_shape(self, node, input_shapes): 180 | assert len(input_shapes) == 2 181 | assert input_shapes[0] == input_shapes[1] 182 | return input_shapes[0] 183 | 184 | 185 | class AddByConstOp(Op): 186 | """ 187 | Operator represents the element-wise addition of a node and a const 188 | """ 189 | 190 | def __call__(self, node_A, const_val): 191 | """ 192 | 193 | :param node: 194 | :param const_val: 195 | :return: 196 | """ 197 | new_node = Op.__call__(self) 198 | new_node.const = const_val 199 | new_node.inputs = [node_A] 200 | new_node.name = '({0:s}+{1:f})'.format(node_A.name, const_val) 201 | return new_node 202 | 203 | def compute(self, node, input_vals, output_val, use_numpy=True): 204 | """ 205 | 206 | :param node: 207 | :param input_vals: 208 | :return: 209 | """ 210 | assert len(input_vals) == 1 211 | if use_numpy: 212 | output_val[:] = node.const + input_vals[0] 213 | else: 214 | gpu_op.matrix_elementwise_add_by_const( 215 | input_vals[0], node.const, output_val) 216 | 217 | def gradient(self, node, output_grads): 218 | """ 219 | 220 | :param node: 221 | :param output_grads: 222 | :return: 223 | """ 224 | return [output_grads] 225 | 226 | def infer_shape(self, node, input_shapes): 227 | assert len(input_shapes) == 1 228 | # assert node.const.shape == input_shapes[0] 229 | return input_shapes[0] 230 | 231 | 232 | class SubOp(Op): 233 | def __call__(self, node_A, node_B): 234 | new_node = Op.__call__(self) 235 | new_node.inputs = [node_A, node_B] 236 | new_node.name = '({0:s}-{1:s})'.format(node_A.name, node_B.name) 237 | return new_node 238 | 239 | def compute(self, node, input_vals, output_val, use_numpy=True): 240 | assert len(input_vals) == 2 241 | if use_numpy: 242 | output_val[:] = input_vals[0] - input_vals[1] 243 | else: 244 | gpu_op.matrix_elementwise_subtract(input_vals[0], input_vals[1], output_val) 245 | 246 | def gradient(self, node, output_grads): 247 | return [output_grads, -1 * output_grads] 248 | 249 | def infer_shape(self, node, input_shapes): 250 | assert len(input_shapes) == 2 251 | assert input_shapes[0] == input_shapes[1] 252 | return input_shapes[0] 253 | 254 | 255 | class SubByConstOp(Op): 256 | def __call__(self, node_A, const_val): 257 | new_node = Op.__call__(self) 258 | new_node.inputs = [node_A] 259 | new_node.const = const_val 260 | new_node.name = '({0:s}-{1:f})'.format(node_A.name, const_val) 261 | return new_node 262 | 263 | def compute(self, node, input_vals, output_val, use_numpy=True): 264 | assert len(input_vals) == 1 265 | if use_numpy: 266 | output_val[:] = input_vals[0] - node.const 267 | else: 268 | gpu_op.matrix_elementwise_subtract_by_const(input_vals[0], node.const, output_val) 269 | 270 | def gradient(self, node, output_grads): 271 | return [output_grads] 272 | 273 | def infer_shape(self, node, input_shapes): 274 | assert len(input_shapes) == 1 275 | return input_shapes[0] 276 | 277 | 278 | class ReflectedSubByConstOp(Op): 279 | def __call__(self, 
node_A, const_val):
280 |         new_node = Op.__call__(self)
281 |         new_node.inputs = [node_A]
282 |         new_node.const = const_val
283 |         new_node.name = '({0:f}-{1:s})'.format(const_val, node_A.name)
284 |         return new_node
285 | 
286 |     def compute(self, node, input_vals, output_val, use_numpy=True):
287 |         assert len(input_vals) == 1
288 |         output_val[:] = node.const - input_vals[0]  # write in-place; no GPU kernel for reflected subtraction yet
289 | 
290 |     def gradient(self, node, output_grads):
291 |         return [-1 * output_grads]
292 | 
293 |     def infer_shape(self, node, input_shapes):
294 |         assert len(input_shapes) == 1
295 |         return input_shapes[0]
296 | 
297 | 
298 | class OnesLikeOp(Op):
299 |     def __call__(self, node_A):
300 |         new_node = Op.__call__(self)
301 |         new_node.inputs = [node_A]
302 |         new_node.name = 'Oneslike({})'.format(node_A.name)
303 |         return new_node
304 | 
305 |     def compute(self, node, input_vals, output_val, use_numpy=True):
306 |         assert len(input_vals) == 1
307 |         if use_numpy:
308 |             assert isinstance(input_vals[0], np.ndarray)
309 |             output_val[:] = np.ones(input_vals[0].shape)
310 |         else:
311 |             gpu_op.array_set(output_val, 1)
312 | 
313 |     def gradient(self, node, output_grads):
314 |         return [zeros_like(node.inputs[0])]
315 | 
316 |     def infer_shape(self, node, input_shapes):
317 |         assert len(input_shapes) == 1
318 |         if input_shapes[0] == 1:  # TODO (upul) do we need this if ?
319 |             return (1,)
320 |         else:
321 |             return input_shapes[0]
322 | 
323 | 
324 | class ZerosLikeOp(Op):
325 |     def __call__(self, node_A):
326 |         new_node = Op.__call__(self)
327 |         new_node.inputs = [node_A]
328 |         new_node.name = 'Zeroslike({})'.format(node_A.name)
329 |         return new_node
330 | 
331 |     def compute(self, node, input_vals, output_val, use_numpy=True):
332 |         assert len(input_vals) == 1
333 |         if use_numpy:
334 |             assert isinstance(input_vals[0], np.ndarray)
335 |             output_val[:] = np.zeros(input_vals[0].shape)
336 |         else:
337 |             gpu_op.array_set(output_val, 0)
338 | 
339 |     def gradient(self, node, output_grads):
340 |         return [zeros_like(node.inputs[0])]
341 | 
342 |     def infer_shape(self, node, input_shapes):
343 |         assert len(input_shapes) == 1
344 |         if input_shapes[0] == 1:  # TODO (upul) do we need this if ?
345 |             return (1,)
346 |         else:
347 |             return input_shapes[0]
348 | 
349 | 
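A handy way to sanity-check gradients of ops like the ones above is `eval_numerical_grad` from `numerical_gradient.py` (later in this tree). A minimal sketch on the CPU path, using an expression that exercises `ReflectedSubByConstOp` via `__rsub__`:

```python
import numpy as np
from aurora.autodiff import Variable, Executor, gradients, eval_numerical_grad

x = Variable(name='x')
y = (1.0 - x) * x                      # d/dx [(1 - x) * x] = 1 - 2x
grad_x, = gradients(y, [x])

x_val = np.array([0.1, 0.4])
analytic, = Executor([grad_x]).run(feed_shapes={x: x_val})
numeric = eval_numerical_grad(y, {x: x_val}, wrt=x)
# both should be close to [0.8, 0.2]
```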
350 | class ReshapeOp(Op):
351 |     def __call__(self, node_A, newshape):
352 |         new_node = Op.__call__(self)
353 |         new_node.inputs = [node_A]
354 |         new_node.newshape = newshape
355 |         new_node.name = 'Reshape({})'.format(node_A.name)
356 |         return new_node
357 | 
358 |     def compute(self, node, input_vals, output_val, use_numpy=True):
359 |         assert len(input_vals) == 1
360 |         if use_numpy:
361 |             assert isinstance(input_vals[0], np.ndarray)
362 |             output_val[:] = np.reshape(input_vals[0], newshape=node.newshape)
363 |         else:
364 |             # TODO: (upul) changing shape is not an expensive operation, but this
365 |             #     : looks a bit ugly. Can't we find an alternative approach?
366 |             input_shape = input_vals[0].shape
367 |             ndarray.reshape(output_val, input_shape)
368 |             input_vals[0].copyto(output_val)
369 |             ndarray.reshape(output_val, node.newshape)
370 | 
371 |     def gradient(self, node, output_grads):
372 |         return [reshape_grad(node.inputs[0], output_grads)]
373 | 
374 |     def infer_shape(self, node, input_shapes):
375 |         assert len(input_shapes) == 1
376 |         return node.newshape
377 | 
378 | 
379 | class ReshapeGradientOp(Op):
380 |     def __call__(self, node_A, node_B):
381 |         new_node = Op.__call__(self)
382 |         new_node.inputs = [node_A, node_B]
383 |         new_node.name = 'ReshapeGradientOp({0:s})'.format(node_A.name)
384 |         return new_node
385 | 
386 |     def compute(self, node, input_vals, output_val, use_numpy=True):
387 |         assert len(input_vals) == 2
388 |         if use_numpy:
389 |             output_val[:] = input_vals[1].reshape(input_vals[0].shape)
390 |         else:
391 |             # TODO: (upul) changing shape is not an expensive operation, but this
392 |             #     : looks a bit ugly. Can't we find an alternative approach?
393 |             ndarray.reshape(output_val, input_vals[0].shape)
394 |             input_vals[1].copyto(output_val)
395 | 
396 |     def gradient(self, node, output_grads):
397 |         raise NotImplementedError('Gradient of ReshapeGradientOp not supported')
398 | 
399 |     def infer_shape(self, node, input_shapes):
400 |         assert len(input_shapes) == 2
401 |         return input_shapes[0]
402 | 
403 | 
404 | class MulOp(Op):
405 |     def __call__(self, node_A, node_B):
406 |         new_node = Op.__call__(self)
407 |         new_node.inputs = [node_A, node_B]
408 |         new_node.name = '({0:s}*{1:s})'.format(node_A.name, node_B.name)
409 |         return new_node
410 | 
411 |     def compute(self, node, input_vals, output_val, use_numpy=True):
412 |         assert len(input_vals) == 2
413 |         if use_numpy:
414 |             output_val[:] = input_vals[0] * input_vals[1]
415 |         else:
416 |             ip_1_shape = input_vals[0].shape
417 |             ip_2_shape = input_vals[1].shape
418 |             if ip_1_shape == ip_2_shape:
419 |                 gpu_op.matrix_elementwise_multiply(input_vals[0], input_vals[1], output_val)
420 |             elif ip_1_shape == (1,):
421 |                 const_val = input_vals[0].asnumpy()[0]
422 |                 gpu_op.matrix_elementwise_multiply_by_const(input_vals[1], const_val, output_val)
423 |             elif ip_2_shape == (1,):
424 |                 const_val = input_vals[1].asnumpy()[0]
425 |                 gpu_op.matrix_elementwise_multiply_by_const(input_vals[0], const_val, output_val)
426 |             else:
427 |                 pass  # TODO (upul) handle ip_1_shape != ip_2_shape
428 | 
429 |     def gradient(self, node, output_grads):
430 |         return [node.inputs[1] * output_grads, node.inputs[0] * output_grads]
431 | 
432 |     def infer_shape(self, node, input_shapes):
433 |         assert len(input_shapes) == 2
434 |         if input_shapes[0] == (1,):
435 |             return input_shapes[1]
436 |         elif input_shapes[1] == (1,):
437 |             return input_shapes[0]
438 |         elif input_shapes[0] == input_shapes[1]:
439 |             return input_shapes[0]
440 |         else:
441 |             stmt = 'Invalid dimensions {0}, {1}'.format(input_shapes[0], input_shapes[1])
442 |             raise RuntimeError(stmt)
443 | 
444 | 
445 | class MulByConstOp(Op):
446 |     def __call__(self, node_A, const_val):
447 |         new_node = Op.__call__(self)
448 |         new_node.inputs = [node_A]
449 |         new_node.const = const_val
450 |         new_node.name = '({0:s}*{1:f})'.format(node_A.name, const_val)
451 |         return new_node
452 | 
453 |     def compute(self, node, input_vals, output_val, use_numpy=True):
454 |         assert len(input_vals) == 1
455 |         if use_numpy:
456 |             output_val[:] = node.const * input_vals[0]
457 |         else:
458 |             gpu_op.matrix_elementwise_multiply_by_const(
459 |                 input_vals[0], node.const, output_val)
460 | 
461 |     def gradient(self, node, output_grads):
462 |         return [node.const * output_grads]
463 | 
464 |     def infer_shape(self, node, input_shapes):
465 |         assert len(input_shapes) == 1
466 |         return input_shapes[0]
467 | 
468 | 
469 | class DivOp(Op):
470 |     def __call__(self, node_A, node_B):
471 |         new_node = Op.__call__(self)
472 |         new_node.inputs = [node_A, node_B]
473 |         new_node.name = '({0:s}/{1:s})'.format(node_A.name, node_B.name)
474 |         return new_node
475 | 
476 |     def compute(self, node, input_vals, output_val, use_numpy=True):
477 |         assert len(input_vals) == 2
478 |         if use_numpy:
479 |             output_val[:] = input_vals[0] / input_vals[1]
480 |         else:
481 |             gpu_op.matrix_elementwise_division(input_vals[0], input_vals[1], output_val)
482 | 
483 |     def gradient(self, node, output_grads):
484 |         grad_A = output_grads / node.inputs[1]
485 |         grad_B = -1.0 * output_grads * node.inputs[0] / (node.inputs[1] * node.inputs[1])
486 |         return [grad_A, grad_B]
487 | 
488 |     def infer_shape(self, node, input_shapes):
489 |         assert len(input_shapes) == 2
490 |         assert input_shapes[0] == input_shapes[1]
491 |         return input_shapes[0]
492 | 
493 | 
494 | class DivByConstOp(Op):
495 |     def __call__(self, node_A, const_val):
496 |         new_node = Op.__call__(self)
497 |         new_node.inputs = [node_A]
498 |         new_node.const = const_val
499 |         new_node.name = '({0:s}/{1:f})'.format(node_A.name, const_val)
500 |         return new_node
501 | 
502 |     def compute(self, node, input_vals, output_val, use_numpy=True):
503 |         assert len(input_vals) == 1
504 |         if use_numpy:
505 |             output_val[:] = input_vals[0] / node.const
506 |         else:
507 |             gpu_op.matrix_elementwise_div_by_const(input_vals[0], node.const, output_val)
508 | 
509 |     def gradient(self, node, output_grads):
510 |         return [output_grads / node.const]
511 | 
512 |     def infer_shape(self, node, input_shapes):
513 |         assert len(input_shapes) == 1
514 |         return input_shapes[0]
515 | 
516 | 
517 | class PlaceholderOp(Op):
518 |     """Op to feed values to nodes."""
519 | 
520 |     def __call__(self):
521 |         """Creates a variable node."""
522 |         new_node = Op.__call__(self)
523 |         return new_node
524 | 
525 |     def compute(self, node, input_vals, output_val, use_numpy=True):
526 |         """No compute function since node value is fed directly in Executor."""
527 |         assert False, "placeholder values provided by feed_dict"
528 | 
529 |     def gradient(self, node, output_grad):
530 |         """No gradient function since node has no inputs."""
531 |         return None
532 | 
533 | 
534 | class ReduceSumOp(Op):
535 |     """
536 |     Op to sum an array over its first (batch) axis
537 |     """
538 | 
539 |     def __call__(self, node_A):
540 |         new_node = Op.__call__(self)
541 |         new_node.inputs = [node_A]
542 |         new_node.name = 'ReduceSum({0:s})'.format(node_A.name)
543 |         return new_node
544 | 
545 |     def compute(self, node, input_vals, output_val, use_numpy=True):
546 |         """
547 |         Sums input_vals[0] over axis 0 into output_val
548 |         :param node:
549 |         :param input_vals:
550 |         :param output_val:
551 |         :param use_numpy:
552 |         :return:
553 |         """
554 |         assert len(input_vals) == 1
55 |         if use_numpy:
556 |             assert isinstance(output_val, np.ndarray)
557 |             output_val[:] = np.sum(input_vals[0], axis=0)
558 |         else:
559 |             gpu_op.reduce_sum_axis_zero(input_vals[0], output_val)
560 | 
561 |     def gradient(self, node, output_grads):
562 |         return [broadcast_to(output_grads, node.inputs[0])]  # expand the gradient back to the input's shape
563 | 
564 |     def infer_shape(self, node, input_shapes):
565 |         assert len(input_shapes) == 1
566 |         if len(input_shapes[0]) == 1:
567 |             return (1,)
568 |         else:
569 |             return tuple(input_shapes[0][i]
570 |                          for i in range(1, len(input_shapes[0])))
571 | 
572 | 
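`reduce_sum` and the `broadcast_to` op below are duals: broadcasting replicates a row across axis 0 and summing over axis 0 undoes it, which is exactly how each serves as the other's gradient. A small CPU-path sketch:

```python
import numpy as np
from aurora.autodiff import Variable, Executor, broadcast_to, reduce_sum

b = Variable(name='b')    # e.g. a bias of shape (3,)
X = Variable(name='X')    # a batch of shape (4, 3)
B = broadcast_to(b, X)    # replicate b across the batch axis -> (4, 3)
S = reduce_sum(B)         # sum over axis 0 -> back to shape (3,)

out, = Executor([S]).run(feed_shapes={b: np.ones(3), X: np.zeros((4, 3))})
# out -> [4., 4., 4.]
```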
573 | class BroadcastToOp(Op):
574 |     def __call__(self, node_A, node_B):
575 |         new_node = Op.__call__(self)
576 |         new_node.inputs = [node_A, node_B]
577 |         new_node.name = 'BroadcastTo({0:s}, {1:s}.shape)'.format(node_A.name, node_B.name)
578 |         return new_node
579 | 
580 |     def compute(self, node, input_vals, output_val, use_numpy=True):
581 |         assert len(input_vals) == 2
582 |         if use_numpy:
583 |             output_val[:] = np.broadcast_to(input_vals[0], input_vals[1].shape)
584 |         else:
585 |             gpu_op.broadcast_to(input_vals[0], output_val)
586 | 
587 |     def gradient(self, node, output_grads):
588 |         grad_A = reduce_sum(output_grads)
589 |         grad_B = zeros_like(node.inputs[1])
590 |         return [grad_A, grad_B]
591 | 
592 |     def infer_shape(self, node, input_shapes):
593 |         assert len(input_shapes) == 2
594 |         return input_shapes[1]
595 | 
596 | 
597 | class MatMulOp(Op):  # TODO: (upul) double check what this class is doing
598 |     def __call__(self, node_A, node_B, trans_A=False, trans_B=False):
599 |         new_node = Op.__call__(self)
600 |         new_node.inputs = [node_A, node_B]
601 |         new_node.trans_A = trans_A
602 |         new_node.trans_B = trans_B
603 |         new_node.name = 'MatMul({0:s}, {1:s})'.format(node_A.name, node_B.name)
604 |         return new_node
605 | 
606 |     def compute(self, node, input_vals, output_val, use_numpy=True):
607 |         assert len(input_vals) == 2
608 |         if use_numpy:
609 |             if node.trans_A:
610 |                 input_vals[0] = input_vals[0].T
611 |             if node.trans_B:
612 |                 input_vals[1] = input_vals[1].T
613 |             output_val[:] = np.dot(input_vals[0], input_vals[1])
614 |         else:
615 |             gpu_op.matrix_multiply(
616 |                 input_vals[0], node.trans_A,
617 |                 input_vals[1], node.trans_B,
618 |                 output_val)
619 | 
620 |     def gradient(self, node, output_grads):
621 |         grad_A = matmul(output_grads, node.inputs[1], trans_A=False, trans_B=True)
622 |         grad_B = matmul(node.inputs[0], output_grads, trans_A=True, trans_B=False)
623 |         return [grad_A, grad_B]
624 | 
625 |     def infer_shape(self, node, input_shapes):
626 |         """Need to handle input_vals[0].shape != input_vals[1].shape"""
627 |         assert len(input_shapes) == 2
628 |         (row_A, col_A) = input_shapes[0]
629 |         if node.trans_A:
630 |             row_A, col_A = col_A, row_A
631 |         (row_B, col_B) = input_shapes[1]
632 |         if node.trans_B:
633 |             row_B, col_B = col_B, row_B
634 | 
635 |         assert col_A == row_B
636 |         return (row_A, col_B)
637 | 
638 | 
639 | def Variable(name):
640 |     """User-defined variables in an expression.
641 |     e.g. x = Variable(name="x")
642 |     """
643 |     placeholder_node = placeholder()
644 |     placeholder_node.name = name
645 |     return placeholder_node
646 | 
647 | 
648 | def Parameter(name, init):
649 |     """
650 |     example: w = Parameter(name='w', init=...)
651 | :param name: 652 | :param init: 653 | :return: 654 | """ 655 | parameter_node = placeholder() 656 | parameter_node.name = name 657 | parameter_node.const = init 658 | return parameter_node 659 | 660 | 661 | # Global singleton operations 662 | add = AddOp() 663 | add_const = AddByConstOp() 664 | sub = SubOp() 665 | sub_const = SubByConstOp() 666 | ref_sub_const = ReflectedSubByConstOp() 667 | mul = MulOp() 668 | mul_const = MulByConstOp() 669 | div = DivOp() 670 | div_const = DivByConstOp() 671 | zeros_like = ZerosLikeOp() 672 | ones_like = OnesLikeOp() 673 | reduce_sum = ReduceSumOp() 674 | broadcast_to = BroadcastToOp() 675 | reshape = ReshapeOp() 676 | reshape_grad = ReshapeGradientOp() 677 | matmul = MatMulOp() 678 | placeholder = PlaceholderOp() 679 | -------------------------------------------------------------------------------- /aurora/autodiff/executor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aurora.autodiff.autodiff import PlaceholderOp 3 | from .utils import find_topo_sort 4 | try: 5 | from aurora.ndarray import gpu_op, ndarray 6 | except ImportError: 7 | pass 8 | 9 | 10 | class Executor: 11 | """ 12 | 13 | """ 14 | 15 | def __init__(self, eval_list, use_gpu=False): 16 | """ 17 | Executor computes values for a given subset of nodes in a computation graph. 18 | 19 | Parameters: 20 | ----------- 21 | :param eval_list: Values of the nodes of this list need to be computed 22 | """ 23 | self.eval_node_list = eval_list 24 | self.ctx = None 25 | if use_gpu: 26 | self.ctx = ndarray.gpu(0) 27 | 28 | self.topo_order = find_topo_sort(self.eval_node_list) 29 | self.node_to_arr_map = None 30 | self.node_to_shape_map = None 31 | self.feed_shapes = None 32 | 33 | def infer_shape(self, feed_shapes): 34 | """ 35 | Given the shapes of the feed_shapes dictionary, we infer shapes of all nodes in the graph 36 | :param feed_shapes: 37 | :return: 38 | """ 39 | self.node_to_shape_map = {} 40 | for node in self.topo_order: 41 | if node in self.node_to_shape_map: 42 | continue 43 | 44 | # TODO (upul): following if condition looks like a hack. 
Find a better approach
45 |             if isinstance(node.op, PlaceholderOp) and node.const is not None:
46 |                 self.node_to_shape_map[node] = node.const.shape
47 |                 continue
48 | 
49 |             if node in feed_shapes:
50 |                 self.node_to_shape_map[node] = feed_shapes[node]
51 |             else:
52 |                 input_shapes = []
53 |                 for input_node in node.inputs:
54 |                     input_shapes.append(self.node_to_shape_map[input_node])
55 | 
56 |                 self.node_to_shape_map[node] = node.op.infer_shape(node, input_shapes)
57 | 
58 |     def memory_plan(self, feed_shapes):
59 |         """
60 |         Allocates output buffers for every non-placeholder node in the graph
61 |         :param feed_shapes:
62 |         :return:
63 |         """
64 |         # topo_order = find_topo_sort(self.eval_node_list)  # TODO (upul) cache this
65 |         # self.node_to_arr_map = {}
66 |         # for node in topo_order:
67 |         #     self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx)
68 | 
69 |         if self.node_to_arr_map is None:
70 |             self.node_to_arr_map = {}
71 | 
72 |         for node in self.topo_order:
73 |             if node in feed_shapes:
74 |                 continue
75 |             self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx)
76 | 
77 |     def run(self, feed_shapes, convert_to_numpy_ret_vals=False):
78 |         """
79 |         Values of the nodes given in eval_list are evaluated against the supplied feed dictionary
80 | 
81 |         Parameters
82 |         ----------
83 |         :param feed_shapes: A dictionary of nodes whose values are specified by the user
84 | 
85 |         Returns
86 |         -------
87 |         :return: Values of the nodes specified by the eval_list
88 |         """
89 |         def are_feed_shapes_equal(sa, sb):
90 |             if (not isinstance(sa, dict)) or (not isinstance(sb, dict)):
91 |                 return False
92 |             unmatched_item = set(sa.items()) ^ set(sb.items())
93 |             return len(unmatched_item) == 0
94 | 
95 |         # Assume self.ctx is None implies numpy array and numpy ops.
96 |         use_numpy = self.ctx is None
97 |         node_to_val_map = {}
98 |         for node, value in feed_shapes.items():
99 |             if use_numpy:
100 |                 # all values passed in feed_dict must be np.ndarray
101 |                 assert isinstance(value, np.ndarray)
102 |                 node_to_val_map[node] = value
103 |             else:
104 |                 # convert values to ndarray.NDArray if necessary
105 |                 if isinstance(value, np.ndarray):
106 |                     node_to_val_map[node] = ndarray.array(value, ctx=self.ctx)
107 |                 elif isinstance(value, ndarray.NDArray):
108 |                     node_to_val_map[node] = value
109 |                 else:
110 |                     assert False, "feed_dict value type not supported"
111 | 
112 |         # collect shapes for all placeholders
113 |         feed_shapes = {}
114 |         for node in node_to_val_map:
115 |             feed_shapes[node] = node_to_val_map[node].shape
116 | 
117 |         # infer shape if feed_shapes changed since last run
118 |         # e.g. call run() on test data after training
119 |         if (not are_feed_shapes_equal(feed_shapes, self.feed_shapes)):
120 |             self.infer_shape(feed_shapes)
121 |             self.feed_shapes = feed_shapes
122 |             # plan memory if using GPU
123 |             if (not use_numpy):
124 |                 self.memory_plan(feed_shapes)
125 | 
126 |         # Traverse graph in topo order and compute values for all nodes.
127 |         for node in self.topo_order:
128 |             if node in node_to_val_map:
129 |                 # Skip placeholder nodes. Values already provided by feed_dict.
130 |                 continue
131 | 
132 |             # TODO (upul): following if condition looks like a hack. Find a better approach
133 |             if isinstance(node.op, PlaceholderOp) and node.const is not None:
134 |                 node_to_val_map[node] = node.const
135 |                 continue
136 | 
137 |             input_vals = [node_to_val_map[n] for n in node.inputs]
138 |             if use_numpy:
139 |                 node_val = np.empty(shape=self.node_to_shape_map[node])
140 |             else:
141 |                 node_val = self.node_to_arr_map[node]
142 |             # node_val is modified in-place whether np.ndarray or NDArray
143 |             node.op.compute(node, input_vals, node_val, use_numpy)
144 |             node_to_val_map[node] = node_val
145 | 
146 |         # Collect node values.
147 |         if not use_numpy and convert_to_numpy_ret_vals:
148 |             return [node_to_val_map[n].asnumpy() for n in self.eval_node_list]
149 | 
150 |         return [node_to_val_map[n] for n in self.eval_node_list]
151 | 
152 |     @staticmethod
153 |     def _are_feed_shapes_equal(sa, sb):
154 |         if (not isinstance(sa, dict)) or (not isinstance(sb, dict)):
155 |             return False
156 |         unmatched_items = set(sa.items()) ^ set(sb.items())
157 |         return len(unmatched_items) == 0
158 | 
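Because `run()` re-infers shapes (and, on the GPU, re-plans memory) only when the fed shapes change, reusing one `Executor` across a training loop is cheap. A small CPU-path sketch:

```python
import numpy as np
from aurora.autodiff import Variable, Executor

x = Variable(name='x')
y = x + 1.0

executor = Executor([y])                                # pass use_gpu=True to plan GPU buffers instead
out, = executor.run(feed_shapes={x: np.zeros((8, 4))})  # first call: shapes inferred
out, = executor.run(feed_shapes={x: np.ones((8, 4))})   # same shapes: cached shape map reused
out, = executor.run(feed_shapes={x: np.ones((2, 4))})   # new shapes: infer_shape runs again
```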
--------------------------------------------------------------------------------
/aurora/autodiff/gradients.py:
--------------------------------------------------------------------------------
1 | from .utils import sum_node_list
2 | from .utils import find_topo_sort
3 | from .autodiff import ones_like
4 | 
5 | # TODO: (upul) clean up and improve comments
6 | def gradients(output_node, node_list):
7 |     # a map from node to a list of gradient contributions from each output node
8 |     node_to_output_grads_list = {}
9 |     # Special note on initializing gradient of output_node as oneslike_op(output_node):
10 |     # We are really taking a derivative of the scalar reduce_sum(output_node)
11 |     # instead of the vector output_node. But this is the common case for loss functions.
12 |     node_to_output_grads_list[output_node] = [ones_like(output_node)]
13 |     # a map from node to the gradient of that node
14 |     node_to_output_grad = {}
15 |     # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
16 |     reverse_topo_order = reversed(find_topo_sort([output_node]))
17 |     for node in reverse_topo_order:
18 |         output_grad = sum_node_list(node_to_output_grads_list[node])
19 |         node_to_output_grad[node] = output_grad
20 | 
21 |         input_grads_list = node.op.gradient(node, output_grad)
22 |         for i in range(len(node.inputs)):
23 |             if node.inputs[i] not in node_to_output_grads_list:
24 |                 node_to_output_grads_list[node.inputs[i]] = []
25 |             node_to_output_grads_list[node.inputs[i]].append(input_grads_list[i])
26 | 
27 |     # Collect results for gradients requested.
28 | grad_node_list = [node_to_output_grad[node] for node in node_list] 29 | return grad_node_list 30 | -------------------------------------------------------------------------------- /aurora/autodiff/math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from aurora.autodiff.autodiff import Op 4 | 5 | 6 | class TanhOp(Op): 7 | """ 8 | Tanh Activation function 9 | 10 | """ 11 | 12 | def __call__(self, node_A): 13 | new_node = Op.__call__(self) 14 | new_node.inputs = [node_A] 15 | new_node.name = 'Tanh({0:s})'.format(node_A.name) 16 | return new_node 17 | 18 | def compute(self, node, input_vals, output_val, use_numpy=True): 19 | assert len(input_vals) == 1 20 | if use_numpy: 21 | output_val[:] = np.tanh(input_vals[0]) 22 | else: 23 | raise NotImplementedError('GPU version of TanhOp not yet implemented') 24 | 25 | def gradient(self, node, output_grads): 26 | x = node.inputs[0] 27 | g = 1 - (tanh(x) * tanh(x)) 28 | return [g * output_grads] 29 | 30 | def infer_shape(self, node, input_shapes): 31 | assert len(input_shapes) 32 | return input_shapes[0] 33 | 34 | 35 | # Global singleton operations 36 | tanh = TanhOp() 37 | 38 | # TODO: (upul) other basic math functions such as sin, cos, min, max, and etc 39 | -------------------------------------------------------------------------------- /aurora/autodiff/numerical_gradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .executor import Executor 3 | 4 | 5 | def eval_numerical_grad(f, feed_dict, wrt, h=1e-5): 6 | wrt_val = feed_dict[wrt] 7 | grad = np.zeros_like(wrt_val) 8 | 9 | it = np.nditer(wrt_val, flags=['multi_index'], op_flags=['readwrite']) 10 | while not it.finished: 11 | ix = it.multi_index 12 | old_val = wrt_val[ix] 13 | wrt_val[ix] = old_val + h 14 | executor = Executor([f]) 15 | feed_dict[wrt] = wrt_val 16 | 17 | result_plus, = executor.run(feed_shapes=feed_dict) 18 | wrt_val[ix] = old_val - h 19 | executor = Executor([f]) 20 | 21 | feed_dict[wrt] = wrt_val 22 | result_minus, = executor.run(feed_shapes=feed_dict) 23 | 24 | grad[ix] = np.sum((result_plus - result_minus) / (2.0 * h)) 25 | 26 | wrt_val[ix] = old_val 27 | feed_dict[wrt] = wrt_val 28 | it.iternext() 29 | return grad 30 | -------------------------------------------------------------------------------- /aurora/autodiff/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def find_topo_sort(node_list): 5 | """ 6 | 7 | :param node_list: 8 | :return: 9 | """ 10 | visited = set() 11 | topo_order = [] 12 | for node in node_list: 13 | depth_first_search(node, visited, topo_order) 14 | return topo_order 15 | 16 | 17 | def depth_first_search(node, visited, topo_order): 18 | """ 19 | 20 | :param node: 21 | :param visited: 22 | :param topo_order: 23 | :return: 24 | """ 25 | if node in visited: 26 | return 27 | visited.add(node) 28 | for n in node.inputs: 29 | depth_first_search(n, visited, topo_order) 30 | topo_order.append(node) 31 | 32 | 33 | def sum_node_list(node_list): 34 | """ 35 | Custom sum function in order to avoid 36 | create redundant nodes in Python sum implementation 37 | :param node_list: 38 | :return: 39 | """ 40 | from operator import add 41 | from functools import reduce 42 | return reduce(add, node_list) 43 | -------------------------------------------------------------------------------- /aurora/datasets/__init__.py: 
--------------------------------------------------------------------------------
1 | from .synthetic import spiral
2 | from .mnist import MNIST
3 | 
4 | __all__ = ['spiral', 'MNIST']
--------------------------------------------------------------------------------
/aurora/datasets/data/mnist.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/aurora/datasets/data/mnist.pkl.gz
--------------------------------------------------------------------------------
/aurora/datasets/mnist.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gzip
3 | import pickle
4 | import os
5 | 
6 | 
7 | class MNIST:
8 |     def __init__(self, batch_size):
9 |         self.batch_size = batch_size
10 | 
11 |         train, valid, test = self._load_data()
12 |         self.X_train, self.y_train = train[0], train[1]
13 | 
14 |         # encoding y_train using one-hot encoding
15 |         self.y_train_one_hot = np.zeros((self.y_train.shape[0], 10))
16 |         self.y_train_one_hot[np.arange(self.y_train.shape[0]), self.y_train] = 1
17 | 
18 |         self.X_valid, self.y_valid = valid[0], valid[1]
19 |         self.X_test, self.y_test = test[0], test[1]
20 | 
21 |     def train_batch_generator(self):
22 |         while True:
23 |             rand_indices = np.random.choice(self.X_train.shape[0], self.batch_size, False)
24 |             yield self.X_train[rand_indices], self.y_train_one_hot[rand_indices]
25 | 
26 |     def validation(self):
27 |         return self.X_valid, self.y_valid
28 | 
29 |     def testing(self):
30 |         return self.X_test, self.y_test
31 | 
32 |     def num_features(self):
33 |         return self.X_train.shape[1]
34 | 
35 |     def _load_data(self):
36 |         script_dir = os.path.dirname(__file__)
37 |         mnist_file = os.path.join(os.path.join(script_dir, 'data'), 'mnist.pkl.gz')
38 | 
39 |         with gzip.open(mnist_file, 'rb') as mnist_file:
40 |             u = pickle._Unpickler(mnist_file)
41 |             u.encoding = 'latin1'
42 |             train, val, test = u.load()
43 |         return train, val, test
--------------------------------------------------------------------------------
/aurora/datasets/synthetic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | # Generates `num_cls` interleaved spirals of `points_per_cls` two-dimensional points each.
5 | def spiral(num_cls=3, dim=2, points_per_cls=100, rnd_state=1024):
6 |     np.random.seed(rnd_state)
7 |     X_data = np.zeros((points_per_cls * num_cls, dim))
8 |     y_data = np.zeros(points_per_cls * num_cls, dtype='uint8')
9 |     for j in range(num_cls):
10 |         ix = range(points_per_cls * j, points_per_cls * (j + 1))
11 |         r = np.linspace(0.0, 1, points_per_cls)
12 |         t = np.linspace(j * 4, (j + 1) * 4, points_per_cls) + np.random.randn(points_per_cls) * 0.2  # theta
13 |         X_data[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
14 |         y_data[ix] = j
15 | 
16 |     y_data_encoded = np.zeros((points_per_cls * num_cls, num_cls))
17 |     y_data_encoded[range(points_per_cls * num_cls), y_data] = 1
18 |     return X_data, y_data, y_data_encoded
19 | 
--------------------------------------------------------------------------------
/aurora/ndarray/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ndarray
2 | from . 
import gpu_op 3 | 4 | __all__ = ['ndarray', 'gpu_op'] -------------------------------------------------------------------------------- /aurora/ndarray/_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name 3 | """ ctypes library of dlsys and helper functions """ 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import ctypes 8 | from pathlib import Path 9 | 10 | 11 | def _load_lib(): 12 | """Load libary in build/lib.""" 13 | lib_root = Path(__file__).parents[2] 14 | lib_path = os.path.join(lib_root, 'cuda/build/lib/') 15 | path_to_so_file = os.path.join(lib_path, "libc_runtime_api.so") 16 | lib = ctypes.CDLL(path_to_so_file, ctypes.RTLD_GLOBAL) 17 | return lib 18 | 19 | 20 | # global library instance 21 | try: 22 | _LIB = _load_lib() 23 | except: 24 | # TODO: (upul) Do we need to log the error message? 25 | pass 26 | 27 | 28 | ################## 29 | # Helper Methods # 30 | ################## 31 | 32 | def check_call(ret): 33 | """Check the return value of C API call 34 | 35 | This function will crash when error occurs. 36 | Wrap every API call with this function 37 | 38 | Parameters 39 | ---------- 40 | ret : int 41 | return value from API calls 42 | """ 43 | assert (ret == 0) 44 | 45 | 46 | def c_array(ctype, values): 47 | """Create ctypes array from a python array 48 | 49 | Parameters 50 | ---------- 51 | ctype : ctypes data type 52 | data type of the array we want to convert to 53 | 54 | values : tuple or list 55 | data content 56 | 57 | Returns 58 | ------- 59 | out : ctypes array 60 | Created ctypes array 61 | """ 62 | return (ctype * len(values))(*values) 63 | -------------------------------------------------------------------------------- /aurora/ndarray/gpu_op.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from ._base import _LIB 5 | from . 
import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float(value)) 11 | 12 | 13 | def broadcast_to(in_arr, out_arr): 14 | assert isinstance(in_arr, _nd.NDArray) 15 | assert isinstance(out_arr, _nd.NDArray) 16 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle) 17 | 18 | 19 | def reduce_sum_axis_zero(in_arr, out_arr): 20 | assert isinstance(in_arr, _nd.NDArray) 21 | assert isinstance(out_arr, _nd.NDArray) 22 | _LIB.DLGpuReduceSumAxisZero(in_arr.handle, out_arr.handle) 23 | 24 | 25 | def matrix_elementwise_add(matA, matB, matC): 26 | assert isinstance(matA, _nd.NDArray) 27 | assert isinstance(matB, _nd.NDArray) 28 | assert isinstance(matC, _nd.NDArray) 29 | _LIB.DLGpuMatrixElementwiseAdd(matA.handle, matB.handle, matC.handle) 30 | 31 | 32 | def matrix_elementwise_add_by_const(in_mat, val, out_mat): 33 | assert isinstance(in_mat, _nd.NDArray) 34 | assert isinstance(out_mat, _nd.NDArray) 35 | _LIB.DLGpuMatrixElementwiseAddByConst( 36 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 37 | 38 | 39 | def matrix_elementwise_subtract(matA, matB, matC): 40 | assert isinstance(matA, _nd.NDArray) 41 | assert isinstance(matB, _nd.NDArray) 42 | assert isinstance(matC, _nd.NDArray) 43 | _LIB.DLGpuMatrixElementwiseSubtract(matA.handle, matB.handle, matC.handle) 44 | 45 | 46 | def matrix_elementwise_subtract_by_const(in_mat, val, out_mat): 47 | assert isinstance(in_mat, _nd.NDArray) 48 | assert isinstance(out_mat, _nd.NDArray) 49 | _LIB.DLGpuMatrixElementwiseSubtractByConst( 50 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 51 | 52 | 53 | def matrix_elementwise_multiply(matA, matB, matC): 54 | assert isinstance(matA, _nd.NDArray) 55 | assert isinstance(matB, _nd.NDArray) 56 | assert isinstance(matC, _nd.NDArray) 57 | _LIB.DLGpuMatrixElementwiseMultiply( 58 | matA.handle, matB.handle, matC.handle) 59 | 60 | 61 | def matrix_elementwise_multiply_by_const(in_mat, val, out_mat): 62 | assert isinstance(in_mat, _nd.NDArray) 63 | assert isinstance(out_mat, _nd.NDArray) 64 | _LIB.DLGpuMatrixMultiplyByConst( 65 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 66 | 67 | 68 | def matrix_elementwise_division(matA, matB, matC): 69 | assert isinstance(matA, _nd.NDArray) 70 | assert isinstance(matB, _nd.NDArray) 71 | assert isinstance(matC, _nd.NDArray) 72 | _LIB.DLGpuMatrixElementwiseDiv( 73 | matA.handle, matB.handle, matC.handle) 74 | 75 | 76 | def matrix_elementwise_div_by_const(in_mat, val, out_mat): 77 | assert isinstance(in_mat, _nd.NDArray) 78 | assert isinstance(out_mat, _nd.NDArray) 79 | _LIB.DLGpuMatrixElementwiseDivByConst( 80 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 81 | 82 | 83 | def matrix_elementwise_sqrt(in_mat, out_mat): 84 | assert isinstance(in_mat, _nd.NDArray) 85 | assert isinstance(out_mat, _nd.NDArray) 86 | _LIB.DLGpuMatrixElementwiseSqrt(in_mat.handle, out_mat.handle) 87 | 88 | 89 | def matrix_multiply(matA, transA, matB, transB, matC): 90 | assert isinstance(matA, _nd.NDArray) 91 | assert isinstance(matB, _nd.NDArray) 92 | assert isinstance(matC, _nd.NDArray) 93 | _LIB.DLGpuMatrixMultiply( 94 | matA.handle, transA, matB.handle, transB, matC.handle) 95 | 96 | 97 | def relu(in_arr, out_arr): 98 | assert isinstance(in_arr, _nd.NDArray) 99 | assert isinstance(out_arr, _nd.NDArray) 100 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle) 101 | 102 | 103 | def relu_gradient(in_arr, in_grad_arr, out_arr): 104 | assert isinstance(in_arr, _nd.NDArray) 105 | assert isinstance(in_grad_arr, 
_nd.NDArray)
106 |     assert isinstance(out_arr, _nd.NDArray)
107 |     _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle)
108 |
109 |
110 | def softmax(in_arr, out_arr):
111 |     assert isinstance(in_arr, _nd.NDArray)
112 |     assert isinstance(out_arr, _nd.NDArray)
113 |     _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle)
114 |
115 |
116 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr):
117 |     assert isinstance(in_arr_a, _nd.NDArray)
118 |     assert isinstance(in_arr_b, _nd.NDArray)
119 |     assert isinstance(out_arr, _nd.NDArray)
120 |     _LIB.DLGpuSoftmaxCrossEntropy(
121 |         in_arr_a.handle, in_arr_b.handle, out_arr.handle)
122 |
123 |
124 | def cudnn_relu_forward(in_array, out_array):
125 |     assert isinstance(in_array, _nd.NDArray)
126 |     assert isinstance(out_array, _nd.NDArray)
127 |     _LIB.cudnnReLUForward(in_array.handle, out_array.handle)
128 |
129 |
130 | def cudnn_conv2d_forward(input, filter, bias, stride_height, stride_width, padding_height, padding_width, output):
131 |     assert isinstance(input, _nd.NDArray)
132 |     assert isinstance(filter, _nd.NDArray)
133 |     assert isinstance(bias, _nd.NDArray)
134 |     assert isinstance(stride_height, int)
135 |     assert isinstance(stride_width, int)
136 |     assert isinstance(padding_height, int)
137 |     assert isinstance(padding_width, int)
138 |     assert isinstance(output, _nd.NDArray)
139 |     _LIB.cudnnConv2DForward(input.handle, filter.handle,
140 |                             bias.handle,
141 |                             stride_height, stride_width,
142 |                             padding_height, padding_width,
143 |                             output.handle)
144 |
145 |
146 | def cudnn_pool_forward(input,
147 |                        pooling_height, pooling_width,
148 |                        stride_height, stride_width,
149 |                        mode,
150 |                        output):
151 |     assert isinstance(input, _nd.NDArray)
152 |     assert isinstance(stride_height, int)
153 |     assert isinstance(stride_width, int)
154 |     assert isinstance(pooling_height, int)
155 |     assert isinstance(pooling_width, int)
156 |     assert isinstance(mode, str)
157 |     assert isinstance(output, _nd.NDArray)
158 |
159 |     mode = mode.encode('utf-8')
160 |
161 |     _LIB.cudnnPoolForward(input.handle,
162 |                           pooling_height, pooling_width,  # pooling sizes come before strides, per cudnnPoolForward in c_runtime_api.h
163 |                           stride_height, stride_width,
164 |                           ctypes.c_char_p(mode),
165 |                           output.handle)
166 |
167 |
168 | def cudnn_pool_backward(input,
169 |                         output_grads,
170 |                         output,
171 |                         pooling_height, pooling_width,
172 |                         stride_height, stride_width,
173 |                         mode,
174 |                         pool_grad):
175 |     assert isinstance(input, _nd.NDArray)
176 |     assert isinstance(output_grads, _nd.NDArray)
177 |     assert isinstance(output, _nd.NDArray)
178 |     assert isinstance(pool_grad, _nd.NDArray)
179 |
180 |     assert isinstance(pooling_height, int)
181 |     assert isinstance(pooling_width, int)
182 |     assert isinstance(stride_height, int)
183 |     assert isinstance(stride_width, int)
184 |
185 |     mode = mode.encode('utf-8')
186 |
187 |     _LIB.cudnnPoolBackward(input.handle,
188 |                            output_grads.handle,
189 |                            output.handle,
190 |                            pooling_height, pooling_width,
191 |                            stride_height, stride_width,
192 |                            ctypes.c_char_p(mode),
193 |                            pool_grad.handle)
194 |
195 |
196 | def cudnn_conv2d_backward_filter(input,
197 |                                  output_grads,
198 |                                  stride_height,
199 |                                  stride_width,
200 |                                  padding_height,
201 |                                  padding_width,
202 |                                  filter_grad):
203 |     assert isinstance(input, _nd.NDArray)
204 |     assert isinstance(output_grads, _nd.NDArray)
205 |     assert isinstance(stride_height, int)
206 |     assert isinstance(stride_width, int)
207 |     assert isinstance(padding_height, int)
208 |     assert isinstance(padding_width, int)
209 |     assert isinstance(filter_grad, _nd.NDArray)
210 |     _LIB.cudnnConv2DBackwardFilter(input.handle,
211 |                                    output_grads.handle,
212 |                                    stride_height, stride_width,
213 |                                    padding_height, padding_width,
214 |                                    filter_grad.handle)
215 |
216 |
217 | def cudnn_conv2d_backward_data(filter,
218 |                                output_grad,
219 |                                stride_height,
220 |                                stride_width,
221 |                                padding_height,
222 |                                padding_width,
223 |                                data_grad):
224 |     assert isinstance(filter, _nd.NDArray)
225 |     assert isinstance(output_grad, _nd.NDArray)
226 |     assert isinstance(stride_height, int)
227 |     assert isinstance(stride_width, int)
228 |     assert isinstance(padding_height, int)
229 |     assert isinstance(padding_width, int)
230 |     assert isinstance(data_grad, _nd.NDArray)
231 |     _LIB.cudnnConv2DBackwardData(filter.handle,
232 |                                  output_grad.handle,
233 |                                  stride_height,
234 |                                  stride_width,
235 |                                  padding_height,
236 |                                  padding_width,
237 |                                  data_grad.handle)
238 |
239 |
240 | def cudnn_conv2d_backward_bias(output_grads, bias_grads):
241 |     assert isinstance(output_grads, _nd.NDArray)
242 |     assert isinstance(bias_grads, _nd.NDArray)
243 |     _LIB.cudnnConv2DBackwardBias(output_grads.handle, bias_grads.handle)
244 |
--------------------------------------------------------------------------------
/aurora/ndarray/ndarray.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | from ._base import _LIB, check_call, c_array
4 | from . import ndarray as _nd
5 | import ctypes
6 | import numpy as np
7 |
8 |
9 | class DLContext(ctypes.Structure):
10 |     """DL context structure."""
11 |     _fields_ = [("device_id", ctypes.c_int),
12 |                 ("device_type", ctypes.c_int)]
13 |
14 |     MASK2STR = {
15 |         1: 'cpu',
16 |         2: 'gpu',
17 |     }
18 |
19 |     def __init__(self, device_id, device_type):
20 |         super(DLContext, self).__init__()
21 |         self.device_id = device_id
22 |         self.device_type = device_type
23 |
24 |     def __repr__(self):
25 |         return "%s(%d)" % (
26 |             DLContext.MASK2STR[self.device_type], self.device_id)
27 |
28 |
29 | class DLArray(ctypes.Structure):
30 |     """DLArray in C API"""
31 |     _fields_ = [("data", ctypes.c_void_p),
32 |                 ("ctx", DLContext),
33 |                 ("ndim", ctypes.c_int),
34 |                 ("shape", ctypes.POINTER(ctypes.c_int64))]
35 |
36 |
37 | DLArrayHandle = ctypes.POINTER(DLArray)
38 |
39 |
40 | def cpu(dev_id=0):
41 |     """Construct a CPU device
42 |     Parameters
43 |     ----------
44 |     dev_id : int, optional
45 |         The integer device id
46 |     """
47 |     return DLContext(dev_id, 1)
48 |
49 |
50 | def gpu(dev_id=0):
51 |     """Construct a GPU device
52 |     Parameters
53 |     ----------
54 |     dev_id : int, optional
55 |         The integer device id
56 |     """
57 |     return DLContext(dev_id, 2)
58 |
59 |
60 | def is_gpu_ctx(ctx):
61 |     """Return whether the context is a GPU context.
62 |     Parameters
63 |     ----------
64 |     ctx : DLContext
65 |         The query context
66 |     """
67 |     return ctx and ctx.device_type == 2
68 |
69 |
70 | class NDArray(object):
71 |     """Lightweight NDArray class of DL runtime.
72 |     Strictly speaking, this is only an array container (a buffer object);
73 |     no arithmetic operations are defined.
74 |     """
75 |     __slots__ = ["handle"]
76 |
77 |     # pylint: disable=no-member
78 |     def __init__(self, handle):
79 |         """Initialize the function with handle
80 |         Parameters
81 |         ----------
82 |         handle : DLArrayHandle
83 |             the handle to the underlying C++ DLArray
84 |         """
85 |         self.handle = handle
86 |
87 |     def __del__(self):
88 |         check_call(_LIB.DLArrayFree(self.handle))
89 |
90 |     @property
91 |     def shape(self):
92 |         """Shape of this array"""
93 |         return tuple(self.handle.contents.shape[i]
94 |                      for i in range(self.handle.contents.ndim))
95 |
96 |     @property
97 |     def ctx(self):
98 |         """context of this array"""
99 |         return self.handle.contents.ctx
100 |
101 |     def __setitem__(self, in_slice, value):
102 |         """Set ndarray value"""
103 |         if (not isinstance(in_slice, slice) or
104 |                 in_slice.start is not None
105 |                 or in_slice.stop is not None):
106 |             raise ValueError('Array only supports whole-array assignment (arr[:] = value)')
107 |         if isinstance(value, NDArray):
108 |             if value.handle is not self.handle:
109 |                 value.copyto(self)
110 |         elif isinstance(value, (np.ndarray, np.generic)):
111 |             self._sync_copyfrom(value)
112 |         else:
113 |             raise TypeError('type %s not supported' % str(type(value)))
114 |
115 |     def _sync_copyfrom(self, source_array):
116 |         """Perform a synchronous copy from the array.
117 |         Parameters
118 |         ----------
119 |         source_array : array_like
120 |             The data source we would like to copy from.
121 |         """
122 |         if not isinstance(source_array, np.ndarray):
123 |             try:
124 |                 source_array = np.array(source_array, dtype=np.float32)
125 |             except Exception:
126 |                 raise TypeError('array must be an array_like data, '
127 |                                 'type %s is not supported'
128 |                                 % str(type(source_array)))
129 |         source_array = np.ascontiguousarray(source_array, dtype=np.float32)
130 |         if source_array.shape != self.shape:
131 |             raise ValueError('array shape does not match the shape of the NDArray')
132 |         source_arr, shape = NDArray._numpyasarray(source_array)
133 |         check_call(_LIB.DLArrayCopyFromTo(
134 |             ctypes.byref(source_arr), self.handle, None))
135 |         # keep `shape` alive so it is not garbage-collected before the copy completes
136 |         _ = shape
137 |
138 |     @staticmethod
139 |     def _numpyasarray(np_data):
140 |         """Return a DLArray representation of a numpy array."""
141 |         data = np_data
142 |         assert data.flags['C_CONTIGUOUS']
143 |         arr = DLArray()
144 |         shape = c_array(ctypes.c_int64, data.shape)
145 |         arr.data = data.ctypes.data_as(ctypes.c_void_p)
146 |         arr.shape = shape
147 |         arr.ndim = data.ndim
148 |         # CPU device
149 |         arr.ctx = cpu(0)
150 |         return arr, shape
151 |
152 |     def asnumpy(self):
153 |         """Convert this array to numpy array
154 |         Returns
155 |         -------
156 |         np_arr : numpy.ndarray
157 |             The corresponding numpy array.
158 |         """
159 |         np_arr = np.empty(self.shape, dtype=np.float32)
160 |         arr, shape = NDArray._numpyasarray(np_arr)
161 |         check_call(_LIB.DLArrayCopyFromTo(
162 |             self.handle, ctypes.byref(arr), None))
163 |         _ = shape
164 |         return np_arr
165 |
166 |     def copyto(self, target):
167 |         """Copy this array to the target
168 |         Parameters
169 |         ----------
170 |         target : NDArray
171 |             The target array to be copied to; must have the same shape as this array.
172 |         """
173 |         if isinstance(target, DLContext):
174 |             target = empty(self.shape, target)
175 |         if isinstance(target, NDArray):
176 |             check_call(_LIB.DLArrayCopyFromTo(
177 |                 self.handle, target.handle, None))
178 |         else:
179 |             raise ValueError("Unsupported target type %s" % str(type(target)))
180 |         return target
181 |
182 |
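# A minimal usage sketch (illustrative only; `_ndarray_usage_example` is not
# part of Aurora's API). It round-trips a numpy array through NDArray via the
# module-level helpers defined below; the gpu(0) branch assumes the CUDA
# runtime under cuda/ has been built.
def _ndarray_usage_example(check_gpu=False):
    a = np.random.randn(2, 3).astype(np.float32)
    arr = array(a, ctx=cpu(0))               # host numpy -> NDArray copy
    assert arr.shape == (2, 3)
    assert np.allclose(arr.asnumpy(), a)     # NDArray -> host copy
    if check_gpu:
        gpu_arr = arr.copyto(gpu(0))         # host -> device copy
        assert np.allclose(gpu_arr.asnumpy(), a)
    return arr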
183 | def array(arr, ctx=cpu(0)):
184 |     """Create an array from source arr.
185 |     Parameters
186 |     ----------
187 |     arr : numpy.ndarray
188 |         The array to be copied from
189 |     ctx : DLContext, optional
190 |         The device context to create the array
191 |     Returns
192 |     -------
193 |     ret : NDArray
194 |         The created array
195 |     """
196 |     if not isinstance(arr, np.ndarray):
197 |         arr = np.array(arr)
198 |     ret = empty(arr.shape, ctx)
199 |     ret._sync_copyfrom(arr)
200 |     return ret
201 |
202 |
203 | def empty(shape, ctx=cpu(0)):
204 |     """Create an empty array given shape and device
205 |     Parameters
206 |     ----------
207 |     shape : tuple of int
208 |         The shape of the array
209 |     ctx : DLContext
210 |         The context of the array
211 |     Returns
212 |     -------
213 |     arr : NDArray
214 |         The allocated, uninitialized array.
215 |     """
216 |     shape = c_array(ctypes.c_int64, shape)
217 |     ndim = ctypes.c_int(len(shape))
218 |     handle = DLArrayHandle()
219 |     check_call(_LIB.DLArrayAlloc(
220 |         shape, ndim, ctx, ctypes.byref(handle)))
221 |     return NDArray(handle)
222 |
223 |
224 | def reshape(arr, new_shape):
225 |     assert isinstance(arr, _nd.NDArray)
226 |     # TODO (upul): check total number of elements match ...
227 |     shape = c_array(ctypes.c_int64, new_shape)
228 |     new_dim = len(new_shape)
229 |     handle = arr.handle
230 |     check_call(_LIB.DLArrayReshape(handle, shape, new_dim))
--------------------------------------------------------------------------------
/aurora/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .activations import relu
2 | from .activations import sigmoid
3 | from .activations import softmax
4 | from .loss_functions import softmax_cross_entropy_with_logits
5 | from .utils import softmax_func
6 | from .conv import conv2d
7 | from .pooling import maxPool
8 |
9 | __all__ = ['relu', 'sigmoid', 'softmax', 'softmax_cross_entropy_with_logits',
10 |            'softmax_func', 'conv2d', 'maxPool']
--------------------------------------------------------------------------------
/aurora/nn/activations.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from aurora.autodiff.autodiff import Op
3 | from aurora.nn.utils import softmax_func
4 | try:
5 |     from aurora.ndarray import gpu_op, ndarray
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class ReluOp(Op):
11 |     def __call__(self, node_A):
12 |         new_node = Op.__call__(self)
13 |         new_node.inputs = [node_A]
14 |         new_node.name = "Relu(%s)" % (node_A.name)
15 |         return new_node
16 |
17 |     def compute(self, node, input_vals, output_val, use_numpy=True):
18 |         assert len(input_vals) == 1
19 |         if use_numpy:
20 |             output_val[:] = np.maximum(input_vals[0], 0)
21 |         else:
22 |             gpu_op.relu(input_vals[0], output_val)
23 |
24 |     def gradient(self, node, output_grad):
25 |         return [relu_grad(node.inputs[0], output_grad)]
26 |
27 |     def infer_shape(self, node, input_shapes):
28 |         assert len(input_shapes) == 1
29 |         return input_shapes[0]
30 |
31 |
32 | class ReluGradientOp(Op):
33 |     def __call__(self, node_A, node_B):
34 |         """node_B is output_grad"""
35 |         new_node = Op.__call__(self)
36 |         new_node.inputs = [node_A, node_B]
37 |         new_node.name = "ReluGradient(%s)" % (node_A.name)
38 |         return new_node
39 |
40 |     def compute(self, node, input_vals, output_val, use_numpy=True):
41 |         assert len(input_vals) == 2
42 |         if use_numpy:
43 |             output_val[:] = np.sign(np.maximum(input_vals[0], 0)) * input_vals[1]
44 |         else:
45 |             gpu_op.relu_gradient(input_vals[0], input_vals[1], output_val)
46 |
47 |     def gradient(self, node, output_grad):
48 |         raise NotImplementedError('Gradient of ReluGradientOp not implemented')
49 |
50 |     def infer_shape(self, node, input_shapes):
51 |         assert len(input_shapes) == 2
52 |         assert input_shapes[0] == input_shapes[1]
53 |         return input_shapes[0]
54 |
55 |
56 | class SigmoidOp(Op):
57 |     def __call__(self, node_A):
58 |         new_node = Op.__call__(self)
59 |         new_node.inputs = [node_A]
60 |         new_node.name = 'Sigmoid({0:s})'.format(node_A.name)
61 |         return new_node
62 |
63 |     def compute(self, node, input_vals, output_val, use_numpy=True):
64 |         """
65 |         This function calculates the sigmoid of input_vals[0].
66 |         The naive implementation (1/(1 + exp(-x))) is not numerically stable.
67 |         Hence we use the identity:
68 |             tanh(x) = (exp(x) - exp(-x))/(exp(x) + exp(-x))
69 |                     = 2*sigmoid(2*x) - 1
70 |         and therefore:
71 |             sigmoid(x) = 0.5 + 0.5*tanh(0.5*x)
72 |         :param node:
73 |         :param input_vals:
74 |         :param output_val:
75 |         :param use_numpy:
76 |         :return:
77 |         """
78 |         assert len(input_vals) == 1
79 |         if use_numpy:
80 |             output_val[:] = 0.5 + 0.5*np.tanh(0.5*input_vals[0])
81 |         else:
82 |             raise NotImplementedError('GPU version not yet implemented')
83 |
84 |     def gradient(self, node, output_grads):
85 |         x = node.inputs[0]
86 |         # g = sigmoid(x) * (1 - sigmoid(x))
87 |         # TODO: (upul) the above g failed in unit testing; need to check it.
88 |         g = sigmoid(x) - sigmoid(x) * sigmoid(x)
89 |         return [g * output_grads]
90 |
91 |     def infer_shape(self, node, input_shapes):
92 |         assert len(input_shapes) == 1
93 |         return input_shapes[0]
94 |
95 |
96 | class SoftmaxOp(Op):
97 |     def __call__(self, node_A):
98 |         new_node = Op.__call__(self)
99 |         new_node.inputs = [node_A]
100 |         new_node.name = 'SoftmaxOp({0:s})'.format(node_A.name)
101 |         return new_node
102 |
103 |     def compute(self, node, input_vals, output_val, use_numpy=True):
104 |         assert len(input_vals) == 1
105 |         if use_numpy:
106 |             output_val[:] = softmax_func(input_vals[0])
107 |         else:
108 |             gpu_op.softmax(input_vals[0], output_val)
109 |
110 |     def gradient(self, node, output_grads):
111 |         raise NotImplementedError('Not yet implemented; please use the CrossEntropy operator')
112 |
113 |     def infer_shape(self, node, input_shapes):
114 |         assert len(input_shapes) == 1
115 |         return input_shapes[0]
116 |
117 |
118 | # TODO (upul): Other commonly used activation functions
119 |
120 | # Global singleton operators
121 | relu = ReluOp()
122 | relu_grad = ReluGradientOp()
123 | sigmoid = SigmoidOp()
124 | softmax = SoftmaxOp()
125 |
--------------------------------------------------------------------------------
/aurora/nn/conv.py:
--------------------------------------------------------------------------------
1 | from aurora.autodiff.autodiff import Op
2 | from aurora.nn.pyx.im2col import im2col, col2im
3 | try:
4 |     from aurora.ndarray import gpu_op, ndarray
5 | except ImportError:
6 |     pass
7 |
8 |
9 | # TODO: (upul) In the numpy version of Conv2dOp, X_col is calculated twice:
10 | # once in compute() of Conv2dOp and a second time inside compute() of the
11 | # Conv2dBackwardFilter node. Check the feasibility of caching; see the sketch below.
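# One possible caching scheme (an untested sketch, not part of Aurora):
# memoise X_col on the forward node, keyed by the data buffer identity and
# the convolution geometry, so Conv2dGradientFilter could reuse it instead
# of calling im2col() a second time. `_cached_im2col` and `_im2col_cache`
# are names invented for this sketch only.
def _cached_im2col(node, X, filter_height, filter_width,
                   padding_height, padding_width,
                   stride_height, stride_width):
    key = (id(X), filter_height, filter_width,
           padding_height, padding_width, stride_height, stride_width)
    cached = getattr(node, '_im2col_cache', None)
    if cached is None or cached[0] != key:
        node._im2col_cache = (key, im2col(X, filter_height, filter_width,
                                          padding_height, padding_width,
                                          stride_height, stride_width))
    return node._im2col_cache[1]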
12 |
13 | class Conv2dOp(Op):
14 |     def __call__(self, input, filter, bias, strides=(1, 1), padding=(0, 0)):
15 |         new_node = Op.__call__(self)
16 |         # input: 4-D data, (batch_size, depth, height, width)
17 |         # filter: 4-D kernel (num_filters, depth, kernel_height, kernel_width)
18 |         new_node.inputs = [input, filter, bias]
19 |         new_node.strides = strides
20 |         new_node.padding = padding
21 |         new_node.name = 'Conv2d({0:s}, {1:s})'.format(input.name, filter.name)
22 |         return new_node
23 |
24 |     def compute(self, node, input_vals, output_val, use_numpy=True):
25 |         assert len(input_vals) == 3
26 |
27 |         X = input_vals[0]
28 |         h = X.shape[2]
29 |         w = X.shape[3]
30 |         batch_size = X.shape[0]
31 |
32 |         W = input_vals[1]
33 |         filter_height = W.shape[2]
34 |         filter_width = W.shape[3]
35 |         n_filters = W.shape[0]
36 |
37 |         b = input_vals[2]
38 |
39 |         padding_height = node.padding[0]
40 |         padding_width = node.padding[1]
41 |         stride_height = node.strides[0]
42 |         stride_width = node.strides[1]
43 |
44 |         if use_numpy:
45 |             b = b.reshape(n_filters, -1)
46 |             h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
47 |             w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
48 |             X_col = im2col(X, filter_height, filter_width, padding_height, padding_width,
49 |                            stride_height, stride_width)
50 |             W_col = W.reshape(n_filters, -1)
51 |             out = W_col @ X_col + b
52 |             out = out.reshape(n_filters, h_new, w_new, batch_size)
53 |             output_val[:] = out.transpose(3, 0, 1, 2)
54 |         else:
55 |             gpu_op.cudnn_conv2d_forward(X, W, b, stride_height, stride_width,
56 |                                         padding_height, padding_width, output_val)
57 |
58 |     def gradient(self, node, output_grads):
59 |         # gradients w.r.t. data, filter and bias, in the order of node.inputs
60 |         filter_node = node.inputs[1]
61 |         data_node = node.inputs[0]
62 |         return [conv2dBackData(data_node, filter_node, output_grads, node.strides, node.padding),
63 |                 conv2dBackFilter(data_node, filter_node, output_grads, node.strides, node.padding),
64 |                 conv2dBackBias(output_grads)]
65 |
66 |     def infer_shape(self, node, input_shapes):
67 |         assert len(input_shapes) == 3
68 |
69 |         X_shape = input_shapes[0]
70 |         h = X_shape[2]
71 |         w = X_shape[3]
72 |
73 |         W_shape = input_shapes[1]
74 |         filter_height = W_shape[2]
75 |         filter_width = W_shape[3]
76 |
77 |         padding_height = node.padding[0]
78 |         padding_width = node.padding[1]
79 |         stride_height = node.strides[0]
80 |         stride_width = node.strides[1]
81 |
82 |         h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
83 |         w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
84 |         d_new = W_shape[0]
85 |         batch_size = X_shape[0]
86 |         return batch_size, d_new, h_new, w_new
87 |
88 |
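# Worked example of the shape rule in infer_shape above (illustrative only;
# `_conv2d_output_shape` is a helper invented for this sketch): a
# (32, 3, 28, 28) batch convolved with 8 filters of size 3x3, stride 1 and
# padding 1 keeps the spatial size, since (28 - 3 + 2*1)/1 + 1 = 28, giving
# an output of shape (32, 8, 28, 28).
def _conv2d_output_shape(batch_size, n_filters, h, w, filter_height, filter_width,
                         strides=(1, 1), padding=(0, 0)):
    h_new = (h - filter_height + 2 * padding[0]) // strides[0] + 1
    w_new = (w - filter_width + 2 * padding[1]) // strides[1] + 1
    return batch_size, n_filters, h_new, w_new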
89 | class Conv2dGradientFilter(Op):
90 |     def __call__(self, node_A, node_B, output_grad, strides=(1, 1), padding=(0, 0)):
91 |         new_node = Op.__call__(self)
92 |         new_node.inputs = [node_A, node_B, output_grad]
93 |         new_node.strides = strides
94 |         new_node.padding = padding
95 |         new_node.name = "Conv2dBackwardFilter(%s, %s)" % (node_A.name, node_B.name)
96 |         return new_node
97 |
98 |     def compute(self, node, input_vals, output_val, use_numpy=True):
99 |         assert len(input_vals) == 3
100 |
101 |         X = input_vals[0]  # data
102 |         W = input_vals[1]  # filter
103 |
104 |         assert len(X.shape) == 4
105 |         assert len(W.shape) == 4
106 |
107 |         filter_height = W.shape[2]
108 |         filter_width = W.shape[3]
109 |         n_filters = W.shape[0]
110 |         out_grad = input_vals[2]
111 |
112 |         padding_height = node.padding[0]
113 |         padding_width = node.padding[1]
114 |         stride_height = node.strides[0]
115 |         stride_width = node.strides[1]
116 |
117 |         if use_numpy:
118 |             X_col = im2col(X, filter_height, filter_width, padding_height, padding_width,
119 |                            stride_height, stride_width)
120 |             dout_reshaped = out_grad.transpose(1, 2, 3, 0).reshape(n_filters, -1)
121 |             dW = dout_reshaped @ X_col.T
122 |             output_val[:] = dW.reshape(W.shape)
123 |
124 |         else:
125 |             gpu_op.cudnn_conv2d_backward_filter(X, out_grad, stride_height, stride_width,
126 |                                                 padding_height, padding_width, output_val)
127 |
128 |     def gradient(self, node, output_grads):
129 |         raise NotImplementedError('Gradient of Conv2dGradientFilter not implemented')
130 |
131 |     def infer_shape(self, node, input_shapes):
132 |         assert len(input_shapes) == 3
133 |         W_size = input_shapes[1]
134 |         return W_size
135 |
136 |
137 | class Conv2dGradientData(Op):
138 |     def __call__(self, node_A, node_B, output_grad, strides=(1, 1), padding=(0, 0)):
139 |         new_node = Op.__call__(self)
140 |         new_node.inputs = [node_A, node_B, output_grad]
141 |         new_node.strides = strides
142 |         new_node.padding = padding
143 |         new_node.name = "Conv2dBackwardData(%s, %s)" % (node_A.name, node_B.name)
144 |         return new_node
145 |
146 |     def compute(self, node, input_vals, output_val, use_numpy=True):
147 |         assert len(input_vals) == 3
148 |         X = input_vals[0]  # data
149 |         W = input_vals[1]  # filter
150 |         output_grads = input_vals[2]
151 |
152 |         assert len(X.shape) == 4
153 |         assert len(W.shape) == 4
154 |
155 |         filter_height = W.shape[2]
156 |         filter_width = W.shape[3]
157 |         n_filters = W.shape[0]
158 |
159 |         padding_height, padding_width = node.padding
160 |         stride_height, stride_width = node.strides
161 |
162 |         if use_numpy:
163 |             W_reshape = W.reshape(n_filters, -1)
164 |             dout_reshaped = input_vals[2].transpose(1, 2, 3, 0).reshape(n_filters, -1)
165 |
166 |             dX_col = W_reshape.T @ dout_reshaped
167 |             batch_size, n_channels, img_height, img_width = X.shape
168 |             output_val[:] = col2im(dX_col, batch_size, n_channels,
169 |                                    img_height, img_width, filter_height, filter_width,
170 |                                    padding_height, padding_width,
171 |                                    stride_height, stride_width)
172 |         else:
173 |             gpu_op.cudnn_conv2d_backward_data(W, output_grads, stride_height, stride_width,
174 |                                               padding_height, padding_width, output_val)
175 |
176 |     def gradient(self, node, output_grads):
177 |         raise NotImplementedError('Gradient of Conv2dGradientData not implemented')
178 |
179 |     def infer_shape(self, node, input_shapes):
180 |         assert len(input_shapes) == 3
181 |         X_size = input_shapes[0]
182 |         return X_size
183 |
184 |
185 | class Conv2dGradientBias(Op):
186 |     def __call__(self, node_A):
187 |         new_node = Op.__call__(self)
188 |         new_node.inputs = [node_A]
189 |         new_node.name = "Conv2dBackwardBias(%s)" % (node_A.name)
190 |         return new_node
191 |
192 |     def compute(self, node, input_vals, output_val, use_numpy=True):
193 |         assert len(input_vals) == 1
194 |
195 |         if use_numpy:
196 |             output_val[:] = input_vals[0].sum(axis=(0, 2, 3))
197 |         else:
198 |             gpu_op.cudnn_conv2d_backward_bias(input_vals[0], output_val)
199 |
200 |     def gradient(self, node, output_grads):
201 |         raise NotImplementedError('Gradient of Conv2dGradientBias not implemented')
202 |
203 |     def infer_shape(self, node, input_shapes):
204 |         assert len(input_shapes) == 1
205 |         # input_shapes[0] = (batch_size, num_filters, out_height, out_width)
206 |         return (input_shapes[0][1],)
207 |
208 |
209 | # Global singleton operators
210 | conv2d = Conv2dOp()
211 | conv2dBackFilter = Conv2dGradientFilter()
212 | conv2dBackData = Conv2dGradientData()
213 | conv2dBackBias = Conv2dGradientBias()
214 |
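# Self-contained numpy reference (a testing sketch, not used by the ops
# above): a direct four-loop convolution. The im2col-based forward pass in
# Conv2dOp should agree with this on small inputs, which makes for an easy
# unit test. `_naive_conv2d_reference` is a name invented for this sketch.
def _naive_conv2d_reference(X, W, b, strides=(1, 1), padding=(0, 0)):
    import numpy as np
    batch_size, _, h, w = X.shape
    n_filters, _, fh, fw = W.shape
    sh, sw = strides
    ph, pw = padding
    Xp = np.pad(X, ((0, 0), (0, 0), (ph, ph), (pw, pw)), mode='constant')
    h_new = (h - fh + 2 * ph) // sh + 1
    w_new = (w - fw + 2 * pw) // sw + 1
    out = np.zeros((batch_size, n_filters, h_new, w_new))
    for n in range(batch_size):
        for f in range(n_filters):
            for i in range(h_new):
                for j in range(w_new):
                    patch = Xp[n, :, i * sh:i * sh + fh, j * sw:j * sw + fw]
                    out[n, f, i, j] = np.sum(patch * W[f]) + b[f]
    return out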
--------------------------------------------------------------------------------
/aurora/nn/loss_functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from aurora.autodiff.autodiff import Op, zeros_like
3 |
4 | from .activations import softmax
5 | from .utils import log_sum_exp
6 |
7 | try:
8 |     from aurora.ndarray import gpu_op, ndarray
9 | except ImportError:
10 |     pass
11 |
12 |
13 | class CrossEntropyOp(Op):
14 |     def __call__(self, node_A, node_B):
15 |         new_node = Op.__call__(self)
16 |         new_node.inputs = [node_A, node_B]
17 |         new_node.name = 'CrossEntropy({0:s}, {1:s})'.format(node_A.name, node_B.name)
18 |         return new_node
19 |
20 |     def compute(self, node, input_vals, output_val, use_numpy=True):
21 |         assert len(input_vals) == 2
22 |         if use_numpy:
23 |             logits = input_vals[0]
24 |             actual = input_vals[1]
25 |             safe_log_softmax = logits - log_sum_exp(logits)
26 |             output_val[:] = np.mean(-np.sum(actual * safe_log_softmax, axis=1), keepdims=True)
27 |         else:
28 |             gpu_op.softmax_cross_entropy(input_vals[0], input_vals[1], output_val)
29 |
30 |     def gradient(self, node, output_grads):
31 |         grad_A = (softmax(node.inputs[0]) + -1 * node.inputs[1]) * output_grads
32 |         grad_B = zeros_like(node.inputs[1])
33 |         return [grad_A, grad_B]
34 |
35 |     def infer_shape(self, node, input_shapes):
36 |         assert len(input_shapes) == 2
37 |         return (1,)
38 |
39 |
40 | # TODO (upul) MSE
41 | # TODO (upul) RMSE
42 | # TODO (upul) sigmoid_cross_entropy_with_logits
43 |
44 | # Global singleton operations
45 | softmax_cross_entropy_with_logits = CrossEntropyOp()
46 |
--------------------------------------------------------------------------------
/aurora/nn/pooling.py:
--------------------------------------------------------------------------------
1 | from aurora.autodiff.autodiff import Op
2 | from aurora.nn.pyx.fast_pooling import max_pool_forward, max_pool_backward
3 |
4 | try:
5 |     from aurora.ndarray import gpu_op
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class MaxPoolOp(Op):
11 |     def __call__(self, input, filter=(2, 2), strides=(2, 2)):
12 |         new_node = Op.__call__(self)
13 |         new_node.inputs = [input]
14 |         new_node.filter = filter
15 |         new_node.strides = strides
16 |         new_node.cache = {}
17 |         new_node.name = 'MaxPoolOp({})'.format(input.name)
18 |         return new_node
19 |
20 |     def compute(self, node, input_vals, output_val, use_numpy=True):
21 |         assert len(input_vals) == 1
22 |
23 |         filter_height = node.filter[0]
24 |         filter_width = node.filter[1]
25 |         stride_height = node.strides[0]
26 |         stride_width = node.strides[1]
27 |
28 |         if use_numpy:
29 |             output_val[:] = max_pool_forward(input_vals[0],
30 |                                              filter_height=filter_height,
31 |                                              filter_width=filter_width,
32 |                                              stride_height=stride_height,
33 |                                              stride_width=stride_width)
34 |         else:
35 |             gpu_op.cudnn_pool_forward(input_vals[0],
36 |                                       filter_height, filter_width,
37 |                                       stride_height, stride_width,
38 |                                       'max',
39 |                                       output_val)
40 |         node.cache['forward'] = output_val
41 |
42 |     def gradient(self, node, output_grads):
43 |         return [maxPoolBack(node.inputs[0], output_grads, filter=node.filter, strides=node.strides, cache=node.cache)]
44 |
45 |     def infer_shape(self, node, input_shapes):
46 |         assert len(input_shapes) == 1
47 |
48 |         filter_height = node.filter[0]
49 |         filter_width = node.filter[1]
50 |         stride_height = node.strides[0]
51 |         stride_width = node.strides[1]
52 |
53 |         input_batch_size = input_shapes[0][0]
54 |         input_n_channels = input_shapes[0][1]
55 |         input_height = input_shapes[0][2]
56 |         input_width = input_shapes[0][3]
57 |
58 |         new_height = int((input_height - filter_height) / stride_height) + 1
59 |         new_width = int((input_width - filter_width) / stride_width) + 1
60 |         return input_batch_size, input_n_channels, new_height, new_width
61 |
62 |
63 | class MaxPoolGradientOp(Op):
64 |     def __call__(self, node_A, node_B, filter=(2, 2), strides=(2, 2), cache=None):
65 |         new_node = Op.__call__(self)
66 |         # node_B is the output_grad
67 |         new_node.inputs = [node_A, node_B]
68 |         new_node.filter = filter
69 |         new_node.strides = strides
70 |         new_node.cache = cache
71 |         new_node.name = 'MaxPoolGradientOp(%s)' % (node_A.name)
72 |         return new_node
73 |
74 |     def compute(self, node, input_vals, output_val, use_numpy=True):
75 |         assert len(input_vals) == 2
76 |
77 |         filter_height = node.filter[0]
78 |         filter_width = node.filter[1]
79 |         stride_height = node.strides[0]
80 |         stride_width = node.strides[1]
81 |
82 |         data = input_vals[0]
83 |         output_grad = input_vals[1]
84 |         if use_numpy:
85 |             output_val[:] = max_pool_backward(output_grad,
86 |                                               data,
87 |                                               filter_height=filter_height,
88 |                                               filter_width=filter_width,
89 |                                               stride_height=stride_height,
90 |                                               stride_width=stride_width
91 |                                               )
92 |         else:
93 |             gpu_op.cudnn_pool_backward(data, output_grad, node.cache['forward'],
94 |                                        filter_height, filter_width,
95 |                                        stride_height, stride_width,
96 |                                        'max',
97 |                                        output_val)
98 |
99 |     def gradient(self, node, output_grads):
100 |         raise NotImplementedError('Gradient of MaxPoolGradientOp is not implemented')
101 |
102 |     def infer_shape(self, node, input_shapes):
103 |         assert len(input_shapes) == 2
104 |         return input_shapes[0]
105 |
106 |
107 | # Global singleton operators
108 | maxPool = MaxPoolOp()
109 | maxPoolBack = MaxPoolGradientOp()
110 |
--------------------------------------------------------------------------------
/aurora/nn/pyx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/aurora/nn/pyx/__init__.py
--------------------------------------------------------------------------------
/aurora/nn/pyx/fast_pooling.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | import numpy as np
3 | cimport numpy as np
4 |
5 | # TODO: (Upul) We need a better way to represent a big negative number
6 | cdef double BIG_NEGATIVE = -1.0e15
7 |
8 | @cython.boundscheck(False)
9 | @cython.wraparound(False)
10 | def max_pool_forward(np.float64_t[:, :, :, :] data,
11 |                      int filter_height, int filter_width,
12 |                      int stride_height, int stride_width):
13 |     """
14 |     Forward pass of max pooling.
15 |     :param data: 4-D input, (batch_size, channels, height, width)
16 |     :param filter_height:
17 |     :param filter_width:
18 |     :param stride_height:
19 |     :param stride_width:
20 |     :return: the pooled 4-D output
21 |     """
22 |
23 |     cdef int batch_size = data.shape[0]
24 |     cdef int input_channels = data.shape[1]
25 |     cdef int height = data.shape[2]
26 |     cdef int width = data.shape[3]
27 |
28 |     # Define the dimensions of the output
29 |     cdef int n_H = int(1 + (height - filter_height) / stride_height)
30 |     cdef int n_W = int(1 + (width - filter_width) / stride_width)
31 |     cdef int n_C = input_channels
32 |
33 |     # Initialize output matrix
34 |     cdef np.float64_t[:, :, :, :] output = np.zeros((batch_size, n_C, n_H, n_W))
35 |
36 |     cdef int i, c, h, w, vert_start, vert_end, horiz_start, horiz_end, ii, jj
37 |     cdef double max_in_grid
38 |
39 |     for i in range(batch_size):       # loop over the training examples
40 |         for c in range(n_C):          # loop over the channels of the output volume
41 |             for h in range(n_H):      # loop on the vertical axis of the output volume
42 |                 for w in range(n_W):  # loop on the horizontal axis of the output volume
43 |                     # Find the corners of the current "slice"
44 |                     vert_start = h*stride_height
45 |                     vert_end = h*stride_height + filter_height
46 |                     horiz_start = w*stride_width
47 |                     horiz_end = w*stride_width + filter_width
48 |                     # finding the max value within the given grid
49 |                     max_in_grid = BIG_NEGATIVE
50 |                     for ii in range(vert_start, vert_end):
51 |                         for jj in range(horiz_start, horiz_end):
52 |                             if data[i, c, ii, jj] > max_in_grid:
53 |                                 max_in_grid = data[i, c, ii, jj]
54 |                     output[i, c, h, w] = max_in_grid
55 |     return output
56 |
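# Quick sanity check of the forward pass (illustrative; assumes the extension
# has been compiled, e.g. via `python setup.py build_ext`):
#
#     >>> import numpy as np
#     >>> from aurora.nn.pyx.fast_pooling import max_pool_forward
#     >>> x = np.arange(16, dtype=np.float64).reshape(1, 1, 4, 4)
#     >>> np.asarray(max_pool_forward(x, 2, 2, 2, 2))
#     array([[[[ 5.,  7.],
#              [13., 15.]]]])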
57 | @cython.boundscheck(False)
58 | @cython.wraparound(False)
59 | def max_pool_backward(np.float64_t[:, :, :, :] output_grad,
60 |                       np.float64_t[:, :, :, :] input_data,
61 |                       int filter_height=2, int filter_width=2,
62 |                       int stride_height=2, int stride_width=2):
63 |     """
64 |     Backward pass of max pooling.
65 |     :param output_grad: gradient w.r.t. the pooled output
66 |     :param input_data: the input of the forward pass
67 |     :param filter_height:
68 |     :param filter_width:
69 |     :param stride_height:
70 |     :param stride_width:
71 |     :return: gradient w.r.t. input_data
72 |     """
73 |     batch_size = output_grad.shape[0]
74 |     channels = output_grad.shape[1]
75 |     height = output_grad.shape[2]
76 |     width = output_grad.shape[3]
77 |
78 |     return _max_pool_backward_inner(output_grad, input_data,
79 |                                     batch_size, channels, height,
80 |                                     width, filter_height,
81 |                                     filter_width, stride_height,
82 |                                     stride_width)
83 |
84 | @cython.boundscheck(False)
85 | @cython.wraparound(False)
86 | cdef _max_pool_backward_inner(np.float64_t[:, :, :, :] output_grad,
87 |                               np.float64_t[:, :, :, :] input_data,
88 |                               int batch_size,
89 |                               int channels,
90 |                               int height, int width,
91 |                               int filter_height, int filter_width,
92 |                               int stride_height, int stride_width):
93 |     """
94 |     Inner loop of the max-pooling backward pass.
95 |     :param output_grad:
96 |     :param input_data:
97 |     :param batch_size:
98 |     :param channels:
99 |     :param height:
100 |     :param width:
101 |     :param filter_height:
102 |     :param filter_width:
103 |     :param stride_height:
104 |     :param stride_width:
105 |     :return: gradient w.r.t. input_data
106 |     """
107 |
108 |     grad_input = np.zeros_like(input_data)
109 |
110 |     cdef np.float64_t[:, :, :] cct_example
111 |     cdef int i, h, w, c, vert_start, vert_end, horiz_start, horiz_end, slice_height, slice_width, max_i, max_j
112 |     cdef double max_value, cct_value
113 |
114 |     # loop over the training examples
115 |     for i in range(batch_size):
116 |
117 |         # pick the current training example
118 |         cct_example = input_data[i, :, :, :]
119 |
120 |         for h in range(height):            # loop on the vertical axis
121 |             for w in range(width):         # loop on the horizontal axis
122 |                 for c in range(channels):  # loop over the channels (depth)
123 |
124 |                     # Find the corners of the current slice.
125 |                     vert_start = h*stride_height
126 |                     vert_end = h*stride_height + filter_height
127 |                     horiz_start = w*stride_width
128 |                     horiz_end = w*stride_width + filter_width
129 |
130 |                     # Route the incoming gradient to the argmax of this slice.
131 |                     max_value = BIG_NEGATIVE
132 |                     for slice_height in range(vert_start, vert_end):
133 |                         for slice_width in range(horiz_start, horiz_end):
134 |                             cct_value = cct_example[c, slice_height, slice_width]
135 |                             if cct_value > max_value:
136 |                                 max_value = cct_value
137 |                                 max_i = slice_height
138 |                                 max_j = slice_width
139 |                     grad_input[i, c, max_i, max_j] += output_grad[i, c, h, w]
140 |     return grad_input
--------------------------------------------------------------------------------
/aurora/nn/pyx/im2col.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | import numpy as np
3 | cimport numpy as np
4 |
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | cdef im2col_inner(np.float64_t[:, :, :, :] x_padded,
8 |                   np.float64_t[:, :] out,
9 |                   int h_new, int w_new, int C, int M,
10 |                   int filter_height, int filter_width,
11 |                   int stride_height, int stride_width):
12 |
13 |     cdef int itr = 0
14 |     cdef int start_i, end_i, start_j, end_j
15 |     cdef int i, j, m
16 |     cdef int k, c, p_h, p_w
17 |
18 |     for i in range(h_new):
19 |         for j in range(w_new):
20 |             for m in range(M):
21 |                 start_i = stride_height * i
22 |                 end_i = stride_height * i + filter_height  # vertical extent spans filter_height rows
23 |                 start_j = stride_width * j
24 |                 end_j = stride_width * j + filter_width    # horizontal extent spans filter_width columns
25 |
26 |                 k = 0
27 |                 for c in range(C):
28 |                     for p_h in range(start_i, end_i):
29 |                         for p_w in range(start_j, end_j):
30 |                             out[k, itr] = x_padded[m, c, p_h, p_w]
31 |                             k += 1
32 |                 itr += 1
33 |
34 |
35 | @cython.boundscheck(False)
36 | @cython.wraparound(False)
37 | cdef col2img_inner(np.float64_t[:, :] cols,
38 |                    np.float64_t[:, :, :, :] x_padded,
39 |                    int filter_height, int filter_width,
40 |                    int N, int C, int H, int W,
41 |                    int H_padded, int W_padded,
42 |                    int padding_height, int padding_width,
43 |                    int stride_height, int stride_width):
44 |     cdef int idx = 0
45 |     cdef int i, j, m, c, sh, sw
46 |     cdef int start_height, start_width, k
47 |     cdef np.float64_t[:] col
48 |
49 |     cdef int p = H_padded - filter_height + 1
50 |     cdef int q = W_padded - filter_width + 1
51 |     i = 0
52 |     while i < p:
53 |         j = 0
54 |         while j < q:
55 |             for m in range(N):
56 |                 col = cols[:, idx]
57 |                 start_height = i
58 |                 start_width = j
59 |                 k = 0
60 |                 for c in range(C):
61 |                     for sh in range(start_height, start_height + filter_height):
62 |                         for sw in range(start_width, start_width + filter_width):
63 |                             x_padded[m, c, sh, sw] += col[k]
64 |                             k += 1
65 |                 idx += 1
66 |             j += stride_width
67 |         i += stride_height
68 |     if padding_height > 0 or padding_width > 0:
69 |         return x_padded[:, :, padding_height:(H_padded - padding_height), padding_width:(W_padded - padding_width)]
70 |     else:
71 |         return x_padded
72 |
73 |
74 | @cython.boundscheck(False)
75 | @cython.wraparound(False)
76 | def im2col(np.float64_t[:, :, :, :] image,
77 |            int filter_height=3, int filter_width=3,
78 |            int padding_height=0, int padding_width=0,
79 |            int stride_height=1, int stride_width=1):
80 |
81 |     cdef int images_per_batch = image.shape[0]
82 |     cdef int n_channels = image.shape[1]
83 |     cdef int img_h = image.shape[2]
84 |     cdef int img_w = image.shape[3]
85 |
86 |     cdef np.float64_t[:, :, :, :] x_padded = np.pad(image, ((0, 0),
87 |                                                             (0, 0),
88 |                                                             (padding_height, padding_height),
89 |                                                             (padding_width, padding_width)),
90 |                                                     mode='constant')
91 |
92 |     cdef int new_h = int((img_h - filter_height + 2 * padding_height) / stride_height + 1)
93 |     cdef int new_w = int((img_w - filter_width + 2 * padding_width) / stride_width + 1)
94 |
95 |     cdef int col_height = filter_width * filter_height * n_channels
96 | cdef int col_width = images_per_batch * new_h * new_w
97 |
98 |     cdef np.float64_t[:, :] result = np.zeros((col_height, col_width))
99 |
100 |     im2col_inner(x_padded, result, new_h, new_w, n_channels, images_per_batch,
101 |                  filter_height, filter_width, stride_height, stride_width)
102 |
103 |     return result
104 |
105 | @cython.boundscheck(False)
106 | @cython.wraparound(False)
107 | def col2im(np.float64_t[:, :] col2img_converted, int batch_size,
108 |            int no_channels, int image_height, int image_width,
109 |            int filter_height=3, int filter_width=3,
110 |            int padding_height=0, int padding_width=0,
111 |            int stride_height=1, int stride_width=1):
112 |
113 |     cdef int padded_h = image_height + 2 * padding_height
114 |     cdef int padded_w = image_width + 2 * padding_width
115 |     cdef np.float64_t[:, :, :, :] result = np.zeros((batch_size, no_channels, padded_h, padded_w))
116 |
117 |     # col2img_inner strips the padding (if any) before returning
118 |     return col2img_inner(col2img_converted, result, filter_height,
119 |                          filter_width, batch_size, no_channels,
120 |                          image_height, image_width, padded_h,
121 |                          padded_w, padding_height, padding_width,
122 |                          stride_height, stride_width)
--------------------------------------------------------------------------------
/aurora/nn/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def softmax_func(x):
5 |     """
6 |     Numerically stable softmax function. For more details
7 |     about numerical calculations please refer to:
8 |     http://www.deeplearningbook.org/slides/04_numerical.pdf
9 |     :param x:
10 |     :return:
11 |     """
12 |     stable_values = x - np.max(x, axis=1, keepdims=True)
13 |     return np.exp(stable_values) / np.sum(np.exp(stable_values), axis=1, keepdims=True)
14 |
15 |
16 | def log_sum_exp(x):
17 |     """
18 |     log_sum_exp is a very useful function in machine learning.
19 |     It can be seen in many places, including the cross-entropy error.
20 |     However, the naive implementation is numerically unstable.
21 |     Therefore, we use the following implementation. For more details
22 |     please refer to: http://www.deeplearningbook.org/slides/04_numerical.pdf
23 |     :param x:
24 |     :return:
25 |     """
26 |     mx = np.max(x, axis=1, keepdims=True)
27 |     safe = x - mx
28 |     return mx + np.log(np.sum(np.exp(safe), axis=1, keepdims=True))
29 |
30 |
31 | # The following two methods were used in the initial version of the convolution operations.
32 | # Later we introduced fast Cython versions of the `im2col` and `col2im` implementations.
33 | # Hence, these two methods are obsolete.
34 | def im2col(image, filter_size=(3, 3), padding=(0, 0), stride=(1, 1)):
35 |     M, C, h, w = image.shape
36 |     filter_height = filter_size[0]
37 |     filter_width = filter_size[1]
38 |     padding_height = padding[0]
39 |     padding_width = padding[1]
40 |     stride_height = stride[0]
41 |     stride_width = stride[1]
42 |     x_padded = np.pad(image, ((0, 0),
43 |                               (0, 0),
44 |                               (padding_height, padding_height),
45 |                               (padding_width, padding_width)),
46 |                       mode='constant')
47 |     h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
48 |     w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
49 |
50 |     out = np.zeros((filter_width * filter_height * C, M * h_new * w_new), dtype=image.dtype)
51 |
52 |     itr = 0
53 |     for i in range(h_new):
54 |         for j in range(w_new):
55 |             for m in range(M):
56 |                 start_i = stride_height * i
57 |                 end_i = stride_height * i + filter_height  # vertical extent: filter_height rows
58 |                 start_j = stride_width * j
59 |                 end_j = stride_width * j + filter_width    # horizontal extent: filter_width columns
60 |                 out[:, itr] = x_padded[m, :, start_i:end_i, start_j:end_j].ravel()
61 |                 itr += 1
62 |     return out
63 |
64 |
65 | def col2im(cols, x_shape, filter_size=(3, 3), padding=(0, 0), stride=(1, 1)):
66 |     N, C, H, W = x_shape
67 |     filter_height = filter_size[0]
68 |     filter_width = filter_size[1]
69 |     padding_height = padding[0]
70 |     padding_width = padding[1]
71 |     stride_height = stride[0]
72 |     stride_width = stride[1]
73 |
74 |     H_padded, W_padded = H + 2 * padding_height, W + 2 * padding_width
75 |     x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
76 |
77 |     idx = 0
78 |     for i in range(0, H_padded - filter_height + 1, stride_height):
79 |         for j in range(0, W_padded - filter_width + 1, stride_width):
80 |             for m in range(N):
81 |                 col = cols[:, idx]
82 |                 col = col.reshape((C, filter_height, filter_width))
83 |                 x_padded[m, :, i:i + filter_height, j:j + filter_width] += col
84 |                 idx += 1
85 |     if padding_height > 0 or padding_width > 0:
86 |         return x_padded[:, :, padding_height:H_padded - padding_height, padding_width:W_padded - padding_width]
87 |     else:
88 |         return x_padded
--------------------------------------------------------------------------------
/aurora/optim/__init__.py:
--------------------------------------------------------------------------------
1 | from .sgd import SGD
2 | from .adam import Adam
3 |
4 | __all__ = ['SGD', 'Adam']
--------------------------------------------------------------------------------
/aurora/optim/adam.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .base import Base
3 |
4 | try:
5 |     from aurora.ndarray import gpu_op, ndarray
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class Adam(Base):
11 |     def __init__(self, cost, params, lr=1e-3, beta1=0.9, beta2=0.995, eps=1e-5, use_gpu=False):
12 |         super().__init__(cost, params, lr, use_gpu=use_gpu)
13 |         self.beta1 = beta1
14 |         self.beta2 = beta2
15 |
16 |         if self.use_gpu:
17 |             self.velocity = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
18 |                              for param in params]
19 |             self.momentum = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
20 |                              for param in params]
21 |
22 |             self.vec_hat = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
23 |                             for param in self.params]
24 |             self.mom_hat = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
25 |                             for param in self.params]
26 |         else:
27 |             self.velocity = [np.zeros_like(param.const) for param in params]
28 |             self.momentum = [np.zeros_like(param.const) for param in params]
29 |
30 |         self.time = 0
31 |         self.eps = eps
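    # step() below implements the standard bias-corrected Adam update; for a
    # single parameter with gradient g_t at step t it computes:
    #
    #     m_t   = beta1 * m_{t-1} + (1 - beta1) * g_t         # first moment
    #     v_t   = beta2 * v_{t-1} + (1 - beta2) * g_t ** 2    # second moment
    #     m_hat = m_t / (1 - beta1 ** t)                      # bias correction
    #     v_hat = v_t / (1 - beta2 ** t)
    #     param = param - lr * m_hat / (sqrt(v_hat) + eps)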
32 |
33 |     def step(self, feed_dict):
34 |         exe_output = self.executor.run(feed_dict)
35 |         self.time += 1
36 |
37 |         if self.use_gpu:
38 |             # zero out the scratch buffers for the bias-corrected moments
39 |             for i in range(len(self.vec_hat)):
40 |                 gpu_op.matrix_elementwise_multiply_by_const(self.vec_hat[i], 0.0, self.vec_hat[i])
41 |                 gpu_op.matrix_elementwise_multiply_by_const(self.mom_hat[i], 0.0, self.mom_hat[i])
42 |
43 |             for i in range(len(self.params)):
44 |                 gpu_op.matrix_elementwise_multiply_by_const(self.momentum[i], self.beta1, self.momentum[i])
45 |
46 |                 # TODO: (upul) copying dev->host->dev is expensive. We need a better approach.
47 |                 tem_gpu_array = ndarray.array(exe_output[i + 1].asnumpy(), ctx=ndarray.gpu(0))
48 |                 gpu_op.matrix_elementwise_multiply_by_const(exe_output[i + 1], (1 - self.beta1), tem_gpu_array)
49 |                 gpu_op.matrix_elementwise_add(self.momentum[i], tem_gpu_array, self.momentum[i])
50 |                 gpu_op.matrix_elementwise_div_by_const(self.momentum[i], (1 - self.beta1 ** self.time), self.mom_hat[i])
51 |
52 |                 gpu_op.matrix_elementwise_multiply_by_const(self.velocity[i], self.beta2, self.velocity[i])
53 |                 gpu_op.matrix_elementwise_multiply(exe_output[i + 1], exe_output[i + 1], exe_output[i + 1])
54 |                 gpu_op.matrix_elementwise_multiply_by_const(exe_output[i + 1], (1 - self.beta2), exe_output[i + 1])
55 |                 gpu_op.matrix_elementwise_add(self.velocity[i], exe_output[i + 1], self.velocity[i])
56 |                 gpu_op.matrix_elementwise_div_by_const(self.velocity[i], (1 - self.beta2 ** self.time), self.vec_hat[i])
57 |
58 |             for i in range(len(self.params)):
59 |                 gpu_op.matrix_elementwise_sqrt(self.vec_hat[i], self.vec_hat[i])
60 |                 gpu_op.matrix_elementwise_add_by_const(self.vec_hat[i], self.eps, self.vec_hat[i])
61 |
62 |                 gpu_op.matrix_elementwise_multiply_by_const(self.mom_hat[i], -1 * self.lr, self.mom_hat[i])
63 |                 gpu_op.matrix_elementwise_division(self.mom_hat[i], self.vec_hat[i], self.mom_hat[i])
64 |                 gpu_op.matrix_elementwise_add(self.params[i].const, self.mom_hat[i], self.params[i].const)
65 |
66 |         else:
67 |             vec_hat = [np.zeros_like(param.const) for param in self.params]
68 |             mom_hat = [np.zeros_like(param.const) for param in self.params]
69 |
70 |             for i in range(len(self.params)):
71 |                 self.momentum[i] = self.beta1 * self.momentum[i] + (1 - self.beta1) * exe_output[i + 1]
72 |                 mom_hat[i] = self.momentum[i] / (1 - self.beta1 ** self.time)
73 |
74 |                 self.velocity[i] = self.beta2 * self.velocity[i] + (1 - self.beta2) * (exe_output[i + 1] ** 2)
75 |                 vec_hat[i] = self.velocity[i] / (1 - self.beta2 ** self.time)
76 |
77 |             for i in range(len(self.params)):
78 |                 self.params[i].const += -self.lr * mom_hat[i] / (np.sqrt(vec_hat[i]) + self.eps)
79 |
80 |         cost = exe_output[0]
81 |         if self.use_gpu:
82 |             cost = cost.asnumpy()
83 |         return cost
84 |
--------------------------------------------------------------------------------
/aurora/optim/base.py:
--------------------------------------------------------------------------------
1 | import aurora.autodiff as ad
2 | try:
3 |     from aurora.ndarray import ndarray
4 | except ImportError:
5 |     pass
6 |
7 |
8 | class Base:
9 |     def __init__(self, cost, params, lr=0.1, use_gpu=False):
10 |         self.cost = cost
11 |
12 |         # if use_gpu == True, create matrices in GPU
13 |         self.params = self._copy_to_gpu(params) if use_gpu else params
14 |         self.lr = lr
15 |         grads = ad.gradients(cost, params)
16 |         grads.insert(0, cost)
17 |         self.use_gpu = use_gpu
18 |         self.executor = ad.Executor(grads, use_gpu=use_gpu)
19 |
20 |     def step(self, feed_dict):
21 |         raise NotImplementedError('This method should be
implemented by subclasses') 22 | 23 | @staticmethod 24 | def _copy_to_gpu(params): 25 | ctx = ndarray.gpu(0) 26 | gpu_arrays = [] 27 | for param in params: 28 | param.const = ndarray.array(param.const, ctx=ctx) 29 | gpu_arrays.append(param) 30 | return gpu_arrays 31 | -------------------------------------------------------------------------------- /aurora/optim/sgd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .base import Base 3 | try: 4 | from aurora.ndarray import gpu_op, ndarray 5 | except ImportError: 6 | pass 7 | 8 | 9 | class SGD(Base): 10 | def __init__(self, cost, params, lr=0.1, momentum=0.9, use_gpu=False): 11 | super().__init__(cost, params, lr=lr, use_gpu=use_gpu) 12 | self.momentum = momentum 13 | if use_gpu: 14 | self.velocity = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0)) 15 | for param in params] 16 | else: 17 | self.velocity = [np.zeros_like(param.const) for param in params] 18 | 19 | def step(self, feed_dict): 20 | exe_output = self.executor.run(feed_dict) 21 | for i in range(len(self.params)): 22 | if self.use_gpu: 23 | gpu_op.matrix_elementwise_multiply_by_const(self.velocity[i], self.momentum, self.velocity[i]) 24 | gpu_op.matrix_elementwise_multiply_by_const(exe_output[1 + i], -self.lr, exe_output[1 + i]) 25 | gpu_op.matrix_elementwise_add(self.velocity[i], exe_output[1 + i], self.velocity[i]) 26 | 27 | gpu_op.matrix_elementwise_add(self.params[i].const, self.velocity[i], self.params[i].const) 28 | else: 29 | self.velocity[i] = self.momentum * self.velocity[i] - self.lr * exe_output[1 + i] 30 | self.params[i].const += self.velocity[i] 31 | 32 | cost = exe_output[0] 33 | if self.use_gpu: 34 | cost = cost.asnumpy() 35 | return cost 36 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_DIR = /usr/local/cuda 2 | 3 | CC_SRCS := $(wildcard src/*.cc) 4 | CC_OBJS := ${CC_SRCS:src/%.cc=build/obj/%.o} 5 | CUDA_SRCS := $(wildcard src/*.cu) 6 | CUDA_OBJS := ${CUDA_SRCS:src/%.cu=build/obj/%.o} 7 | OBJS := $(CC_OBJS) $(CUDA_OBJS) 8 | 9 | CC = g++ 10 | WARNINGS = -Wall -Wfatal-errors -Wno-unused -Wno-unused-result 11 | CC_FLAGS = -std=c++11 -fPIC $(WARNINGS) -I$(CUDA_DIR)/include 12 | LD_FLAGS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcublas -lcudnn 13 | 14 | NVCC = nvcc 15 | NVCC_FLAGS = -std=c++11 --compiler-options '-fPIC' 16 | ARCH = -gencode arch=compute_30,code=sm_30 \ 17 | -gencode arch=compute_35,code=sm_35 \ 18 | -gencode arch=compute_50,code=[sm_50,compute_50] \ 19 | -gencode arch=compute_52,code=[sm_52,compute_52] 20 | 21 | all: build/lib/libc_runtime_api.so 22 | 23 | build/lib/libc_runtime_api.so: $(OBJS) 24 | @mkdir -p build/lib 25 | $(CC) -shared $^ -o $@ $(LD_FLAGS) 26 | 27 | build/obj/%.o: src/%.cc 28 | @mkdir -p build/obj 29 | $(CC) $(CC_FLAGS) -c $< -o $@ 30 | 31 | build/obj/%.o: src/%.cu 32 | @mkdir -p build/obj 33 | $(NVCC) $(ARCH) $(NVCC_FLAGS) -c $< -o $@ 34 | 35 | clean: 36 | rm -rf build 37 | 38 | .PHONY: clean 39 | -------------------------------------------------------------------------------- /cuda/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | project(assignment2) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | set(SOURCE_FILES 7 | src/c_runtime_api.cc 8 | src/c_runtime_api.h 9 | src/cpu_device_api.cc 10 | src/cpu_device_api.h 11 | 
src/cuda_device_api.cc
12 |         src/cuda_device_api.h
13 |         src/device_api.h
14 |         src/dlarray.h
15 |         src/runtime_base.h)
16 |
17 | add_executable(assignment2 ${SOURCE_FILES})
--------------------------------------------------------------------------------
/cuda/src/c_runtime_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  *  Copyright (c) 2017 by Contributors
3 |  * \file c_runtime_api.cc
4 |  * \brief Device specific implementations
5 |  */
6 | #include "./c_runtime_api.h"
7 | #include "./cpu_device_api.h"
8 | #include "./cuda_device_api.h"
9 | #include "./runtime_base.h"
10 | #include <algorithm>
11 | #include <array>
12 | #include <cassert>
13 | #include <cstdlib>
14 | #include <cstring>
15 | #include <iostream>
16 | #include <string>
17 | #include <thread>
18 |
19 | namespace dlsys {
20 |     namespace runtime {
21 |
22 |         class DeviceAPIManager {
23 |         public:
24 |             static const int kMaxDeviceAPI = 8;
25 |
26 |             // Get API
27 |             static DeviceAPI *Get(DLContext ctx) {
28 |                 return Global()->GetAPI(ctx.device_type);
29 |             }
30 |
31 |         private:
32 |             std::array<DeviceAPI *, kMaxDeviceAPI> api_;
33 |
34 |             DeviceAPIManager() {
35 |                 std::fill(api_.begin(), api_.end(), nullptr);
36 |                 static CPUDeviceAPI cpu_device_api_inst;
37 |                 static CUDADeviceAPI gpu_device_api_inst;
38 |                 api_[kCPU] = static_cast<DeviceAPI *>(&cpu_device_api_inst);
39 |                 api_[kGPU] = static_cast<DeviceAPI *>(&gpu_device_api_inst);
40 |             }
41 |
42 |             // Get global static variable.
43 |             static DeviceAPIManager *Global() {
44 |                 static DeviceAPIManager inst;
45 |                 return &inst;
46 |             }
47 |
48 |             // Get API.
49 |             DeviceAPI *GetAPI(DLDeviceType type) {
50 |                 if (api_[type] == nullptr) {
51 |                     std::cerr << "Device API not supported" << std::endl;
52 |                     exit(EXIT_FAILURE);
53 |                 }
54 |                 return api_[type];
55 |             }
56 |         };
57 |
58 |         inline DLArray *DLArrayCreate_() {
59 |             DLArray *arr = new DLArray();
60 |             arr->shape = nullptr;
61 |             arr->ndim = 0;
62 |             arr->data = nullptr;
63 |             return arr;
64 |         }
65 |
66 |         inline void DLArrayFree_(DLArray *arr) {
67 |             if (arr != nullptr) {
68 |                 // ok to delete nullptr
69 |                 delete[] arr->shape;
70 |                 if (arr->data != nullptr) {
71 |                     DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(arr->ctx, arr->data);
72 |                 }
73 |             }
74 |             delete arr;
75 |         }
76 |
77 |         inline size_t GetDataSize(DLArray *arr) {
78 |             size_t size = 1;
79 |             for (index_t i = 0; i < arr->ndim; ++i) {
80 |                 size *= arr->shape[i];
81 |             }
82 |             // assume 32-bit float
83 |             size *= 4;
84 |             return size;
85 |         }
86 |
87 |         inline size_t GetDataAlignment(DLArray *arr) {
88 |             // assume 32-bit float
89 |             return 8;
90 |         }
91 |
92 |     } // namespace runtime
93 | } // namespace dlsys
94 |
95 | using namespace dlsys::runtime;
96 |
97 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx,
98 |                  DLArrayHandle *out) {
99 |     DLArray *arr = nullptr;
100 |     API_BEGIN();
101 |     // shape
102 |     arr = DLArrayCreate_();
103 |     // ndim
104 |     arr->ndim = ndim;
105 |     index_t *shape_copy = new index_t[ndim];
106 |     std::copy(shape, shape + ndim, shape_copy);
107 |     arr->shape = shape_copy;
108 |     // ctx
109 |     arr->ctx = ctx;
110 |     size_t size = GetDataSize(arr);
111 |     size_t alignment = GetDataAlignment(arr);
112 |     arr->data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, size, alignment);
113 |     *out = arr;
114 |     API_END_HANDLE_ERROR(DLArrayFree_(arr));
115 | }
116 |
117 | int DLArrayFree(DLArrayHandle handle) {
118 |     API_BEGIN();
119 |     DLArray *arr = handle;
120 |     DLArrayFree_(arr);
121 |     API_END();
122 | }
123 |
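// Usage sketch (an illustrative comment only, not compiled into the library):
// the typical DLArray lifecycle as driven by the ctypes wrappers in
// aurora/ndarray/ndarray.py -- allocate, copy, free. Error handling omitted,
// and the DLContext field names follow the Python ctypes mirror.
//
//     index_t shape[2] = {2, 3};
//     DLArrayHandle arr;
//     DLContext ctx;
//     ctx.device_id = 0;
//     ctx.device_type = kCPU;
//     DLArrayAlloc(shape, 2, ctx, &arr);   // returns 0 on success
//     // ... DLArrayCopyFromTo(host_arr, arr, nullptr); ...
//     DLArrayFree(arr);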
124 | int DLArrayReshape(const DLArrayHandle handle, const index_t *new_shape, index_t new_dim) {
125 |     API_BEGIN();
126 |     DLArray *arr = handle;
127 |     delete[] arr->shape;  // release the old shape buffer before replacing it
128 |     index_t *shape_copy = new index_t[new_dim];
129 |     std::copy(new_shape, new_shape + new_dim, shape_copy);
130 |     arr->shape = shape_copy;
131 |     arr->ndim = new_dim;
132 |     API_END();
133 | }
134 |
135 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to,
136 |                       DLStreamHandle stream) {
137 |     API_BEGIN();
138 |     size_t from_size = GetDataSize(from);
139 |     size_t to_size = GetDataSize(to);
140 |     // The size must exactly match
141 |     assert(from_size == to_size);
142 |     DLContext ctx = from->ctx;
143 |     if (ctx.device_type == kCPU) {
144 |         ctx = to->ctx;
145 |     } else {
146 |         // Cannot copy across different ctx types directly
147 |         assert((to->ctx.device_type == kCPU) ||
148 |                (to->ctx.device_type == from->ctx.device_type));
149 |     }
150 |     DeviceAPIManager::Get(ctx)->CopyDataFromTo(from->data, to->data, from_size,
151 |                                                from->ctx, to->ctx, stream);
152 |     API_END();
153 | }
154 |
--------------------------------------------------------------------------------
/cuda/src/c_runtime_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  *  Copyright (c) 2017 by Contributors
3 |  * \file c_runtime_api.h
4 |  * \brief DL runtime library.
5 |  *
6 |  */
7 |
8 | #ifndef DLSYS_RUNTIME_C_RUNTIME_API_H_
9 | #define DLSYS_RUNTIME_C_RUNTIME_API_H_
10 |
11 | #ifdef __cplusplus
12 | #define DLSYS_EXTERN_C extern "C"
13 | #else
14 | #define DLSYS_EXTERN_C
15 | #endif
16 |
17 | #include "dlarray.h"
18 | #include <stddef.h>
19 | #include <stdint.h>
20 |
21 | DLSYS_EXTERN_C {
22 | /*! \brief type of array index. */
23 | typedef int64_t index_t;
24 |
25 | /*! \brief the array handle */
26 | typedef DLArray *DLArrayHandle;
27 | /*!
28 |  * \brief The stream that is specific to device
29 |  * can be NULL, which indicates the default one.
30 |  */
31 | typedef void *DLStreamHandle;
32 |
33 | // Array related apis for quick prototyping
34 | /*!
35 |  * \brief Allocate a nd-array's memory,
36 |  *  including space of shape, of given spec.
37 |  *
38 |  * \param shape The shape of the array, the data content will be copied to out
39 |  * \param ndim The number of dimension of the array.
40 |  * \param ctx The ctx this array sits on.
41 |  * \param out The output handle.
42 |  * \return 0 when success, -1 when failure happens
43 |  */
44 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx,
45 |                  DLArrayHandle *out);
46 |
47 | /*!
48 |  * \brief Free the DL Array.
49 |  * \param handle The array handle to be freed.
50 |  * \return 0 when success, -1 when failure happens
51 |  */
52 | int DLArrayFree(DLArrayHandle handle);
53 |
54 | /*!
55 |  * \brief Copy the array, both from and to must be valid during the copy.
56 |  * \param from The array to be copied from.
57 |  * \param to The target space.
58 |  * \param stream The stream where the copy happens, can be NULL.
59 |  * \return 0 when success, -1 when failure happens
60 |  */
61 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to,
62 |                       DLStreamHandle stream);
63 |
64 | /*!
65 |  * \brief Set all array elements to given value.
66 |  * \param arr The array to be Set.
67 |  * \param value The target value.
68 |  * \return 0 when success, -1 when failure happens
69 |  */
70 | int DLGpuArraySet(DLArrayHandle arr, float value);
71 |
72 |
73 | int DLArrayReshape(const DLArrayHandle handle, const index_t *new_shape, index_t new_dim);
74 |
75 | /*!
76 |  * \brief Broadcast input array to output array.
77 |  * \param input The input array.
78 |  * \param output The output array.
79 | * \return 0 when success, -1 when failure happens 80 | */ 81 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output); 82 | 83 | /*! 84 | * \brief Reduce sum input array by axis=0 and store to output. 85 | * \param input The input array. 86 | * \param output The output array. 87 | * \return 0 when success, -1 when failure happens 88 | */ 89 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output); 90 | 91 | /*! 92 | * \brief Elementwise add two matrices and store to output. 93 | * \param matA The left input array. 94 | * \param matB The right input array. 95 | * \param output The output array. 96 | * \return 0 when success, -1 when failure happens 97 | */ 98 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 99 | const DLArrayHandle matB, DLArrayHandle output); 100 | 101 | /*! 102 | * \brief Add matrix by const and store to output. 103 | * \param input The input array. 104 | * \param val The constant. 105 | * \param output The output array. 106 | * \return 0 when success, -1 when failure happens 107 | */ 108 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 109 | DLArrayHandle output); 110 | 111 | 112 | int DLGpuMatrixElementwiseSubtract(const DLArrayHandle matA, 113 | const DLArrayHandle matB, DLArrayHandle output); 114 | 115 | int DLGpuMatrixElementwiseSubtractByConst(const DLArrayHandle input, float val, 116 | DLArrayHandle output); 117 | 118 | /*! 119 | * \brief Elementwise multiply two matrices and store to output. 120 | * \param matA The left input array. 121 | * \param matB The right input array. 122 | * \param output The output array. 123 | * \return 0 when success, -1 when failure happens 124 | */ 125 | int DLGpuMatrixElementwiseMultiply( 126 | const DLArrayHandle matA, const DLArrayHandle matB, DLArrayHandle output); 127 | 128 | /*! 129 | * \brief Multiply matrix by const and store to output. 130 | * \param input The input array. 131 | * \param val The constant. 132 | * \param output The output array. 133 | * \return 0 when success, -1 when failure happens 134 | */ 135 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 136 | DLArrayHandle output); 137 | 138 | 139 | // TODO: (upul) documentation 140 | int DLGpuMatrixElementwiseDiv(const DLArrayHandle matA, 141 | const DLArrayHandle matB, 142 | DLArrayHandle output); 143 | 144 | // TODO: (upul) documentation 145 | int DLGpuMatrixElementwiseDivByConst(const DLArrayHandle matA, float val, 146 | DLArrayHandle output); 147 | 148 | /*! 149 | * \brief Matrix multiply two matrices and store to output. 150 | * \param matA The left input array. 151 | * \param transposeA Whether matA needs to be transposed 152 | * \param matB The right input array. 153 | * \param transposeB Whether matB needs to be transposed 154 | * \param output The output array. 155 | * \return 0 when success, -1 when failure happens 156 | */ 157 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 158 | const DLArrayHandle matB, bool transposeB, 159 | DLArrayHandle matC); 160 | 161 | /*! 162 | * \brief Compute relu on all array elements, and store to output. 163 | * \param input The input array. 164 | * \param output The output value. 165 | * \return 0 when success, -1 when failure happens 166 | */ 167 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output); 168 | 169 | /*! 170 | * \brief Compute relu gradient, and store to output. 171 | * \param input The input array. 172 | * \param in_grad The input gradients value. 
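 *  \note in_grad is passed through where the corresponding input element is
 *  positive and zeroed elsewhere (see relu_gradient_kernel in gpu_op.cu).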
173 | * \param output The output array. 174 | * \return 0 when success, -1 when failure happens 175 | */ 176 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 177 | DLArrayHandle output); 178 | 179 | /*! 180 | * \brief Compute softmax on matrix, and store to output. 181 | * \param input The input array. 182 | * \param output The output value. 183 | * \return 0 when success, -1 when failure happens 184 | */ 185 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output); 186 | 187 | /*! 188 | * \brief Compute softmax_cross_entropy. 189 | * np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 190 | * \param input_a The y array. 191 | * \param input_b The y_ array. 192 | * \param output The output value. 193 | * \return 0 when success, -1 when failure happens 194 | */ 195 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 196 | const DLArrayHandle input_b, 197 | DLArrayHandle output); 198 | 199 | int DLGpuMatrixElementwiseSqrt(const DLArrayHandle input_a, DLArrayHandle output); 200 | 201 | /* 202 | * CUDNN.... 203 | */ 204 | int cudnnReLUForward(const DLArrayHandle input, DLArrayHandle output); 205 | 206 | int cudnnConv2DForward(const DLArrayHandle input, 207 | const DLArrayHandle filter, 208 | const DLArrayHandle bias, 209 | const int stride_height, 210 | const int stride_width, 211 | const int padding_height, 212 | const int padding_width, 213 | DLArrayHandle output); 214 | 215 | int cudnnPoolForward(const DLArrayHandle input, 216 | const int pooling_height, 217 | const int pooling_width, 218 | const int stride_height, 219 | const int stride_width, 220 | const char *mode, 221 | DLArrayHandle output); 222 | 223 | int cudnnPoolBackward(const DLArrayHandle input, 224 | const DLArrayHandle output_grads, 225 | const DLArrayHandle output, 226 | const int pooling_height, 227 | const int pooling_width, 228 | const int stride_height, 229 | const int stride_width, 230 | const char *mode, 231 | DLArrayHandle pool_grad); 232 | 233 | int cudnnConv2DBackwardFilter(const DLArrayHandle input, 234 | const DLArrayHandle output_grads, 235 | const int stride_height, 236 | const int stride_width, 237 | const int padding_height, 238 | const int padding_width, 239 | DLArrayHandle filter_grad); 240 | 241 | int cudnnConv2DBackwardData(const DLArrayHandle filter, 242 | const DLArrayHandle output_grads, 243 | const int stride_height, 244 | const int stride_width, 245 | const int padding_height, 246 | const int padding_width, 247 | DLArrayHandle data_grad); 248 | 249 | int cudnnConv2DBackwardBias(const DLArrayHandle output_grads, 250 | DLArrayHandle bias_grads); 251 | 252 | } // DLSYS_EXTERN_C 253 | 254 | #endif // DLSYS_RUNTIME_C_RUNTIME_API_H_ 255 | -------------------------------------------------------------------------------- /cuda/src/cpu_device_api.cc: -------------------------------------------------------------------------------- 1 | /*! 
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cpu_device_api.cc
4 |  */
5 | #include "./cpu_device_api.h"
6 | #include <cstdlib>
7 | #include <cstring>
8 | #include <iostream>
9 |
10 | namespace dlsys {
11 | namespace runtime {
12 |
13 | void *CPUDeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
14 |                                    size_t alignment) {
15 |     // std::cout << "allocating cpu data" << std::endl;
16 |     void *ptr;
17 |     int ret = posix_memalign(&ptr, alignment, size);
18 |     if (ret != 0)
19 |         throw std::bad_alloc();
20 |     return ptr;
21 | }
22 |
23 | void CPUDeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) { free(ptr); }
24 |
25 | void CPUDeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
26 |                                   DLContext ctx_from, DLContext ctx_to,
27 |                                   DLStreamHandle stream) {
28 |     // std::cout << "copying cpu data" << std::endl;
29 |     memcpy(to, from, size);
30 | }
31 |
32 | void CPUDeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {}
33 |
34 | } // namespace runtime
35 | } // namespace dlsys
36 |
--------------------------------------------------------------------------------
/cuda/src/cpu_device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cpu_device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_CPU_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_CPU_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include "device_api.h"
11 | #include <cstring>
12 | #include <string>
13 |
14 | namespace dlsys {
15 | namespace runtime {
16 |
17 | class CPUDeviceAPI : public DeviceAPI {
18 | public:
19 |     void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
20 |
21 |     void FreeDataSpace(DLContext ctx, void *ptr) final;
22 |
23 |     void CopyDataFromTo(const void *from, void *to, size_t size,
24 |                         DLContext ctx_from, DLContext ctx_to, DLStreamHandle stream) final;
25 |
26 |     void StreamSync(DLContext ctx, DLStreamHandle stream) final;
27 | };
28 |
29 | } // namespace runtime
30 | } // namespace dlsys
31 | #endif // DLSYS_RUNTIME_CPU_DEVICE_API_H_
32 |
--------------------------------------------------------------------------------
/cuda/src/cuda_device_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cuda_device_api.cc
4 |  * \brief GPU specific API
5 |  */
6 |
7 | #include "./cuda_device_api.h"
8 | #include <cassert>
9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 |
12 | #define CUDA_CALL(func)                                                    \
13 |     {                                                                      \
14 |         cudaError_t e = (func);                                           \
15 |         assert((e == cudaSuccess) || (e == cudaErrorCudartUnloading));    \
16 |     }
17 |
18 | namespace dlsys {
19 | namespace runtime {
20 |
21 | static void GPUCopy(const void *from, void *to, size_t size,
22 |                     cudaMemcpyKind kind, cudaStream_t stream) {
23 |     if (stream != 0) {
24 |         CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
25 |     } else {
26 |         CUDA_CALL(cudaMemcpy(to, from, size, kind));
27 |     }
28 | }
29 |
30 | void *CUDADeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
31 |                                     size_t alignment) {
32 |     // std::cout << "allocating cuda data" << std::endl;
33 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
34 |     assert((256 % alignment) == 0U); // CUDA space is aligned at 256 bytes
35 |     void *ret;
36 |     CUDA_CALL(cudaMalloc(&ret, size));
37 |     return ret;
38 | }
39 |
40 | void CUDADeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) {
41 |     // std::cout << "releasing cuda data" << std::endl;
42 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
43 |     CUDA_CALL(cudaFree(ptr));
44 | }
45 |
46 | void CUDADeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
47 |                                    DLContext ctx_from, DLContext ctx_to, DLStreamHandle stream) {
48 |     // std::cout << "copying cuda data" << std::endl;
49 |     cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
50 |     if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
51 |         CUDA_CALL(cudaSetDevice(ctx_from.device_id));
52 |         if (ctx_from.device_id == ctx_to.device_id) {
53 |             GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
54 |         } else {
55 |             CUDA_CALL(cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id,
56 |                                           size, cu_stream));
57 |         }
58 |     } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
59 |         CUDA_CALL(cudaSetDevice(ctx_from.device_id));
60 |         GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
61 |     } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
62 |         CUDA_CALL(cudaSetDevice(ctx_to.device_id));
63 |         GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
64 |     } else {
65 |         std::cerr << "expect copy from/to GPU or between GPU" << std::endl;
66 |     }
67 | }
68 |
69 | void CUDADeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {
70 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
71 |     CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
72 | }
73 |
74 | } // namespace runtime
75 | } // namespace dlsys
76 |
--------------------------------------------------------------------------------
/cuda/src/cuda_device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cuda_device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_CUDA_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_CUDA_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include "device_api.h"
11 | #include <cuda_runtime.h>
12 |
13 | #include <memory>
14 | #include <string>
15 |
16 | namespace dlsys {
17 | namespace runtime {
18 |
19 | class CUDADeviceAPI : public DeviceAPI {
20 | public:
21 |     void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
22 |
23 |     void FreeDataSpace(DLContext ctx, void *ptr) final;
24 |
25 |     void CopyDataFromTo(const void *from, void *to, size_t size,
26 |                         DLContext ctx_from, DLContext ctx_to,
27 |                         DLStreamHandle stream) final;
28 |
29 |     void StreamSync(DLContext ctx, DLStreamHandle stream) final;
30 | };
31 |
32 | } // namespace runtime
33 | } // namespace dlsys
34 | #endif // DLSYS_RUNTIME_CUDA_DEVICE_API_H_
35 |
--------------------------------------------------------------------------------
/cuda/src/device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include <cassert>
11 | #include <string>
12 |
13 | namespace dlsys {
14 | namespace runtime {
15 |
16 | class DeviceAPI {
17 | public:
18 |     /*! \brief virtual destructor */
19 |     virtual ~DeviceAPI() {}
20 |
21 |     /*!
22 |      * \brief Allocate a data space on device.
23 |      * \param ctx The device context to perform operation.
24 |      * \param size The size of the memory
25 |      * \param alignment The alignment of the memory.
26 |      * \return The allocated device pointer
27 |      */
28 |     virtual void *AllocDataSpace(DLContext ctx, size_t size,
29 |                                  size_t alignment) = 0;
30 |
31 |     /*!
32 |      * \brief Free a data space on device.
33 |      * \param ctx The device context to perform operation.
34 |      * \param ptr The data space.
35 |      */
36 |     virtual void FreeDataSpace(DLContext ctx, void *ptr) = 0;
37 |
38 |     /*!
39 |      * \brief copy data from one place to another
40 |      * \param from The source array.
41 |      * \param to The target array.
42 |      * \param size The size of the memory
43 |      * \param ctx_from The source context
44 |      * \param ctx_to The target context
45 |      * \param stream The stream where the copy happens, can be NULL
46 |      */
47 |     virtual void CopyDataFromTo(const void *from, void *to, size_t size,
48 |                                 DLContext ctx_from, DLContext ctx_to,
49 |                                 DLStreamHandle stream) = 0;
50 |
51 |     /*!
52 |      * \brief Synchronize the stream
53 |      * \param ctx The context to perform operation.
54 |      * \param stream The stream to be synchronized.
55 |      */
56 |     virtual void StreamSync(DLContext ctx, DLStreamHandle stream) = 0;
57 | };
58 |
59 | } // namespace runtime
60 | } // namespace dlsys
61 | #endif // DLSYS_RUNTIME_DEVICE_API_H_
62 |
--------------------------------------------------------------------------------
/cuda/src/dlarray.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file dlarray.h
4 |  * \brief Header that defines array struct.
5 |  */
6 | #ifndef DLSYS_H_
7 | #define DLSYS_H_
8 |
9 | #ifdef __cplusplus
10 | #define DLSYS_EXTERN_C extern "C"
11 | #else
12 | #define DLSYS_EXTERN_C
13 | #endif
14 |
15 | #include <stddef.h>
16 | #include <stdint.h>
17 |
18 | DLSYS_EXTERN_C {
19 | /*!
20 |  * \brief The device type in DLContext.
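 *  Note: kCPU (= 1) and kGPU (= 2) also serve as indices into
 *  DeviceAPIManager::api_ in c_runtime_api.cc, so they must stay below
 *  kMaxDeviceAPI.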
21 |  */
22 | typedef enum {
23 |     kCPU = 1,
24 |     kGPU = 2,
25 | } DLDeviceType;
26 |
27 | /*!
28 |  * \brief A Device context for array.
29 |  */
30 | typedef struct {
31 |     /*! \brief The device index */
32 |     int device_id;
33 |     /*! \brief The device type used in the device. */
34 |     DLDeviceType device_type;
35 | } DLContext;
36 |
37 | /*!
38 |  * \brief Plain C Array object, does not manage memory.
39 |  */
40 | typedef struct {
41 |     /*!
42 |      * \brief The opaque data pointer points to the allocated data.
43 |      *  This will be CUDA device pointer or cl_mem handle in OpenCL.
44 |      *  This pointer is always aligned to 256 bytes as in CUDA.
45 |      */
46 |     void *data;
47 |     /*! \brief The device context of the tensor */
48 |     DLContext ctx;
49 |     /*! \brief Number of dimensions */
50 |     int ndim;
51 |     /*! \brief The shape of the tensor */
52 |     int64_t *shape;
53 | } DLArray;
54 |
55 | } // DLSYS_EXTERN_C
56 | #endif // DLSYS_H_
57 |
--------------------------------------------------------------------------------
/cuda/src/gpu_op.cu:
--------------------------------------------------------------------------------
1 | #include "./c_runtime_api.h"
2 | #include <cassert>
3 | #include <cstdio>
4 | #include <cublas_v2.h>
5 | #include <cuda_runtime.h>
6 | #include <math.h>
7 |
8 | /* TODO: Your code here */
9 | /* all your GPU kernel code, e.g. matrix_softmax_cross_entropy_kernel */
10 |
11 | // y = inputs[0], y_ = inputs[1]
12 | // np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True)
13 | __global__ void matrix_softmax_cross_entropy_kernel(int nrow, int ncol,
14 |                                                     const float *input_a, const float *input_b, float *output) {
15 |     // Dynamic shared memory, size provided at kernel launch.
16 |     extern __shared__ float loss_per_row[];
17 |     // Two dimensional thread blocks.
18 |     int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x
19 |             + threadIdx.x;
20 |     if (y >= nrow) {
21 |         return;
22 |     }
23 |     input_a += y * ncol;
24 |     input_b += y * ncol;
25 |     float maxval = *input_a;
26 |     // Find max for a row.
27 |     for (int x = 1; x < ncol; ++x) {
28 |         maxval = max(maxval, input_a[x]);
29 |     }
30 |     // Subtract the row max, and raise to exp.
31 |     float sum = 0;
32 |     for (int x = 0; x < ncol; ++x) {
33 |         sum += exp(input_a[x] - maxval);
34 |     }
35 |     // Compute per-row loss.
36 |     float loss = 0;
37 |     for (int x = 0; x < ncol; ++x) {
38 |         loss -= input_b[x] * log(exp(input_a[x] - maxval) / sum);
39 |     }
40 |     loss_per_row[y] = loss;
41 |     __syncthreads();
42 |     // Compute reduce_mean across rows.
43 |     float mean_loss = 0;
44 |     // Use a single thread to reduce mean across rows.
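    // Correctness note: this serial reduction works only because the kernel is
    // launched from DLGpuSoftmaxCrossEntropy with a single block, so the
    // __syncthreads() above has already published every loss_per_row entry to
    // thread (0, 0). A rough sketch of a parallel alternative (assuming the
    // early return above is removed so all threads reach the barrier, and nrow
    // is a power of two) would be a shared-memory tree reduction:
    //     for (int s = nrow / 2; s > 0; s >>= 1) {
    //         if (y < s) loss_per_row[y] += loss_per_row[y + s];
    //         __syncthreads();
    //     }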
45 | if ((threadIdx.x == 0) && (threadIdx.y == 0)) { 46 | for (int i = 0; i < nrow; ++i) { 47 | mean_loss += loss_per_row[i]; 48 | } 49 | mean_loss /= nrow; 50 | output[0] = mean_loss; 51 | } 52 | } 53 | 54 | 55 | __global__ void array_set_kernel(float *array, float value, int n) { 56 | int index = blockIdx.x * blockDim.x + threadIdx.x; 57 | if (index < n) { 58 | array[index] = value; 59 | } 60 | } 61 | 62 | 63 | int DLGpuArraySet(DLArrayHandle arr, float value) { /* TODO: Your code here */ 64 | int n = 1; 65 | for (int i = 0; i < arr->ndim; i++) { 66 | n = n * arr->shape[i]; 67 | } 68 | 69 | float *array_data = (float *) arr->data; 70 | 71 | int threads_per_block = 1024; 72 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 73 | 74 | array_set_kernel << < num_blocks, threads_per_block >> > (array_data, value, n); 75 | return 0; 76 | } 77 | 78 | 79 | __global__ void broadcast_to_kernel(const float *input_data, 80 | float *output_data, 81 | index_t input_n, 82 | index_t output_n) { 83 | index_t idx = blockDim.x * blockIdx.x + threadIdx.x; 84 | if (idx < output_n) { 85 | output_data[idx] = input_data[idx % input_n]; 86 | } 87 | } 88 | 89 | 90 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output) { 91 | /* TODO: Your code here */ 92 | index_t input_n = 1; 93 | for (int i = 0; i < input->ndim; i++) 94 | input_n *= input->shape[i]; 95 | 96 | index_t output_n = 1; 97 | for (int i = 0; i < output->ndim; i++) 98 | output_n *= output->shape[i]; 99 | 100 | const float *input_data = (const float *) input->data; 101 | float *output_data = (float *) output->data; 102 | 103 | int thread_per_block = 512; 104 | int n_blocks = (output_n + thread_per_block - 1) / thread_per_block; 105 | broadcast_to_kernel << < n_blocks, thread_per_block >> > (input_data, output_data, 106 | input_n, output_n); 107 | return 0; 108 | } 109 | 110 | __global__ void reduced_sum_axis_zero(const float *input_data, float *output_data, int input_n, int output_n) { 111 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 112 | if (idx < output_n) { 113 | output_data[idx] = 0.0; 114 | for (int i = 0; i < input_n / output_n; i++) { 115 | output_data[idx] += input_data[i * output_n + idx]; 116 | } 117 | } 118 | } 119 | 120 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output) { 121 | /* TODO: Your code here */ 122 | int input_n = 1; 123 | for (int i = 0; i < input->ndim; i++) { 124 | input_n *= input->shape[i]; 125 | } 126 | 127 | int output_n = 1; 128 | for (int i = 0; i < output->ndim; i++) { 129 | output_n *= output->shape[i]; 130 | } 131 | 132 | const float *input_data = (const float *) input->data; 133 | float *output_data = (float *) output->data; 134 | 135 | int thread_per_block = 1024; 136 | int n_blocks = (output_n + thread_per_block - 1) / thread_per_block; 137 | 138 | reduced_sum_axis_zero << < n_blocks, thread_per_block >> > (input_data, output_data, input_n, output_n); 139 | return 0; 140 | } 141 | 142 | __global__ void matrix_elementwise_add(const float *a, const float *b, float *c, 143 | int n) { 144 | int index = blockIdx.x * blockDim.x + threadIdx.x; 145 | if (index < n) { 146 | c[index] = a[index] + b[index]; 147 | } 148 | } 149 | 150 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 151 | const DLArrayHandle matB, DLArrayHandle output) { 152 | /* TODO: Your code here */ 153 | int n = 1; 154 | for (int i = 0; i < output->ndim; i++) { 155 | n = n * output->shape[i]; 156 | } 157 | const float *data_A = (const float *) matA->data; 158 | const float *data_B = 
(const float *) matB->data; 159 | float *data_output = (float *) output->data; 160 | 161 | int threads_per_block = 1024; 162 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 163 | 164 | matrix_elementwise_add << < num_blocks, threads_per_block >> > (data_A, data_B, 165 | data_output, n); 166 | return 0; 167 | } 168 | 169 | __global__ 170 | void matrix_elementwise_subtract(const float *a, const float *b, float *c, 171 | int n) { 172 | int index = blockIdx.x * blockDim.x + threadIdx.x; 173 | if (index < n) { 174 | c[index] = a[index] - b[index]; 175 | } 176 | } 177 | 178 | int DLGpuMatrixElementwiseSubtract(const DLArrayHandle matA, 179 | const DLArrayHandle matB, DLArrayHandle output) { 180 | /* TODO: Your code here */ 181 | int n = 1; 182 | for (int i = 0; i < output->ndim; i++) { 183 | n = n * output->shape[i]; 184 | } 185 | const float *data_A = (const float *) matA->data; 186 | const float *data_B = (const float *) matB->data; 187 | float *data_output = (float *) output->data; 188 | 189 | int threads_per_block = 1024; 190 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 191 | 192 | matrix_elementwise_subtract << < num_blocks, threads_per_block >> > (data_A, data_B, 193 | data_output, n); 194 | return 0; 195 | } 196 | 197 | __global__ 198 | void matrix_elementwise_division(const float *a, const float *b, float *result, int n) { 199 | int index = blockIdx.x * blockDim.x + threadIdx.x; 200 | if (index < n) { 201 | result[index] = a[index] / b[index]; 202 | } 203 | } 204 | 205 | int DLGpuMatrixElementwiseDiv(const DLArrayHandle matA, const DLArrayHandle matB, 206 | DLArrayHandle output) { 207 | int n = 1; 208 | for (int i = 0; i < output->ndim; i++) { 209 | n = n * output->shape[i]; 210 | } 211 | const float *data_A = (const float *) matA->data; 212 | const float *data_B = (const float *) matB->data; 213 | float *data_output = (float *) output->data; 214 | 215 | int threads_per_block = 1024; 216 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 217 | 218 | matrix_elementwise_division << < num_blocks, threads_per_block >> > (data_A, data_B, 219 | data_output, n); 220 | return 0; 221 | 222 | } 223 | 224 | __global__ void matrix_elementwise_add_by_const_kernal(const float *d_in, 225 | float *d_out, float val, int n) { 226 | int index = blockIdx.x * blockDim.x + threadIdx.x; 227 | if (index < n) { 228 | d_out[index] = d_in[index] + val; 229 | } 230 | } 231 | 232 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 233 | DLArrayHandle output) { 234 | /* TODO: Your code here */ 235 | int n = 1; 236 | for (int i = 0; i < output->ndim; i++) { 237 | n = n * output->shape[i]; 238 | } 239 | const float *input_data = (const float *) input->data; 240 | float *output_data = (float *) output->data; 241 | int threads_per_block = 1024; 242 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 243 | matrix_elementwise_add_by_const_kernal << < num_blocks, threads_per_block >> > ( 244 | input_data, output_data, val, n); 245 | return 0; 246 | } 247 | 248 | __global__ 249 | void matrix_elementwise_subtract_by_const_kernal(const float *d_in, 250 | float *d_out, float val, int n) { 251 | int index = blockIdx.x * blockDim.x + threadIdx.x; 252 | if (index < n) { 253 | d_out[index] = d_in[index] - val; 254 | } 255 | } 256 | 257 | int DLGpuMatrixElementwiseSubtractByConst(const DLArrayHandle input, float val, 258 | DLArrayHandle output) { 259 | /* TODO: Your code here */ 260 | int n = 1; 261 | for (int i = 0; i < 
output->ndim; i++) { 262 | n = n * output->shape[i]; 263 | } 264 | const float *input_data = (const float *) input->data; 265 | float *output_data = (float *) output->data; 266 | int threads_per_block = 1024; 267 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 268 | matrix_elementwise_subtract_by_const_kernal << < num_blocks, threads_per_block >> > ( 269 | input_data, output_data, val, n); 270 | return 0; 271 | } 272 | 273 | 274 | __global__ void matrix_elementwise_div_by_const_kernal(const float *d_in, 275 | float *d_out, float val, int n) { 276 | int index = blockIdx.x * blockDim.x + threadIdx.x; 277 | if (index < n) { 278 | d_out[index] = d_in[index] / val; 279 | } 280 | } 281 | 282 | int DLGpuMatrixElementwiseDivByConst(const DLArrayHandle input, float val, 283 | DLArrayHandle output) { 284 | /* TODO: Your code here */ 285 | int n = 1; 286 | for (int i = 0; i < output->ndim; i++) { 287 | n = n * output->shape[i]; 288 | } 289 | const float *input_data = (const float *) input->data; 290 | float *output_data = (float *) output->data; 291 | int threads_per_block = 1024; 292 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 293 | matrix_elementwise_div_by_const_kernal << < num_blocks, threads_per_block >> > ( 294 | input_data, output_data, val, n); 295 | return 0; 296 | } 297 | 298 | 299 | __global__ void elementwise_mul_kernel(const float *data_a, const float *data_b, 300 | float *output, int n) { 301 | 302 | int index = blockDim.x * blockIdx.x + threadIdx.x; 303 | if (index < n) { 304 | output[index] = data_a[index] * data_b[index]; 305 | } 306 | } 307 | 308 | int DLGpuMatrixElementwiseMultiply(const DLArrayHandle matA, 309 | const DLArrayHandle matB, DLArrayHandle output) { 310 | /* TODO: Your code here */ 311 | int n = 1; 312 | for (int i = 0; i < output->ndim; i++) { 313 | n = n * output->shape[i]; 314 | } 315 | 316 | int threads_per_block = 1024; 317 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 318 | 319 | const float *mat_a_data = (const float *) matA->data; 320 | const float *mat_b_data = (const float *) matB->data; 321 | float *output_data = (float *) output->data; 322 | 323 | elementwise_mul_kernel << < num_blocks, threads_per_block >> > (mat_a_data, 324 | mat_b_data, output_data, n); 325 | 326 | return 0; 327 | } 328 | 329 | __global__ 330 | void matrix_elementwise_sqrt(const float *d_input, float *d_output, int n) { 331 | int index = blockDim.x * blockIdx.x + threadIdx.x; 332 | if (index < n) { 333 | d_output[index] = sqrt(d_input[index]); 334 | } 335 | } 336 | 337 | int DLGpuMatrixElementwiseSqrt(const DLArrayHandle input, DLArrayHandle output) { 338 | /* TODO: Your code here */ 339 | int n = 1; 340 | for (int i = 0; i < input->ndim; i++) { 341 | n *= input->shape[i]; 342 | } 343 | 344 | const float *input_data = (const float *) input->data; 345 | float *output_data = (float *) output->data; 346 | int threads_per_block = 1024; 347 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 348 | matrix_elementwise_sqrt << < num_blocks, threads_per_block >> > (input_data, output_data, n); 349 | return 0; 350 | } 351 | 352 | 353 | __global__ void marix_multiply_by_const(const float *d_input, float *d_output, 354 | float val, int n) { 355 | int index = blockDim.x * blockIdx.x + threadIdx.x; 356 | if (index < n) { 357 | d_output[index] = d_input[index] * val; 358 | } 359 | } 360 | 361 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 362 | DLArrayHandle output) { 363 | /* TODO: Your code here 
*/ 364 | int n = 1; 365 | for (int i = 0; i < input->ndim; i++) { 366 | n *= input->shape[i]; 367 | } 368 | 369 | const float *input_data = (const float *) input->data; 370 | float *output_data = (float *) output->data; 371 | int threads_per_block = 1024; 372 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 373 | marix_multiply_by_const << < num_blocks, threads_per_block >> > (input_data, 374 | output_data, val, n); 375 | return 0; 376 | } 377 | 378 | // int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 379 | // const DLArrayHandle matB, bool transposeB, DLArrayHandle matC) { 380 | // /* TODO: Your code here */ 381 | // // Hint: use cublas 382 | // // cublas assume matrix is column major 383 | // cublasHandle_t handle; 384 | // cublasStatus_t stat = cublasCreate(&handle); 385 | // if (stat != CUBLAS_STATUS_SUCCESS) 386 | // printf("CUBLAS initialization failed\n"); 387 | 388 | // const float *matA_data = (const float *) matA->data; 389 | // const float *matB_data = (const float *) matB->data; 390 | // float *matC_data = (float *) matC->data; 391 | 392 | // cublasOperation_t transa = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; 393 | // cublasOperation_t transb = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; 394 | 395 | // int m = transposeB ? matB->shape[0] : matB->shape[1]; 396 | // int n = transposeA ? matA->shape[1] : matA->shape[0]; 397 | // int k = transposeA ? matA->shape[0] : matA->shape[1]; 398 | 399 | // float alpha = 1.0f; 400 | // float beta = 0.0f; 401 | // stat = cublasSgemm(handle, transb, transa, 402 | // m, n, k, 403 | // &alpha, matB_data, matB->shape[1], 404 | // matA_data, matA->shape[1], 405 | // &beta, matC_data, m); 406 | 407 | // if (stat != CUBLAS_STATUS_SUCCESS) 408 | // printf("CUBLAS kernel execution error.\n"); 409 | 410 | // stat = cublasDestroy(handle); 411 | // if (stat != CUBLAS_STATUS_SUCCESS) 412 | // printf("CUBLAS shutdown error\n"); 413 | 414 | // return 0; 415 | // } 416 | cublasHandle_t cublas_handle = NULL; 417 | 418 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 419 | const DLArrayHandle matB, bool transposeB, 420 | DLArrayHandle matC) { 421 | /* TODO: Your code here */ 422 | // Hint: use cublas 423 | // cublas assume matrix is column major 424 | // op(A) * op(B) = C 425 | // op(B)T * op(A)T = CT 426 | 427 | if (!cublas_handle) { 428 | cublasCreate(&cublas_handle); 429 | } 430 | 431 | float one = 1.0f; 432 | float zero = 0.0f; 433 | int m = matC->shape[1]; 434 | int n = matC->shape[0]; 435 | int k = transposeA ? matA->shape[0] : matA->shape[1]; 436 | 437 | cublasSgemm(cublas_handle, 438 | transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, 439 | transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, 440 | m, n, k, 441 | &one, 442 | (const float *) matB->data, !transposeB ? m : k, 443 | (const float *) matA->data, !transposeA ? 
k : n, 444 | &zero, 445 | (float *) matC->data, m 446 | ); 447 | return 0; 448 | } 449 | 450 | __global__ void relu_kernel(const float *input, float *output, int n) { 451 | int index = blockDim.x * blockIdx.x + threadIdx.x; 452 | if (index < n) { 453 | float element = input[index]; 454 | if (element <= 0) { 455 | output[index] = 0; 456 | } else { 457 | output[index] = element; 458 | } 459 | } 460 | } 461 | 462 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output) { 463 | /* TODO: Your code here */ 464 | int n = 1; 465 | for (int i = 0; i < input->ndim; i++) { 466 | n *= input->shape[i]; 467 | } 468 | 469 | const float *input_data = (const float *) input->data; 470 | float *output_data = (float *) output->data; 471 | int threads_per_block = 1024; 472 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 473 | relu_kernel << < num_blocks, threads_per_block >> > (input_data, output_data, n); 474 | return 0; 475 | } 476 | 477 | __global__ void relu_gradient_kernel(const float *input, float *output, 478 | const float *in_grad, int n) { 479 | int index = blockDim.x * blockIdx.x + threadIdx.x; 480 | if (index < n) { 481 | float element = input[index]; 482 | if (element <= 0) { 483 | output[index] = 0; 484 | } else { 485 | output[index] = in_grad[index]; 486 | } 487 | } 488 | } 489 | 490 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 491 | DLArrayHandle output) { 492 | /* TODO: Your code here */ 493 | int n = 1; 494 | for (int i = 0; i < input->ndim; i++) { 495 | n *= input->shape[i]; 496 | } 497 | 498 | const float *input_data = (const float *) input->data; 499 | float *output_data = (float *) output->data; 500 | const float *in_grad_data = (const float *) in_grad->data; 501 | int threads_per_block = 1024; 502 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 503 | 504 | relu_gradient_kernel << < num_blocks, threads_per_block >> > (input_data, 505 | output_data, in_grad_data, n); 506 | return 0; 507 | } 508 | 509 | __global__ void softmax_kernel(int64_t nrow, int64_t ncol, 510 | const float *input_data, 511 | float *output_data) { 512 | 513 | // two dimensional thread blocks. 514 | int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 515 | if (y >= nrow) { 516 | return; 517 | } 518 | // y_th row of input data 519 | input_data += y * ncol; 520 | output_data += y * ncol; 521 | // find max for a row. 522 | float maxval = *input_data; 523 | for (int x = 1; x < ncol; ++x) { 524 | maxval = max(maxval, input_data[x]); 525 | } 526 | // Deduct by max for a row, and raise to exp. 527 | // in case of too large of exp, and the result will not be affected 528 | float sum = 0; 529 | for (int x = 0; x < ncol; ++x) { 530 | sum += exp(input_data[x] - maxval); 531 | } 532 | // Compute per-row softmax. 
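    // softmax(x)_j = exp(x_j) / sum_k exp(x_k); shifting every exponent by the
    // row max leaves each ratio unchanged but keeps exp() from overflowing.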
533 | for (int x = 0; x < ncol; ++x) { 534 | output_data[x] = exp(input_data[x] - maxval) / sum; 535 | } 536 | } 537 | 538 | 539 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output) { 540 | /* TODO: Your code here */ 541 | assert(input->ndim == 2); 542 | assert(output->ndim == 2); 543 | int64_t nrow = input->shape[0]; 544 | int64_t ncol = input->shape[1]; 545 | float *input_data = (float *) input->data; 546 | float *output_data = (float *) output->data; 547 | dim3 threads; 548 | if (nrow < 1024) { 549 | threads.x = nrow; 550 | } else { 551 | threads.x = 1024; 552 | threads.y = (nrow + 1023) / 1024; 553 | } 554 | softmax_kernel << < 1, threads >> > (nrow, ncol, input_data, output_data); 555 | return 0; 556 | } 557 | 558 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 559 | const DLArrayHandle input_b, DLArrayHandle output) { 560 | assert(input_a->ndim == 2); 561 | assert(input_b->ndim == 2); 562 | assert(output->ndim == 1); 563 | assert( 564 | input_a->shape[0] == input_b->shape[0] 565 | && input_a->shape[1] == input_b->shape[1]); 566 | int nrow = input_a->shape[0]; 567 | // Maximum x- or y-dimension of a block = 1024 568 | // But we need 'nrow' shared memory, and max shared memory is 48KB. 569 | // Conservatively allow max 16KB shared memory. 570 | assert(nrow <= 1024 * 4); 571 | int ncol = input_a->shape[1]; 572 | const float *input_data_a = (const float *) input_a->data; 573 | const float *input_data_b = (const float *) input_b->data; 574 | float *output_data = (float *) output->data; 575 | dim3 threads; 576 | if (nrow <= 1024) { 577 | threads.x = nrow; 578 | } else { 579 | threads.x = 1024; 580 | threads.y = (nrow + 1023) / 1024; 581 | } 582 | // 1 block, each block with 'threads' number of threads with 'nrow' shared 583 | // memory size 584 | matrix_softmax_cross_entropy_kernel << < 1, threads, nrow * sizeof(float) >> > ( 585 | nrow, ncol, input_data_a, input_data_b, output_data); 586 | return 0; 587 | } 588 | -------------------------------------------------------------------------------- /cuda/src/runtime_base.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * \file runtime_base.h 4 | * \brief Base of all C APIs 5 | */ 6 | #ifndef DLSYS_RUNTIME_RUNTIME_BASE_H_ 7 | #define DLSYS_RUNTIME_RUNTIME_BASE_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include 11 | 12 | /*! \brief macro to guard beginning and end section of all functions */ 13 | #define API_BEGIN() try { 14 | /*! 15 | * \brief every function starts with API_BEGIN(), and finishes with API_END() 16 | * or API_END_HANDLE_ERROR 17 | */ 18 | #define API_END() \ 19 | } \ 20 | catch (std::runtime_error & _except_) { \ 21 | return DLSYSAPIHandleException(_except_); \ 22 | } \ 23 | return 0; 24 | 25 | /*! 26 | * \brief every function starts with API_BEGIN() and finishes with API_END() or 27 | * API_END_HANDLE_ERROR. The finally clause contains procedure to cleanup states 28 | * when an error happens. 29 | */ 30 | #define API_END_HANDLE_ERROR(Finalize) \ 31 | } \ 32 | catch (std::runtime_error & _except_) { \ 33 | Finalize; \ 34 | return DLSYSAPIHandleException(_except_); \ 35 | } \ 36 | return 0; 37 | 38 | /*! 
39 | * \brief handle exception throwed out 40 | * \param e the exception 41 | * \return the return value of API after exception is handled 42 | */ 43 | inline int DLSYSAPIHandleException(const std::runtime_error &e) { 44 | // TODO 45 | // TVMAPISetLastError(e.what()); 46 | return -1; 47 | } 48 | 49 | #endif // DLSYS_RUNTIME_RUNTIME_BASE_H_ 50 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/examples/__init__.py -------------------------------------------------------------------------------- /examples/mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import aurora as au 3 | import aurora.autodiff as ad 4 | import timeit 5 | import argparse 6 | 7 | 8 | def measure_accuracy(activation, data, use_gpu=False): 9 | X_val, y_val = data 10 | executor = ad.Executor([activation], use_gpu=use_gpu) 11 | prob_val, = executor.run(feed_shapes={X: X_val}) 12 | if use_gpu: 13 | prob_val = prob_val.asnumpy() 14 | 15 | correct = np.sum(np.equal(y_val, np.argmax(prob_val, axis=1))) 16 | percentage = (correct / (y_val.shape[0])) * 100.00 17 | return percentage 18 | 19 | 20 | def build_graph(X, y, input_size, hid_1_size, hid_2_size, output_size): 21 | # Parameter of the model 22 | rand = np.random.RandomState(seed=1024) 23 | W1 = ad.Parameter(name="W1", init=rand.normal(scale=0.1, size=(input_size, hid_1_size))) 24 | b1 = ad.Parameter(name="b1", init=rand.normal(scale=0.1, size=(hid_1_size))) 25 | 26 | W2 = ad.Parameter(name="W2", init=rand.normal(scale=0.1, size=(hid_1_size, hid_2_size))) 27 | b2 = ad.Parameter(name="b2", init=rand.normal(scale=0.1, size=(hid_2_size))) 28 | 29 | W3 = ad.Parameter(name="W3", init=rand.normal(scale=0.1, size=(hid_2_size, output_size))) 30 | b3 = ad.Parameter(name="b3", init=rand.normal(scale=0.1, size=(output_size))) 31 | 32 | # building the NN model 33 | z1 = ad.matmul(X, W1) 34 | hidden_1 = z1 + ad.broadcast_to(b1, z1) 35 | activation_1 = au.nn.relu(hidden_1) 36 | 37 | z2 = ad.matmul(activation_1, W2) 38 | hidden_2 = z2 + ad.broadcast_to(b2, z2) 39 | activation_2 = au.nn.relu(hidden_2) 40 | 41 | z3 = ad.matmul(activation_2, W3) 42 | hidden_3 = z3 + ad.broadcast_to(b3, z3) 43 | loss = au.nn.softmax_cross_entropy_with_logits(hidden_3, y) 44 | return loss, W1, b1, W2, b2, W3, b3, hidden_3 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('-c', '--exe_context', 50 | help='Choose execution context: numpy, gpu', 51 | default='numpy') 52 | 53 | parser.add_argument('-i', '--num_iter', 54 | help='Choose number of iterations', 55 | default=500) 56 | 57 | args = parser.parse_args() 58 | 59 | use_gpu = False 60 | 61 | if args.exe_context == 'gpu': 62 | use_gpu = True 63 | n_iter = int(args.num_iter) 64 | 65 | start = timeit.default_timer() 66 | # Create an instance of MNIST dataset and 67 | # create a generator for reading training data 68 | data = au.datasets.MNIST(batch_size=128) 69 | batch_generator = data.train_batch_generator() 70 | 71 | input_size = data.num_features() # number of features 72 | hid_1_size = 256 # size of first hidden layer 73 | hid_2_size = 100 # size of the second hidden layer 74 | output_size = 10 # size of the output layer 75 | 76 | lr = 1e-3 # learning rate 77 | 78 | # X and y will be used to input data 79 | X = 
ad.Variable(name="X") 80 | y = ad.Variable(name='y') 81 | 82 | loss, W1, b1, W2, b2, W3, b3, logit = build_graph(X, y, input_size, hid_1_size, hid_2_size, output_size) 83 | # Using Adam optimizer 84 | # optimizer = au.optim.Adam(loss, params=[W1, b1, W2, b2, W3, b3], lr=lr) 85 | optimizer = au.optim.Adam(loss, params=[W1, b1, W2, b2, W3, b3], lr=lr, use_gpu=use_gpu) 86 | # Starts training 87 | for i in range(n_iter): 88 | # read next random batch from the training generator 89 | X_batch, y_batch = next(batch_generator) 90 | # run the optimizer and it will return the cost 91 | # after that iteration 92 | loss_now = optimizer.step(feed_dict={X: X_batch, y: y_batch}) 93 | if i <= 10 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 500 == 0): 94 | fmt_str = 'iter: {0:>5d} cost: {1:>8.5f}' 95 | print(fmt_str.format(i, loss_now[0])) 96 | 97 | # printing validation accuracy 98 | # TODO (upul) optimize hyper-parameters using validation dataset 99 | val_acc = measure_accuracy(logit, data.validation(), use_gpu=use_gpu) 100 | print('Validation accuracy: {:>.2f}'.format(val_acc)) 101 | 102 | # printing testing accuracy 103 | test_acc = measure_accuracy(logit, data.testing(), use_gpu=use_gpu) 104 | print('Testing accuracy: {:>.2f}'.format(test_acc)) 105 | 106 | end = timeit.default_timer() 107 | print('Time taken for training/testing: {0:.3f} seconds'.format(end - start)) 108 | -------------------------------------------------------------------------------- /examples/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | """Trains a simple convnet on the MNIST dataset. 2 | ===================================================================== 3 | Numpy: Gets 99.00 % test accuracy after 3000 iterations with 4 | 64 batch size. 5 | 6 | Running Time: 1197.57 seconds on Intel(R) Core(TM) i7-7700K 7 | CPU @ 4.20GHz 8 Cores. 
8 |
9 |
10 | GPU: Coming soon
11 | """
12 |
13 | import argparse
14 | import timeit
15 |
16 | import aurora as au
17 | import aurora.autodiff as ad
18 | import numpy as np
19 |
20 |
21 | def build_network(image, y, batch_size=32):
22 |     rand = np.random.RandomState(seed=1024)
23 |
24 |     reshaped_images = ad.reshape(image, newshape=(batch_size, 1, 28, 28))
25 |
26 |     # weight in (number_kernels, color_depth, kernel_height, kernel_width)
27 |     W1 = ad.Parameter(name='W1', init=rand.normal(scale=0.1, size=(32, 1, 5, 5)))
28 |     b1 = ad.Parameter(name='b1', init=rand.normal(scale=0.1, size=32))
29 |     conv1 = au.nn.conv2d(input=reshaped_images, filter=W1, bias=b1)
30 |     activation1 = au.nn.relu(conv1)
31 |     # size of activation1: batch_size x 32 x 24 x 24
32 |
33 |     # weight in (number_kernels, number_kernels of previous layer, kernel_height, kernel_width)
34 |     W2 = ad.Parameter(name='W2', init=rand.normal(scale=0.1, size=(64, 32, 5, 5)))
35 |     b2 = ad.Parameter(name='b2', init=rand.normal(scale=0.1, size=64))
36 |     conv2 = au.nn.conv2d(input=activation1, filter=W2, bias=b2)
37 |     activation2 = au.nn.relu(conv2)
38 |     # size of activation2: batch_size x 64 x 20 x 20
39 |
40 |     pooling1 = au.nn.maxPool(activation2, filter=(2, 2), strides=(2, 2))
41 |     # size of pooling1: batch_size x 64 x 10 x 10 = batch_size x 6400
42 |
43 |     flatten = ad.reshape(pooling1, newshape=(batch_size, 6400))
44 |
45 |     W3 = ad.Parameter(name='W3', init=rand.normal(scale=0.1, size=(6400, 512)))
46 |     b3 = ad.Parameter(name='b3', init=rand.normal(scale=0.1, size=512))
47 |     Z3 = ad.matmul(flatten, W3)
48 |     Z3 = Z3 + ad.broadcast_to(b3, Z3)
49 |     activation3 = au.nn.relu(Z3)
50 |
51 |     W4 = ad.Parameter(name='W4', init=rand.normal(scale=0.1, size=(512, 10)))
52 |     b4 = ad.Parameter(name='b4', init=rand.normal(scale=0.1, size=10))
53 |     logits = ad.matmul(activation3, W4)
54 |     logits = logits + ad.broadcast_to(b4, logits)
55 |     loss = au.nn.softmax_cross_entropy_with_logits(logits, y)
56 |
57 |     return loss, W1, b1, W2, b2, W3, b3, W4, b4, logits
58 |
59 |
60 | def measure_accuracy(activation, data, batch_size=32, use_gpu=False):
61 |     X_val, y_val = data
62 |
63 |     executor = ad.Executor([activation], use_gpu=use_gpu)
64 |
65 |     max_val = len(X_val) - len(X_val) % batch_size
66 |     y_val = y_val[0:max_val]
67 |
68 |     prediction = np.zeros(max_val)
69 |     for i in range(0, max_val, batch_size):
70 |         start = i
71 |         end = i + batch_size
72 |
73 |         X_batch, y_batch = X_val[start:end], y_val[start:end]
74 |         prob_val, = executor.run(feed_shapes={images: X_batch})
75 |
76 |         if use_gpu:
77 |             prob_val = prob_val.asnumpy()
78 |         prediction[start:end] = np.argmax(prob_val, axis=1)
79 |
80 |     correct = np.sum(np.equal(y_val, prediction))
81 |     percentage = (correct / len(prediction)) * 100.00
82 |     return percentage
83 |
84 |
85 | if __name__ == '__main__':
86 |     parser = argparse.ArgumentParser()
87 |     parser.add_argument('-c', '--exe_context',
88 |                         help='Choose execution context: numpy, gpu',
89 |                         default='numpy')
90 |
91 |     parser.add_argument('-i', '--num_iter',
92 |                         help='Choose number of iterations',
93 |                         default=500)
94 |
95 |     args = parser.parse_args()
96 |
97 |     use_gpu = False
98 |     if args.exe_context == 'gpu':
99 |         use_gpu = True
100 |
101 |     n_iter = int(args.num_iter)
102 |
103 |     start = timeit.default_timer()
104 |
105 |     data = au.datasets.MNIST(batch_size=128)
106 |     batch_generator = data.train_batch_generator()
107 |
108 |     # images in (batch_size, color_depth, height, width)
109 |     images = ad.Variable(name='images')
110 |     labels = ad.Variable(name='y')
111
| 112 | loss, W1, b1, W2, b2, W3, b3, W4, b4, logits = build_network(images, labels, batch_size=128) 113 | opt_params = [W1, b1, W2, b2, W3, b3, W4, b4] 114 | optimizer = au.optim.Adam(loss, params=opt_params, lr=1e-3, use_gpu=use_gpu) 115 | 116 | cumulative_loss = [] 117 | for i in range(n_iter): 118 | X_batch, y_batch = next(batch_generator) 119 | loss_now = optimizer.step(feed_dict={images: X_batch, labels: y_batch}) 120 | cumulative_loss.append(loss_now[0]) 121 | if i <= 10 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 500 == 0): 122 | fmt_str = 'iter: {0:>5d} avg. cost: {1:>8.5f}' 123 | print(fmt_str.format(i, sum(cumulative_loss)/len(cumulative_loss))) 124 | cumulative_loss.clear() 125 | 126 | # printing validation accuracy 127 | val_acc = measure_accuracy(logits, data.validation(), batch_size=128, use_gpu=use_gpu) 128 | print('Validation accuracy: {:>.2f}'.format(val_acc)) 129 | 130 | # printing testing accuracy 131 | test_acc = measure_accuracy(logits, data.testing(), batch_size=128, use_gpu=use_gpu) 132 | print('Testing accuracy: {:>.2f}'.format(test_acc)) 133 | 134 | end = timeit.default_timer() 135 | print('Time taken for training/testing: {0:.3f} seconds'.format(end - start)) 136 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.13.3 2 | matplotlib==2.1.0 3 | seaborn==0.8 4 | cytoolz==0.8.2 5 | 6 | -------------------------------------------------------------------------------- /resources/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/resources/logo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from distutils.core import setup 3 | from distutils.extension import Extension 4 | from Cython.Build import cythonize 5 | import numpy 6 | 7 | extensions = [ 8 | Extension('aurora.nn.pyx.im2col', ['aurora/nn/pyx/im2col.pyx'], 9 | include_dirs=[numpy.get_include()] 10 | ), 11 | Extension('aurora.nn.pyx.fast_pooling', ['aurora/nn/pyx/fast_pooling.pyx'], 12 | include_dirs=[numpy.get_include()] 13 | ), 14 | ] 15 | 16 | setup( 17 | name='aurora', 18 | version='0.01', 19 | description='Minimal Deep Learning library is written in Python/Numpy and a bit of C++', 20 | url='https://github.com/upul/Aurora', 21 | author='Upul Bandara', 22 | author_email='upulbandara@gmail.com', 23 | license='MIT', 24 | ext_modules=cythonize(extensions), 25 | packages=find_packages(exclude=['Aurora.tests']) 26 | 27 | ) 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/__init__.py -------------------------------------------------------------------------------- /tests/nn_primitives/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/nn_primitives/__init__.py -------------------------------------------------------------------------------- /tests/nn_primitives/test_cython.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.testing as npt 3 | from aurora.nn.pyx.fast_pooling import max_pool_forward 4 | from aurora.nn.pyx.fast_pooling import max_pool_backward 5 | from aurora.nn.pyx.im2col import im2col 6 | from aurora.nn.pyx.im2col import col2im 7 | from tests.utils.gradient_check import gradient_check_numpy_expr 8 | 9 | 10 | # Testing Max Pooling Layers 11 | 12 | def test_max_pooling_forward(): 13 | data = np.array([[[[0.12, -1.23, 0.01, 2.45], 14 | [5.00, -10.01, 1.09, 4.66], 15 | [4.56, 6.78, 3.45, 3.33], 16 | [0.01, 1.00, 3.56, 3.39]]]]) 17 | 18 | # Test Case: 1 19 | # filter = (2, 2) stride = (2, 2) 20 | result = max_pool_forward(data, 2, 2, 2, 2) 21 | expected = np.array([[[[5.00, 4.66], 22 | [6.78, 3.56]]]]) 23 | assert result.shape == expected.shape 24 | npt.assert_array_almost_equal(result, expected) 25 | 26 | # Test Case: 2 27 | # filter = (2, 2) stride = (1, 1) 28 | result = max_pool_forward(data, 2, 2, 1, 1) 29 | expected = np.array([[ 30 | [[5.00, 1.09, 4.66], 31 | [6.78, 6.78, 4.66], 32 | [6.78, 6.78, 3.56]]]]) 33 | assert result.shape == expected.shape 34 | npt.assert_array_almost_equal(expected, result) 35 | 36 | # Test Case: 3 37 | # filter = (2, 2), stride = (2, 2) 38 | shape = (2, 3, 4, 4) 39 | data = np.linspace(-0.3, 0.4, num=np.prod(shape)).reshape(shape) 40 | result = max_pool_forward(data, 2, 2, 2, 2) 41 | expected = np.array([[[[-0.26315789, -0.24842105], 42 | [-0.20421053, -0.18947368]], 43 | [[-0.14526316, -0.13052632], 44 | [-0.08631579, -0.07157895]], 45 | [[-0.02736842, -0.01263158], 46 | [0.03157895, 0.04631579]]], 47 | [[[0.09052632, 0.10526316], 48 | [0.14947368, 0.16421053]], 49 | [[0.20842105, 0.22315789], 50 | [0.26736842, 0.28210526]], 51 | [[0.32631579, 0.34105263], 52 | [0.38526316, 0.4]]]]) 53 | npt.assert_array_almost_equal(result, expected) 54 | 55 | 56 | def test_max_pooling_backward(): 57 | data = np.array([[[[0.12, -1.23, 0.01, 2.45], 58 | [5.00, -10.01, 1.09, 4.66], 59 | [4.56, 6.78, 3.45, 3.33], 60 | [0.01, 1.00, 3.56, 3.39]]]]) 61 | output_grad = np.array([[[[1.0, 1.0], 62 | [1.0, 1.0]]]]) 63 | # Test Case: 1 64 | # filter = (2, 2) stride = (2, 2) 65 | expected = np.array([[[[0.0, 0.0, 0.0, 0.0], 66 | [1.0, 0.0, 0.0, 1.0], 67 | [0.0, 1.0, 0.0, 0.0], 68 | [0.0, 0.0, 1.0, 0.0]]]]) 69 | result = max_pool_backward(output_grad, data, 70 | filter_height=2, filter_width=2, 71 | stride_height=2, stride_width=2) 72 | npt.assert_array_almost_equal(result, expected) 73 | 74 | # calculate numerical gradient 75 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 76 | npt.assert_array_almost_equal(numerical, expected, decimal=3) 77 | 78 | # Test Case: 2 79 | # filter = (2, 2) stride = (2, 2) 80 | # different output_grad 81 | output_grad = np.array([[[[0.0, 5.10], 82 | [0.12, 0.20]]]]) 83 | expected = np.array([[[[0.0, 0.0, 0.0, 0.0], 84 | [0.0, 0.0, 0.0, 5.10], 85 | [0.0, 0.12, 0.0, 0.0], 86 | [0.0, 0.0, 0.20, 0.0]]]]) 87 | result = max_pool_backward(output_grad, data, 88 | filter_height=2, filter_width=2, 89 | stride_height=2, stride_width=2) 90 | npt.assert_array_almost_equal(result, expected) 91 | 92 | # calculate numerical gradient 93 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 94 | npt.assert_array_almost_equal(numerical, expected, decimal=2) 95 | 96 | # Test Case: 3 97 | # filter = (2, 2) stride = (1, 1) 98 | output_grad = np.array([[[[1.0, 1.0, 
1.0], 99 | [1.0, 1.0, 1.0], 100 | [1.0, 1.0, 1.0]]]]) 101 | result = max_pool_backward(output_grad, data, 102 | filter_height=2, filter_width=2, 103 | stride_height=1, stride_width=1) 104 | numerical = gradient_check_numpy_expr(lambda x: max_pool_forward(x, 2, 2, 1, 1), data, output_grad) 105 | npt.assert_array_almost_equal(numerical, result, decimal=2) 106 | 107 | # Test Case: 4 108 | # filter = (2, 2) stride = (2, 2) 109 | # input shape = (2, 2, 6, 6) 110 | data = np.random.normal(scale=0.01, size=(2, 2, 6, 6)) 111 | output_grad = np.ones((2, 2, 3, 3)) 112 | result = max_pool_backward(output_grad, data, 113 | filter_height=2, filter_width=2, 114 | stride_height=2, stride_width=2) 115 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 116 | npt.assert_array_almost_equal(numerical, result, decimal=4) 117 | 118 | 119 | # Testing Image to Column operations 120 | def test_im2col(): 121 | data = np.arange(16).reshape((1, 1, 4, 4)).astype(np.float64) 122 | # one image in the batch 2 by 2 kernel with stride = 1 123 | result = im2col(data, filter_height=2, filter_width=2, 124 | padding_height=0, padding_width=0, 125 | stride_height=1, stride_width=1) 126 | 127 | expected = np.array([[0, 1, 2, 4, 5, 6, 8, 9, 10], 128 | [1, 2, 3, 5, 6, 7, 9, 10, 11], 129 | [4, 5, 6, 8, 9, 10, 12, 13, 14], 130 | [5, 6, 7, 9, 10, 11, 13, 14, 15]]).astype(np.float64) 131 | npt.assert_array_almost_equal(result, expected) 132 | 133 | # one image in the batch 2 by 2 kernel with stride = 2 134 | result = im2col(data, filter_height=2, filter_width=2, 135 | padding_height=0, padding_width=0, 136 | stride_height=2, stride_width=2) 137 | expected = np.array([[0, 2, 8, 10], 138 | [1, 3, 9, 11], 139 | [4, 6, 12, 14], 140 | [5, 7, 13, 15]]).astype(np.float64) 141 | npt.assert_array_almost_equal(result, expected) 142 | 143 | # one image in the batche 2 by 2 kernel with stride = 1 and padding = 1 144 | data = np.arange(9).reshape(1, 1, 3, 3).astype(np.float64) 145 | result = im2col(data, filter_height=2, filter_width=2, 146 | padding_height=1, padding_width=1, 147 | stride_height=1, stride_width=1) 148 | expected = np.array([[0, 0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8], 149 | [0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0], 150 | [0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 0, 0, 0], 151 | [0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 0, 0, 0, 0]]).astype(np.float64) 152 | npt.assert_array_almost_equal(result, expected) 153 | 154 | # more than one color channels 155 | # kernel 2 by 2 stride = 1 156 | data = np.arange(18).reshape(1, 2, 3, 3).astype(np.float64) 157 | result = im2col(data, filter_height=2, filter_width=2, 158 | padding_height=0, padding_width=0, 159 | stride_height=1, stride_width=1) 160 | expected = np.array([[0, 1, 3, 4], 161 | [1, 2, 4, 5], 162 | [3, 4, 6, 7], 163 | [4, 5, 7, 8], 164 | [9, 10, 12, 13], 165 | [10, 11, 13, 14], 166 | [12, 13, 15, 16], 167 | [13, 14, 16, 17]]) 168 | npt.assert_array_almost_equal(result, expected) 169 | 170 | # more than one batch and color chennel 171 | # kernel 2 by 2 with stride of 1 172 | data = np.arange(36).reshape(2, 2, 3, 3).astype(np.float64) 173 | result = im2col(data, filter_height=2, filter_width=2, 174 | padding_height=0, padding_width=0, 175 | stride_height=1, stride_width=1) 176 | expected = np.array([[0, 18, 1, 19, 3, 21, 4, 22], 177 | [1, 19, 2, 20, 4, 22, 5, 23], 178 | [3, 21, 4, 22, 6, 24, 7, 25], 179 | [4, 22, 5, 23, 7, 25, 8, 26], 180 | [9, 27, 10, 28, 12, 30, 13, 31], 181 | [10, 28, 11, 29, 13, 31, 14, 32], 182 | [12, 30, 
13, 31, 15, 33, 16, 34], 183 | [13, 31, 14, 32, 16, 34, 17, 35]]).astype(np.float64) 184 | print(np.array(result)) 185 | npt.assert_array_almost_equal(result, expected) 186 | 187 | # TODO: (upul) test several kernel sizes and different stride, kernel size and padding 188 | # : in different directions 189 | 190 | 191 | def test_col2im(): 192 | # batch size 1, color channels 1, 3 by 3 image. Stride 1, filter 2 by 2 and no padding 193 | data = np.arange(9).reshape((1, 1, 3, 3)).astype(np.float64) 194 | i2c_result = im2col(data, filter_height=2, filter_width=2, 195 | padding_height=0, padding_width=0, 196 | stride_height=1, stride_width=1) 197 | result = col2im(i2c_result, 1, 1, 3, 3, 198 | 2, 2, 199 | 0, 0, 200 | 1, 1) 201 | expected = np.array([[[[0., 2., 2.], 202 | [6., 16., 10.], 203 | [6., 14., 8.]]]]).astype(np.float64) 204 | npt.assert_array_almost_equal(result, expected) 205 | 206 | # batch size 1, color channels 1, 4 by 4 image. Stride 2, filter 2 by 2 and no padding 207 | data = np.arange(16).reshape((1, 1, 4, 4)).astype(np.float64) 208 | i2c_result = im2col(data, filter_height=2, filter_width=2, 209 | padding_height=0, padding_width=0, 210 | stride_height=2, stride_width=2) 211 | result = col2im(i2c_result, 212 | 1, 1, # batch size and color channels 213 | 4, 4, # img width and height 214 | 2, 2, # kernel 215 | 0, 0, # padding 216 | 2, 2) # stride 217 | -------------------------------------------------------------------------------- /tests/test_autodiff_cpu.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | 6 | 7 | def test_dummy(): 8 | assert 1 == 1 9 | 10 | 11 | def test_identity(): 12 | x2 = ad.Variable(name="x2") 13 | y = x2 14 | 15 | grad_x2, = ad.gradients(y, [x2]) 16 | 17 | executor = ad.Executor([y, grad_x2]) 18 | x2_val = 2 * np.ones(3) 19 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 20 | 21 | assert isinstance(y, ad.Node) 22 | assert np.array_equal(y_val, x2_val) 23 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 24 | 25 | 26 | def test_add_by_const(): 27 | x2 = ad.Variable(name="x2") 28 | y = 5 + x2 29 | 30 | grad_x2, = ad.gradients(y, [x2]) 31 | 32 | executor = ad.Executor([y, grad_x2]) 33 | x2_val = 2 * np.ones(3) 34 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 35 | 36 | assert isinstance(y, ad.Node) 37 | assert np.array_equal(y_val, x2_val + 5) 38 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 39 | 40 | 41 | def test_mul_by_const(): 42 | x2 = ad.Variable(name='x2') 43 | y = 3 * x2 44 | 45 | grad_x2, = ad.gradients(y, [x2]) 46 | executor = ad.Executor([y, grad_x2]) 47 | x2_val = 2 * np.ones(3) 48 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 49 | 50 | # asserts 51 | assert isinstance(y, ad.Node) 52 | assert np.array_equal(y_val, 3 * x2_val) 53 | assert np.array_equal(grad_x2_val, 3 * np.ones_like(x2_val)) 54 | 55 | 56 | def test_mul_two_var(): 57 | x2 = ad.Variable(name='x2') 58 | x3 = ad.Variable(name='x3') 59 | y = x2 * x3 60 | 61 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 62 | executor = ad.Executor([y, grad_x2, grad_x3]) 63 | x2_val = 2 * np.ones(3) 64 | x3_val = 3 * np.ones(3) 65 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 66 | 67 | # asserts 68 | assert isinstance(y, ad.Node) 69 | assert np.array_equal(y_val, 6 * np.ones(3)) 70 | assert np.array_equal(grad_x2_val, x3_val) 71 | assert 
np.array_equal(grad_x3_val, x2_val) 72 | 73 | 74 | def test_sub_two_vars(): 75 | x2 = ad.Variable(name='x2') 76 | x3 = ad.Variable(name='x3') 77 | y = x2 - x3 78 | 79 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 80 | executor = ad.Executor([y, grad_x2, grad_x3]) 81 | x2_val = 4 * np.ones(3) 82 | x3_val = 3 * np.ones(3) 83 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 84 | 85 | # asserts 86 | assert isinstance(y, ad.Node) 87 | assert np.array_equal(y_val, 1 * np.ones(3)) 88 | assert np.array_equal(grad_x2_val, np.ones(3)) 89 | assert np.array_equal(grad_x3_val, -1 * np.ones(3)) 90 | 91 | 92 | def test_sub_by_const(): 93 | x2 = ad.Variable(name='x2') 94 | y = x2 - 3 95 | 96 | grad_x2, = ad.gradients(y, [x2]) 97 | executor = ad.Executor([y, grad_x2]) 98 | x2_val = 2 * np.ones(3) 99 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 100 | 101 | # asserts 102 | assert isinstance(y, ad.Node) 103 | assert np.array_equal(y_val, -1 * np.ones(3)) 104 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 105 | 106 | 107 | def test_div_two_var(): 108 | x2 = ad.Variable(name='x2') 109 | x3 = ad.Variable(name='x3') 110 | y = x2 / x3 111 | 112 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 113 | executor = ad.Executor([y, grad_x2, grad_x3]) 114 | x2_val = 4 * np.ones(3) 115 | x3_val = 2 * np.ones(3) 116 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 117 | 118 | # asserts 119 | assert isinstance(y, ad.Node) 120 | assert np.array_equal(grad_x2_val, 1.0 / x3_val) 121 | assert np.array_equal(grad_x3_val, -1.0 * x2_val / (x3_val * x3_val)) 122 | 123 | 124 | def test_div_by_const(): 125 | x2 = ad.Variable(name='x2') 126 | y = x2 / 2.0 127 | 128 | grad_x2, = ad.gradients(y, [x2]) 129 | executor = ad.Executor([y, grad_x2]) 130 | x2_val = 2 * np.ones(3) 131 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 132 | 133 | # asserts 134 | assert isinstance(y, ad.Node) 135 | assert np.array_equal(y_val, x2_val / 2.0) 136 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val) / 2.0) 137 | 138 | 139 | def test_reduce_sum(): 140 | x2 = ad.Variable(name='x2') 141 | y = ad.reduce_sum(x2) 142 | 143 | grad_x2, = ad.gradients(y, [x2]) 144 | executor = ad.Executor([y, grad_x2]) 145 | x2_val = np.array([[1, 2, 3], [4, 5, 6]]) 146 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 147 | 148 | # asserts 149 | assert isinstance(y, ad.Node) 150 | assert np.array_equal(y_val, np.array([5, 7, 9])) 151 | assert np.array_equal(grad_x2_val, np.array([1, 1, 1])) 152 | 153 | 154 | def test_broadcast_to(): 155 | x2 = ad.Variable(name='x2') 156 | x3 = ad.Variable(name='x3') 157 | y = ad.broadcast_to(x2, x3) 158 | 159 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 160 | executor = ad.Executor([y, grad_x2, grad_x3]) 161 | x2_val = np.array([[1, 2, 3]]) 162 | x3_val = np.zeros((3, 3)) 163 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 164 | 165 | # asserts 166 | assert isinstance(y, ad.Node) 167 | assert np.array_equal(y_val, np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])) 168 | assert np.array_equal(grad_x2_val, np.array([3, 3, 3])) 169 | 170 | 171 | def test_matmul_two_vars(): 172 | x2 = ad.Variable(name='x2') 173 | x3 = ad.Variable(name='x3') 174 | y = ad.matmul(x2, x3) 175 | 176 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 177 | executor = ad.Executor([y, grad_x2, grad_x3]) 178 | x2_val = np.array([[1, 2], [3, 4], [5, 6]]) # 3x2 179 | x3_val = np.array([[7, 8, 9], [10, 11, 12]]) 
# 2x3
180 | 
181 |     y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val})
182 | 
183 |     expected_yval = np.matmul(x2_val, x3_val)
184 |     expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(x3_val))
185 |     expected_grad_x3_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval))
186 | 
187 |     assert isinstance(y, ad.Node)
188 |     assert np.array_equal(y_val, expected_yval)
189 |     assert np.array_equal(grad_x2_val, expected_grad_x2_val)
190 |     assert np.array_equal(grad_x3_val, expected_grad_x3_val)
191 | 
192 | 
193 | def test_relu():
194 |     x2 = ad.Variable(name='x2')
195 |     y = au.nn.relu(x2)
196 | 
197 |     grad_x2, = ad.gradients(y, [x2])
198 |     executor = ad.Executor([y, grad_x2])
199 |     x2_val = np.array([[-1, 2, 3], [1, -2, 0]])
200 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
201 |     expected_y_val = np.array([[0, 2, 3], [1, 0, 0]])
202 |     expected_x2_grad = np.array([[0, 1, 1], [1, 0, 0]])
203 |     assert np.array_equal(y_val, expected_y_val)
204 |     assert np.array_equal(grad_x2_val, expected_x2_grad)
205 | 
206 | 
207 | def test_cross_entropy():
208 |     x2_pred = ad.Variable(name='x2_pred')
209 |     x2_actu = ad.Variable(name='x2_actu')
210 |     y = au.nn.softmax_cross_entropy_with_logits(x2_pred, x2_actu)
211 | 
212 |     x2_pred_grad, x2_actu_grad = ad.gradients(y, [x2_pred, x2_actu])
213 | 
214 |     x2_pred_val = np.array([[0.8, 0.01, 0.5], [0.8, 0.01, 0.5]])
215 |     x2_actu_val = np.array([[1.0, 1.0, 0], [1.0, 1.0, 0]])
216 | 
217 |     executor = ad.Executor([y, x2_pred_grad, x2_actu_grad])
218 |     y_val, x2_pred_grad_val, x2_actu_grad_val = executor.run(feed_shapes={x2_pred: x2_pred_val, x2_actu: x2_actu_val})
219 |     # Smoke test: only verifies that the forward pass and both gradients run without raising.
220 |     assert True
221 | 
222 | 
223 | def test_matmul_var_and_param():
224 |     x2 = ad.Variable(name="x2")
225 |     w2_val = np.array([[7, 8, 9], [10, 11, 12]])  # 2x3
226 |     w2 = ad.Parameter(name="w2", init=w2_val)
227 |     y = ad.matmul(x2, w2)
228 | 
229 |     grad_x2, grad_w2 = ad.gradients(y, [x2, w2])
230 | 
231 |     executor = ad.Executor([y, grad_x2, grad_w2])
232 |     x2_val = np.array([[1, 2], [3, 4], [5, 6]])  # 3x2
233 | 
234 |     y_val, grad_x2_val, grad_w2_val = executor.run(feed_shapes={x2: x2_val})
235 | 
236 |     expected_yval = np.matmul(x2_val, w2_val)
237 |     expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(w2_val))
238 |     expected_grad_w2_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval))
239 | 
240 |     assert isinstance(y, ad.Node)
241 |     # assert np.array_equal(y_val, expected_yval)
242 |     # assert np.array_equal(grad_x2_val, expected_grad_x2_val)
243 |     # assert np.array_equal(grad_w2_val, expected_grad_w2_val)
244 | 
245 | 
246 | def test_sigmoid_activation():
247 |     x2 = ad.Variable(name='x2')
248 |     y = au.nn.sigmoid(x2)
249 | 
250 |     x2_val = np.array([-100, 0, 100])
251 |     grad_x2, = ad.gradients(y, [x2])
252 |     executor = ad.Executor([y, grad_x2])
253 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
254 |     npt.assert_array_almost_equal(np.array([0.000, 0.500, 1.0]), y_val)
255 |     npt.assert_array_almost_equal(np.array([0, 0.25, 0]), grad_x2_val)
256 | 
257 |     # testing with extreme values for numerical stability.
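    # A naive sigmoid, 1 / (1 + np.exp(-x)), overflows in np.exp for large negative
    # inputs. A numerically stable formulation (presumably what au.nn.sigmoid does
    # internally) branches on the sign of x, e.g.
    #     np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
    # so that only non-positive values are ever exponentiated.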
258 |     x2_val = np.array([-9.9e10, 9.9e10]).astype(np.float32)
259 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
260 |     npt.assert_array_almost_equal(np.array([0.0, 1.0]), y_val)
261 |     npt.assert_array_almost_equal(np.array([0.0, 0.0]), grad_x2_val)
262 | 
263 | 
264 | def test_max_pooling():
265 |     x2 = ad.Variable(name='x2')
266 |     y = au.nn.maxPool(x2, filter=(2, 2), strides=(2, 2))
267 | 
268 |     grad_x2, = ad.gradients(y, [x2])
269 |     executor = ad.Executor([y, grad_x2])
270 |     x2_val = np.random.randn(1, 1, 4, 4)
271 | 
272 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
273 | 
274 |     numerical_grad_x2 = ad.eval_numerical_grad(y,
275 |                                                feed_dict={x2: x2_val},
276 |                                                wrt=x2,
277 |                                                h=1e-5)
278 |     assert isinstance(y, ad.Node)
279 |     # TODO: (upul) there appears to be a bug in my eval_numerical_grad implementation,
280 |     #       so the comparison below is only made to two decimal places.
281 |     npt.assert_array_almost_equal(grad_x2_val, numerical_grad_x2, decimal=2)
282 | 
283 | 
284 | def test_conv2d():
285 |     x2 = ad.Variable(name='x2')
286 |     w2 = ad.Variable(name='w2')
287 |     b2 = ad.Variable(name='b2')
288 | 
289 |     y = au.nn.conv2d(input=x2, filter=w2, bias=b2)
290 | 
291 |     grad_x2, grad_w2, grad_b2 = ad.gradients(y, [x2, w2, b2])
292 |     executor = ad.Executor([y, grad_x2, grad_w2, grad_b2])
293 |     x2_val = np.random.randn(1, 2, 4, 4)
294 |     w2_val = np.random.randn(2, 2, 3, 3)
295 |     b2_val = np.random.randn(2, )
296 | 
297 |     y_val, grad_x2_val, grad_w2_val, grad_b2_val = executor.run(feed_shapes={x2: x2_val,
298 |                                                                             w2: w2_val,
299 |                                                                             b2: b2_val})
300 | 
301 |     numerical_grad_w2 = ad.eval_numerical_grad(y,
302 |                                                feed_dict={x2: x2_val,
303 |                                                           w2: w2_val,
304 |                                                           b2: b2_val},
305 |                                                wrt=w2)
306 |     numerical_grad_x2 = ad.eval_numerical_grad(y,
307 |                                                feed_dict={x2: x2_val,
308 |                                                           w2: w2_val,
309 |                                                           b2: b2_val},
310 |                                                wrt=x2)
311 |     numerical_grad_b2 = ad.eval_numerical_grad(y,
312 |                                                feed_dict={x2: x2_val,
313 |                                                           w2: w2_val,
314 |                                                           b2: b2_val},
315 |                                                wrt=b2)
316 | 
317 |     assert isinstance(y, ad.Node)
318 |     npt.assert_array_almost_equal(numerical_grad_w2, grad_w2_val)
319 |     npt.assert_array_almost_equal(numerical_grad_x2, grad_x2_val)
320 |     npt.assert_array_almost_equal(numerical_grad_b2, grad_b2_val)
321 | 
322 |     x2 = ad.Variable(name='x2')
323 |     w2 = ad.Parameter(name='w2', init=w2_val)
324 |     b2 = ad.Parameter(name='b2', init=b2_val)
325 |     y = au.nn.conv2d(x2, w2, b2)
326 | 
327 |     grad_x2, grad_w2, grad_b2 = ad.gradients(y, [x2, w2, b2])
328 |     executor = ad.Executor([y, grad_x2, grad_w2, grad_b2])
329 |     y_val, grad_x2_val, grad_w2_val, grad_b2_val = executor.run(feed_shapes={x2: x2_val})
330 | 
331 |     assert isinstance(y, ad.Node)
332 |     npt.assert_array_almost_equal(numerical_grad_w2, grad_w2_val)
333 |     npt.assert_array_almost_equal(numerical_grad_b2, grad_b2_val)
334 |     npt.assert_array_almost_equal(numerical_grad_x2, grad_x2_val)
335 | 
336 | 
337 | def test_reshape():
338 |     x2 = ad.Variable(name='x2')
339 |     y = ad.reshape(x2, newshape=(1, 4))
340 | 
341 |     grad_x2, = ad.gradients(y, [x2])
342 |     executor = ad.Executor([y, grad_x2])
343 |     x2_val = np.random.randn(2, 2)
344 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
345 | 
346 |     assert isinstance(y, ad.Node)
347 |     assert y_val.shape == (1, 4)
348 |     npt.assert_array_equal(grad_x2_val, np.ones((2, 2)))
349 | 
350 |     # x2 = ad.Variable(name='x2')
351 |     # y = ad.reshape(x2, newshape=(2, 1, 2, 3))
352 |     # grad_x2, = ad.gradients(y, [x2])
353 |     # executor = ad.Executor([y, grad_x2])
354 |     # x2_val = np.random.randn(2, 6)
355 |     # y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
356 | # 357 | # assert isinstance(y, ad.Node) 358 | # assert y_val.shape == (2, 1, 2, 3) 359 | # npt.assert_array_equal(grad_x2_val, np.ones((2, 1, 2, 3))) 360 | -------------------------------------------------------------------------------- /tests/test_autodiff_gpu.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | from aurora.ndarray import ndarray, gpu_op 6 | 7 | 8 | def test_identity(): 9 | x2 = ad.Variable(name='x2') 10 | y = x2 11 | 12 | grad_x2, = ad.gradients(y, [x2]) 13 | 14 | executor = ad.Executor([y, grad_x2], use_gpu=True) 15 | x2_val = 2 * np.ones(3) 16 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 17 | 18 | y_val_np = y_val.asnumpy() 19 | grad_x2_val_np = grad_x2_val.asnumpy() 20 | 21 | assert isinstance(y, ad.Node) 22 | assert np.array_equal(y_val_np, x2_val) 23 | assert np.array_equal(grad_x2_val_np, np.ones_like(x2_val)) 24 | 25 | 26 | def test_add_by_const(): 27 | x2 = ad.Variable(name="x2") 28 | y = 5 + x2 29 | 30 | grad_x2, = ad.gradients(y, [x2]) 31 | 32 | executor = ad.Executor([y, grad_x2], use_gpu=True) 33 | x2_val = 2 * np.ones(3) 34 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 35 | 36 | y_val = y_val.asnumpy() 37 | grad_x2_val = grad_x2_val.asnumpy() 38 | 39 | assert isinstance(y, ad.Node) 40 | assert np.array_equal(y_val, x2_val + 5) 41 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 42 | 43 | 44 | def test_softmax(): 45 | shape = (2, 2) 46 | x_val = np.random.uniform(-5, 5, shape).astype(np.float32) 47 | 48 | x2 = ad.Variable(name='x2') 49 | prob = au.nn.softmax(x2) 50 | executor = ad.Executor([prob], use_gpu=True) 51 | y, = executor.run(feed_shapes={x2: x_val}) 52 | y = y.asnumpy() 53 | np.testing.assert_allclose(au.nn.softmax_func(x_val), y, rtol=1e-5) 54 | -------------------------------------------------------------------------------- /tests/test_gpu_operations.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | from aurora.ndarray import ndarray, gpu_op 6 | 7 | 8 | def test_dummy(): 9 | assert 1 == 1 10 | 11 | 12 | def test_array_set(): 13 | ctx = ndarray.gpu(0) 14 | shape = (5000, 2000) 15 | # oneslike 16 | arr_x = ndarray.empty(shape, ctx=ctx) 17 | gpu_op.array_set(arr_x, 1.) 18 | x = arr_x.asnumpy() 19 | np.testing.assert_allclose(np.ones(shape), x) 20 | # zeroslike 21 | gpu_op.array_set(arr_x, 0.) 
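    # array_set fills the already-allocated device buffer in place; copying it
    # back to the host with asnumpy() lets us compare against a NumPy reference.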
22 | x = arr_x.asnumpy() 23 | np.testing.assert_allclose(np.zeros(shape), x) 24 | 25 | 26 | def test_broadcast_to(): 27 | ctx = ndarray.gpu(0) 28 | shape = (200, 300) 29 | to_shape = (130, 200, 300) 30 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 31 | arr_x = ndarray.array(x, ctx=ctx) 32 | arr_y = ndarray.empty(to_shape, ctx=ctx) 33 | gpu_op.broadcast_to(arr_x, arr_y) 34 | y = arr_y.asnumpy() 35 | np.testing.assert_allclose(np.broadcast_to(x, to_shape), y) 36 | 37 | 38 | def test_reduce_sum_axis_zero(): 39 | ctx = ndarray.gpu(0) 40 | shape = (500, 200, 100) 41 | to_shape = (200, 100) 42 | x = np.random.uniform(0, 20, shape).astype(np.float32) 43 | arr_x = ndarray.array(x, ctx=ctx) 44 | arr_y = ndarray.empty(to_shape, ctx=ctx) 45 | gpu_op.reduce_sum_axis_zero(arr_x, arr_y) 46 | y = arr_y.asnumpy() 47 | y_ = np.sum(x, axis=0) 48 | for index, _ in np.ndenumerate(y): 49 | v = y[index] 50 | v_ = y_[index] 51 | if abs((v - v_) / v_) > 1e-4: 52 | print(index, v, v_) 53 | np.testing.assert_allclose(np.sum(x, axis=0), y, rtol=1e-5) 54 | 55 | 56 | def test_matrix_elementwise_add(): 57 | ctx = ndarray.gpu(0) 58 | shape = (5000, 2000) 59 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 60 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 61 | arr_x = ndarray.array(x, ctx=ctx) 62 | arr_y = ndarray.array(y, ctx=ctx) 63 | arr_z = ndarray.empty(shape, ctx=ctx) 64 | gpu_op.matrix_elementwise_add(arr_x, arr_y, arr_z) 65 | z = arr_z.asnumpy() 66 | np.testing.assert_allclose(x + y, z, rtol=1e-5) 67 | 68 | 69 | def test_matrix_elementwise_add_by_const(): 70 | shape = (2000, 3000) 71 | ctx = ndarray.gpu(0) 72 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 73 | val = np.random.uniform(-5, 5) 74 | arr_x = ndarray.array(x, ctx=ctx) 75 | arr_y = ndarray.empty(shape, ctx=ctx) 76 | gpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_y) 77 | y = arr_y.asnumpy() 78 | np.testing.assert_allclose(x + val, y, rtol=1e-5) 79 | 80 | 81 | def test_matrix_elementwise_multiply(): 82 | ctx = ndarray.gpu(0) 83 | shape = (500, 200) 84 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 85 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 86 | arr_x = ndarray.array(x, ctx=ctx) 87 | arr_y = ndarray.array(y, ctx=ctx) 88 | arr_z = ndarray.empty(shape, ctx=ctx) 89 | gpu_op.matrix_elementwise_multiply(arr_x, arr_y, arr_z) 90 | z = arr_z.asnumpy() 91 | np.testing.assert_allclose(x * y, z, rtol=1e-5) 92 | 93 | 94 | def test_matrix_elementwise_sqrt(): 95 | ctx = ndarray.gpu(0) 96 | shape = (500, 200) 97 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 98 | arr_x = ndarray.array(x, ctx=ctx) 99 | gpu_op.matrix_elementwise_sqrt(arr_x, arr_x) 100 | z = arr_x.asnumpy() 101 | np.testing.assert_allclose(np.sqrt(x), z, rtol=1e-5) 102 | 103 | 104 | def test_matrix_elementwise_multiply_by_const(): 105 | shape = (2000, 3000) 106 | ctx = ndarray.gpu(0) 107 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 108 | val = np.random.uniform(-5, 5) 109 | arr_x = ndarray.array(x, ctx=ctx) 110 | arr_y = ndarray.empty(shape, ctx=ctx) 111 | gpu_op.matrix_elementwise_multiply_by_const(arr_x, val, arr_y) 112 | y = arr_y.asnumpy() 113 | np.testing.assert_allclose(x * val, y, rtol=1e-5) 114 | 115 | 116 | def test_matrix_multiply(): 117 | ctx = ndarray.gpu(0) 118 | x = np.random.uniform(0, 10, size=(500, 700)).astype(np.float32) 119 | y = np.random.uniform(0, 10, size=(700, 1000)).astype(np.float32) 120 | arr_x = ndarray.array(x, ctx=ctx) 121 | arr_y = 
ndarray.array(y, ctx=ctx)
122 |     arr_z = ndarray.empty((500, 1000), ctx=ctx)
123 |     gpu_op.matrix_multiply(arr_x, False, arr_y, False, arr_z)
124 |     z = arr_z.asnumpy()
125 |     np.testing.assert_allclose(np.dot(x, y), z, rtol=1e-5)
126 | 
127 |     x = np.random.uniform(0, 10, size=(1000, 500)).astype(np.float32)
128 |     y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
129 |     arr_x = ndarray.array(x, ctx=ctx)
130 |     arr_y = ndarray.array(y, ctx=ctx)
131 |     arr_z = ndarray.empty((1000, 2000), ctx=ctx)
132 |     gpu_op.matrix_multiply(arr_x, False, arr_y, True, arr_z)
133 |     z = arr_z.asnumpy()
134 |     np.testing.assert_allclose(np.dot(x, np.transpose(y)), z, rtol=1e-5)
135 | 
136 |     x = np.random.uniform(0, 10, size=(500, 1000)).astype(np.float32)
137 |     y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
138 |     arr_x = ndarray.array(x, ctx=ctx)
139 |     arr_y = ndarray.array(y, ctx=ctx)
140 |     arr_z = ndarray.empty((1000, 2000), ctx=ctx)
141 |     gpu_op.matrix_multiply(arr_x, True, arr_y, True, arr_z)
142 |     z = arr_z.asnumpy()
143 |     np.testing.assert_allclose(np.dot(np.transpose(x), np.transpose(y)), z,
144 |                                rtol=1e-5)
145 | 
146 | 
147 | def test_relu():
148 |     shape = (2000, 2500)
149 |     ctx = ndarray.gpu(0)
150 |     x = np.random.uniform(-1, 1, shape).astype(np.float32)
151 |     arr_x = ndarray.array(x, ctx=ctx)
152 |     arr_y = ndarray.empty(shape, ctx=ctx)
153 |     gpu_op.relu(arr_x, arr_y)
154 |     y = arr_y.asnumpy()
155 |     np.testing.assert_allclose(np.maximum(x, 0).astype(np.float32), y)
156 | 
157 | 
158 | def test_relu_gradient():
159 |     shape = (2000, 2500)
160 |     ctx = ndarray.gpu(0)
161 |     x = np.random.uniform(-1, 1, shape).astype(np.float32)
162 |     grad_x = np.random.uniform(-5, 5, shape).astype(np.float32)
163 |     arr_x = ndarray.array(x, ctx=ctx)
164 |     arr_grad_x = ndarray.array(grad_x, ctx=ctx)
165 |     arr_y = ndarray.empty(shape, ctx=ctx)
166 |     gpu_op.relu_gradient(arr_x, arr_grad_x, arr_y)
167 |     y = arr_y.asnumpy()
168 |     np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y)
169 | 
170 | 
171 | def test_softmax():
172 |     ctx = ndarray.gpu(0)
173 |     shape = (400, 1000)
174 |     x = np.random.uniform(-5, 5, shape).astype(np.float32)
175 |     arr_x = ndarray.array(x, ctx=ctx)
176 |     arr_y = ndarray.empty(shape, ctx=ctx)
177 |     gpu_op.softmax(arr_x, arr_y)
178 |     y = arr_y.asnumpy()
179 |     np.testing.assert_allclose(au.nn.softmax_func(x), y, rtol=1e-5)
180 | 
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/gradient_check.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def gradient_check_numpy_expr(func, x, output_gradient, h=1e-5):
5 |     """
6 |     This utility function calculates the numerical gradient of the function
7 |     `func` at `x` using the central-difference approximation.
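    For each element x[idx], the partial derivative is approximated with the
    central difference (func(x + h) - func(x - h)) / (2 * h), perturbing only
    x[idx]. The result is contracted with `output_gradient`, so the returned
    array is the vector-Jacobian product that a backward pass would emit.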
8 |     :param func: the function whose gradient is being checked; maps an ndarray to an ndarray (or memoryview).
9 |     :param x: the point (ndarray) at which the gradient is evaluated; perturbed in place and restored.
10 |     :param output_gradient: gradient of the scalar loss w.r.t. the output of `func`.
11 |     :param h: step size used by the central-difference approximation.
12 |     :return: an ndarray of the same shape as `x` holding the numerical gradient.
13 |     """
14 |     grad = np.zeros_like(x).astype(np.float32)
15 |     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # `it` rather than `iter`, to avoid shadowing the built-in
16 |     while not it.finished:
17 |         idx = it.multi_index
18 |         old_value = x[idx]
19 | 
20 |         # calculate positive value
21 |         x[idx] = old_value + h
22 |         pos = func(x).copy()
23 | 
24 |         # calculate negative value
25 |         x[idx] = old_value - h
26 |         neg = func(x).copy()
27 | 
28 |         # restore
29 |         x[idx] = old_value
30 | 
31 |         # calculate gradient
32 |         # The type of pos and neg will be memoryview if we are testing Cython functions.
33 |         # Therefore, we create numpy arrays by performing the - operation.
34 |         # TODO: is there an alternative that avoids creating numpy arrays from memoryviews?
35 |         grad[idx] = np.sum((np.array(pos) - np.array(neg)) * output_gradient) / (2 * h)
36 |         it.iternext()
37 | 
38 |     return grad
39 | 
--------------------------------------------------------------------------------
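For reference, a minimal sketch of how this checker is used (it mirrors what the
pooling tests above do; the `tests.utils` import path assumes the repository root is
on `sys.path`):

    import numpy as np
    from tests.utils.gradient_check import gradient_check_numpy_expr

    def relu_forward(x):
        # element-wise ReLU; its exact gradient is 1 for positive inputs, 0 otherwise
        return np.maximum(x, 0.0)

    x = np.random.randn(3, 4)
    x[np.abs(x) < 1e-2] += 0.1          # keep entries away from the kink at 0
    output_grad = np.random.randn(3, 4)

    # hand-derived gradient: the upstream gradient flows through positive entries only
    analytical = (x > 0) * output_grad

    # central-difference gradient, contracted with output_grad by the checker
    numerical = gradient_check_numpy_expr(relu_forward, x, output_grad)

    np.testing.assert_array_almost_equal(numerical, analytical, decimal=4)

Because the checker contracts with `output_gradient`, the same call works unchanged
for functions whose output shape differs from their input shape, such as
`max_pool_forward` in the pooling tests above.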