├── .gitignore ├── LICENSE.md ├── README.md ├── aurora ├── __init__.py ├── autodiff │ ├── __init__.py │ ├── autodiff.py │ ├── executor.py │ ├── gradients.py │ ├── math.py │ ├── numerical_gradient.py │ └── utils.py ├── datasets │ ├── __init__.py │ ├── data │ │ └── mnist.pkl.gz │ ├── mnist.py │ └── synthetic.py ├── ndarray │ ├── __init__.py │ ├── _base.py │ ├── gpu_op.py │ └── ndarray.py ├── nn │ ├── __init__.py │ ├── activations.py │ ├── conv.py │ ├── loss_functions.py │ ├── pooling.py │ ├── pyx │ │ ├── __init__.py │ │ ├── fast_pooling.pyx │ │ └── im2col.pyx │ └── utils.py └── optim │ ├── __init__.py │ ├── adam.py │ ├── base.py │ └── sgd.py ├── cuda ├── Makefile └── src │ ├── CMakeLists.txt │ ├── c_runtime_api.cc │ ├── c_runtime_api.h │ ├── cpu_device_api.cc │ ├── cpu_device_api.h │ ├── cuda_device_api.cc │ ├── cuda_device_api.h │ ├── cudnn_operations.cu │ ├── device_api.h │ ├── dlarray.h │ ├── gpu_op.cu │ └── runtime_base.h ├── examples ├── __init__.py ├── mnist.py ├── mnist_cnn.py └── notebooks │ └── mnist_cnn.ipynb ├── requirements.txt ├── resources └── logo.png ├── setup.py └── tests ├── __init__.py ├── nn_primitives ├── __init__.py └── test_cython.py ├── test_autodiff_cpu.py ├── test_autodiff_gpu.py ├── test_gpu_operations.py └── utils ├── __init__.py └── gradient_check.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.so 3 | *.pyd 4 | *~ 5 | 6 | build/ 7 | dist/ 8 | .idea 9 | indi.egg-info/ 10 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright {yyyy} {name of copyright owner}
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Aurora: Minimal Deep Learning Library
2 | 
3 | Aurora is a minimal deep learning library written in Python, Cython, and C++ with the help of NumPy, CUDA, and cuDNN. Though it is simple, Aurora comes with some of the advanced design concepts found in a typical deep learning library.
4 | 
5 | * Automatic differentiation using static computational graphs.
6 | * Shape and type inference.
7 | * Static memory allocation for efficient training and inference.
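A minimal sketch of these ideas in action, using the `aurora.autodiff` API that appears later in this repository (`Variable`, `gradients`, and `Executor` are all exported by `aurora/autodiff/__init__.py`; the CPU/NumPy execution path is assumed):

```python
import numpy as np
from aurora.autodiff import Variable, Executor, gradients

# build a static graph: y = x^2 + 3x + 1
x = Variable(name='x')
y = x * x + 3.0 * x + 1.0

# reverse-mode autodiff adds gradient nodes to the same static graph
grad_x, = gradients(y, [x])

# the executor infers shapes and then evaluates the requested nodes
executor = Executor([y, grad_x])
y_val, grad_val = executor.run(feed_shapes={x: np.array([2.0, 5.0])})
# y_val -> [11., 41.]; grad_val = 2x + 3 -> [7., 13.]
```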
8 | 
9 | 
10 | ### Installation
11 | 
12 | Aurora relies on several external libraries, including `CUDA`, `cuDNN`, and `NumPy`. For `CUDA` and `cuDNN` installation instructions, please refer to the official documentation. Python dependencies can be installed from the `requirements.txt` file.
13 | 
14 | ##### Environment setup
15 | 
16 | To utilize the GPU capabilities of the Aurora library, you need an NVIDIA GPU. If the `CUDA` toolkit is not already installed, first install the latest version of the `CUDA` toolkit as well as the `cuDNN` library. Next, set the following environment variables.
17 | 
18 | ```bash
19 | export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
20 | export PATH=/usr/local/cuda/bin:$PATH
21 | ```
22 | 
23 | ##### Cloning the Repository
24 | 
25 | You can clone the Aurora repository using the following command.
26 | 
27 | `git clone https://github.com/upul/Aurora.git`
28 | 
29 | 
30 | ##### Building the GPU Backend
31 | 
32 | Next, you need to build the GPU backend. Go to the `cuda` directory and run the `make` command as shown below.
33 | 
34 | 1. Go to the `cuda` directory (`cd cuda`)
35 | 2. Run `make`
36 | 
37 | ##### Installing the Library
38 | 
39 | Go to the `Aurora` directory and run:
40 | 
41 | 1. `pip install -r requirements.txt`
42 | 2. `pip install .`
43 | 
44 | 
45 | ### Examples
46 | 
47 | The following are some notable examples. For the complete list of examples, please refer to the [`examples`](https://github.com/upul/Aurora/tree/master/examples) directory. Also, for Jupyter notebooks, please refer to the [`examples/notebooks`](https://github.com/upul/Aurora/tree/master/examples/notebooks) folder.
48 | 
49 | 1. [mnist](https://github.com/upul/Aurora/blob/master/examples/mnist.py)
50 | 2. [mnist_cnn](https://github.com/upul/Aurora/blob/master/examples/mnist_cnn.py)
51 | 
52 | 
53 | ### Future Work
54 | 
55 | The following features will be added in upcoming releases.
56 | 
57 | * Dropout and Batch Normalization.
58 | * High-level API similar to Keras.
59 | * Ability to load pre-trained models.
60 | * Model checkpointing.
61 | 
62 | 
63 | ### Acknowledgement
64 | 
65 | It all started with the [CSE 599G1: Deep Learning System Design](http://dlsys.cs.washington.edu/) course. This course really helped me to understand the fundamentals of deep learning system design. My answers to the two programming assignments of [CSE 599G1](http://dlsys.cs.washington.edu/) were the foundation of the Aurora library. So I would like to acknowledge with much appreciation the instructors and teaching assistants of the [CSE 599G1](http://dlsys.cs.washington.edu/) course.
66 | 
67 | 
68 | ### References
69 | 
70 | 1. [CSE 599G1: Deep Learning System Design](http://dlsys.cs.washington.edu/)
71 | 2. [MXNet Architecture](https://mxnet.incubator.apache.org/architecture/index.html)
72 | 3. [Parallel Programming With CUDA | Udacity](https://www.udacity.com/course/intro-to-parallel-programming--cs344)
73 | 4. [Programming Massively Parallel Processors, Third Edition: A Hands-on Approach 3rd Edition](https://www.amazon.com/Programming-Massively-Parallel-Processors-Hands/dp/0128119861/ref=pd_sim_14_3?_encoding=UTF8&psc=1&refRID=1Z3KFKEPTFQJE7MZQ40G)
74 | 
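Once the backend is built and the library installed, you can check whether the GPU runtime was picked up. This relies on the optional import in `aurora/__init__.py` (shown next), which appends `'ndarray'` to `__all__` only when the compiled `libc_runtime_api.so` loads cleanly:

```python
import aurora

# True when the CUDA backend was built and loaded; False on CPU-only installs
print('GPU backend available:', 'ndarray' in aurora.__all__)
```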
--------------------------------------------------------------------------------
/aurora/__init__.py:
--------------------------------------------------------------------------------
1 | import aurora.nn
2 | import aurora.optim
3 | import aurora.datasets
4 | 
5 | __all__ = ['nn', 'optim', 'datasets']
6 | 
7 | try:
8 |     from aurora.ndarray import gpu_op
9 | 
10 |     __all__.append("ndarray")
11 | except ImportError:
12 |     pass
13 | 
--------------------------------------------------------------------------------
/aurora/autodiff/__init__.py:
--------------------------------------------------------------------------------
1 | from .autodiff import Node
2 | from .autodiff import Parameter
3 | from .autodiff import Variable
4 | from .autodiff import broadcast_to
5 | from .autodiff import matmul
6 | from .autodiff import reduce_sum
7 | from .autodiff import reshape
8 | from .executor import Executor
9 | from .gradients import gradients
10 | from .math import tanh
11 | from .numerical_gradient import eval_numerical_grad
12 | 
13 | __all__ = ["Variable", "Parameter", "gradients", "Node", "Executor",
14 |            "reduce_sum", "broadcast_to", "matmul",
15 |            "tanh", 'eval_numerical_grad', 'reshape']
16 | 
--------------------------------------------------------------------------------
/aurora/autodiff/autodiff.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | try:
3 |     from aurora.ndarray import gpu_op, ndarray
4 | except ImportError:
5 |     pass
6 | 
7 | 
8 | class Node(object):
9 |     """ Node object represents a node in the computational graph"""
10 | 
11 |     def __init__(self):
12 |         """ New node will be created by an Op object's __call__ method"""
13 |         # list of inputs to this node
14 |         self.inputs = []
15 |         # operator
16 |         self.op = None
17 |         # constants
18 |         self.const = None
19 |         # name of the node, mainly used for debugging
20 |         self.name = ""
21 | 
22 |     def __add__(self, other):
23 |         """ Adds two nodes and returns a new node"""
24 |         if isinstance(other, Node):
25 |             return add(self, other)
26 |         else:
27 |             return add_const(self, other)
28 | 
29 |     def __sub__(self, other):
30 |         if isinstance(other, Node):
31 |             return sub(self, other)
32 |         else:
33 |             return sub_const(self, other)
34 | 
35 |     def __rsub__(self, other):
36 |         return ref_sub_const(self, other)
37 | 
38 |     def __mul__(self, other):
39 |         if isinstance(other, Node):
40 |             return mul(self, other)
41 |         else:
42 |             return mul_const(self, other)
43 | 
44 |     def __truediv__(self, other):
45 |         if isinstance(other, Node):
46 |             return div(self, other)
47 |         else:
48 |             return div_const(self, other)
49 | 
50 |     # Allow left-hand-side add and multiply.
51 |     __radd__ = __add__
52 |     __rmul__ = __mul__
53 |     __rdiv__ = __truediv__
54 | 
55 | 
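A quick illustration of the dispatch above: two `Node` operands route to the binary ops, while a `Node`/constant mix routes to the `*_const` singletons defined at the bottom of this module (node names follow the `format` strings in each op):

```python
from aurora.autodiff import Variable

x = Variable(name='x')
y = Variable(name='y')

z = x + y    # Node + Node  -> AddOp,        z.name == '(x+y)'
w = x + 2.0  # Node + const -> AddByConstOp, w.name == '(x+2.000000)'
v = 3.0 * x  # __rmul__ = __mul__, so scalars may appear on the left
```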
56 | class Op(object):
57 |     """ Op class represents operations performed on nodes"""
58 | 
59 |     def __call__(self):
60 |         """
61 |         Create a new node which represents an operation performed on the graph
62 | 
63 |         Parameters
64 |         ----------
65 |         None
66 | 
67 |         Returns
68 |         -------
69 |         Node
70 |             The new node object
71 |         """
72 |         new_node = Node()
73 |         new_node.op = self
74 |         return new_node
75 | 
76 |     def compute(self, node, input_vals, output_val, use_numpy=True):
77 |         """
78 |         Given the values of input nodes, compute the output value
79 | 
80 |         Parameters
81 |         ----------
82 |         :param node: Node that performs the computation
83 |         :param input_vals: Values of the input nodes
84 |         :param output_val: Output buffer; the result is written into it in-place
85 |         :param use_numpy: If True, compute with NumPy; otherwise use the GPU backend
86 | 
87 |         Returns
88 |         -------
89 |         :return: The output value of the node
90 |         """
91 |         raise NotImplementedError
92 | 
93 |     def gradient(self, node, output_grads):
94 |         """
95 |         Given the value of output gradients this operation calculates the
96 |         gradient contribution of each input node
97 | 
98 |         Parameters
99 |         ----------
100 |         :param node:
101 |         :param output_grads:
102 | 
103 |         Returns
104 |         -------
105 |         :return: A list of gradient contributions to each input node respectively
106 |         """
107 |         raise NotImplementedError
108 | 
109 |     def infer_shape(self, node, input_shapes):
110 |         raise NotImplementedError
111 | 
112 | 
113 | class AddOp(Op):
114 |     """
115 |     Op to element-wise add two nodes
116 |     """
117 | 
118 |     def __call__(self, nodeA, nodeB):
119 |         """
120 |         This operator adds two nodes element-wise
121 | 
122 |         Parameters
123 |         ----------
124 |         :param nodeA: LHS operand
125 |         :param nodeB: RHS operand
126 | 
127 |         Returns
128 |         -------
129 |         :return: A new Node which represents the element-wise plus operation
130 |         """
131 |         new_node = Op.__call__(self)
132 |         new_node.inputs = [nodeA, nodeB]
133 |         new_node.name = '({}+{})'.format(nodeA.name, nodeB.name)
134 |         return new_node
135 | 
136 |     def compute(self, node, input_vals, output_val, use_numpy=True):
137 |         """
138 |         Given values of two input nodes, return result of element-wise addition.
139 |         Parameters
140 |         ----------
141 |         :param node:
142 |         :param input_vals: List of two input values
143 | 
144 |         Returns
145 |         --------
146 |         :return: The result of the element-wise addition operation
147 |         """
148 |         assert len(input_vals) == 2
149 |         # return input_vals[0] + input_vals[1]
150 |         if use_numpy:
151 |             output_val[:] = input_vals[0] + input_vals[1]
152 |         else:
153 |             if input_vals[0].shape == input_vals[1].shape:
154 |                 gpu_op.matrix_elementwise_add(input_vals[0], input_vals[1], output_val)
155 |             elif input_vals[0].shape == (1,):
156 |                 const = input_vals[0].asnumpy()[0]  # TODO: (upul) do we need this ? check it?
157 |                 gpu_op.matrix_elementwise_add_by_const(input_vals[1], const, output_val)
158 |             elif input_vals[1].shape == (1,):
159 |                 const = input_vals[1].asnumpy()[0]  # TODO: (upul) do we need this ? check it?
160 | gpu_op.matrix_elementwise_add_by_const(input_vals[0], const, output_val) 161 | else: 162 | pass # TODO: (upul) handle input[0] and input[1] in different shapes 163 | 164 | def gradient(self, node, output_grads): 165 | """ 166 | Given the values of output gradients, calculate the gradients of input nodes 167 | 168 | Parameters 169 | ---------- 170 | :param node: 171 | :param output_grads: Gradient contribution of output nodes 172 | 173 | Returns 174 | ------- 175 | :return: A list of gradient contribution of output nodes 176 | """ 177 | return [output_grads, output_grads] 178 | 179 | def infer_shape(self, node, input_shapes): 180 | assert len(input_shapes) == 2 181 | assert input_shapes[0] == input_shapes[1] 182 | return input_shapes[0] 183 | 184 | 185 | class AddByConstOp(Op): 186 | """ 187 | Operator represents the element-wise addition of a node and a const 188 | """ 189 | 190 | def __call__(self, node_A, const_val): 191 | """ 192 | 193 | :param node: 194 | :param const_val: 195 | :return: 196 | """ 197 | new_node = Op.__call__(self) 198 | new_node.const = const_val 199 | new_node.inputs = [node_A] 200 | new_node.name = '({0:s}+{1:f})'.format(node_A.name, const_val) 201 | return new_node 202 | 203 | def compute(self, node, input_vals, output_val, use_numpy=True): 204 | """ 205 | 206 | :param node: 207 | :param input_vals: 208 | :return: 209 | """ 210 | assert len(input_vals) == 1 211 | if use_numpy: 212 | output_val[:] = node.const + input_vals[0] 213 | else: 214 | gpu_op.matrix_elementwise_add_by_const( 215 | input_vals[0], node.const, output_val) 216 | 217 | def gradient(self, node, output_grads): 218 | """ 219 | 220 | :param node: 221 | :param output_grads: 222 | :return: 223 | """ 224 | return [output_grads] 225 | 226 | def infer_shape(self, node, input_shapes): 227 | assert len(input_shapes) == 1 228 | # assert node.const.shape == input_shapes[0] 229 | return input_shapes[0] 230 | 231 | 232 | class SubOp(Op): 233 | def __call__(self, node_A, node_B): 234 | new_node = Op.__call__(self) 235 | new_node.inputs = [node_A, node_B] 236 | new_node.name = '({0:s}-{1:s})'.format(node_A.name, node_B.name) 237 | return new_node 238 | 239 | def compute(self, node, input_vals, output_val, use_numpy=True): 240 | assert len(input_vals) == 2 241 | if use_numpy: 242 | output_val[:] = input_vals[0] - input_vals[1] 243 | else: 244 | gpu_op.matrix_elementwise_subtract(input_vals[0], input_vals[1], output_val) 245 | 246 | def gradient(self, node, output_grads): 247 | return [output_grads, -1 * output_grads] 248 | 249 | def infer_shape(self, node, input_shapes): 250 | assert len(input_shapes) == 2 251 | assert input_shapes[0] == input_shapes[1] 252 | return input_shapes[0] 253 | 254 | 255 | class SubByConstOp(Op): 256 | def __call__(self, node_A, const_val): 257 | new_node = Op.__call__(self) 258 | new_node.inputs = [node_A] 259 | new_node.const = const_val 260 | new_node.name = '({0:s}-{1:f})'.format(node_A.name, const_val) 261 | return new_node 262 | 263 | def compute(self, node, input_vals, output_val, use_numpy=True): 264 | assert len(input_vals) == 1 265 | if use_numpy: 266 | output_val[:] = input_vals[0] - node.const 267 | else: 268 | gpu_op.matrix_elementwise_subtract_by_const(input_vals[0], node.const, output_val) 269 | 270 | def gradient(self, node, output_grads): 271 | return [output_grads] 272 | 273 | def infer_shape(self, node, input_shapes): 274 | assert len(input_shapes) == 1 275 | return input_shapes[0] 276 | 277 | 278 | class ReflectedSubByConstOp(Op): 279 | def __call__(self, 
node_A, const_val):
280 |         new_node = Op.__call__(self)
281 |         new_node.inputs = [node_A]
282 |         new_node.const = const_val
283 |         new_node.name = '({0:f}-{1:s})'.format(const_val, node_A.name)
284 |         return new_node
285 | 
286 |     def compute(self, node, input_vals, output_val, use_numpy=True):
287 |         assert len(input_vals) == 1
288 |         output_val[:] = node.const - input_vals[0]  # write in-place; no GPU kernel for reflected subtraction yet
289 | 
290 |     def gradient(self, node, output_grads):
291 |         return [-1 * output_grads]
292 | 
293 |     def infer_shape(self, node, input_shapes):
294 |         assert len(input_shapes) == 1
295 |         return input_shapes[0]
296 | 
297 | 
298 | class OnesLikeOp(Op):
299 |     def __call__(self, node_A):
300 |         new_node = Op.__call__(self)
301 |         new_node.inputs = [node_A]
302 |         new_node.name = 'Oneslike({})'.format(node_A.name)
303 |         return new_node
304 | 
305 |     def compute(self, node, input_vals, output_val, use_numpy=True):
306 |         assert len(input_vals) == 1
307 |         if use_numpy:
308 |             assert isinstance(input_vals[0], np.ndarray)
309 |             output_val[:] = np.ones(input_vals[0].shape)
310 |         else:
311 |             gpu_op.array_set(output_val, 1)
312 | 
313 |     def gradient(self, node, output_grads):
314 |         return [zeros_like(node.inputs[0])]
315 | 
316 |     def infer_shape(self, node, input_shapes):
317 |         assert len(input_shapes) == 1
318 |         if input_shapes[0] == 1:  # TODO (upul) do we need this if ?
319 |             return (1,)
320 |         else:
321 |             return input_shapes[0]
322 | 
323 | 
324 | class ZerosLikeOp(Op):
325 |     def __call__(self, node_A):
326 |         new_node = Op.__call__(self)
327 |         new_node.inputs = [node_A]
328 |         new_node.name = 'Zeroslike({})'.format(node_A.name)
329 |         return new_node
330 | 
331 |     def compute(self, node, input_vals, output_val, use_numpy=True):
332 |         assert len(input_vals) == 1
333 |         if use_numpy:
334 |             assert isinstance(input_vals[0], np.ndarray)
335 |             output_val[:] = np.zeros(input_vals[0].shape)
336 |         else:
337 |             gpu_op.array_set(output_val, 0)
338 | 
339 |     def gradient(self, node, output_grads):
340 |         return [zeros_like(node.inputs[0])]
341 | 
342 |     def infer_shape(self, node, input_shapes):
343 |         assert len(input_shapes) == 1
344 |         if input_shapes[0] == 1:  # TODO (upul) do we need this if ?
345 |             return (1,)
346 |         else:
347 |             return input_shapes[0]
348 | 
349 | 
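A handy way to sanity-check gradients of ops like the ones above is `eval_numerical_grad` from `numerical_gradient.py` (later in this tree). A minimal sketch on the CPU path, using an expression that exercises `ReflectedSubByConstOp` via `__rsub__`:

```python
import numpy as np
from aurora.autodiff import Variable, Executor, gradients, eval_numerical_grad

x = Variable(name='x')
y = (1.0 - x) * x                      # d/dx [(1 - x) * x] = 1 - 2x
grad_x, = gradients(y, [x])

x_val = np.array([0.1, 0.4])
analytic, = Executor([grad_x]).run(feed_shapes={x: x_val})
numeric = eval_numerical_grad(y, {x: x_val}, wrt=x)
# both should be close to [0.8, 0.2]
```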
350 | class ReshapeOp(Op):
351 |     def __call__(self, node_A, newshape):
352 |         new_node = Op.__call__(self)
353 |         new_node.inputs = [node_A]
354 |         new_node.newshape = newshape
355 |         new_node.name = 'Reshape({})'.format(node_A.name)
356 |         return new_node
357 | 
358 |     def compute(self, node, input_vals, output_val, use_numpy=True):
359 |         assert len(input_vals) == 1
360 |         if use_numpy:
361 |             assert isinstance(input_vals[0], np.ndarray)
362 |             output_val[:] = np.reshape(input_vals[0], newshape=node.newshape)
363 |         else:
364 |             # TODO: (upul) changing shape is not an expensive operation, but this
365 |             #     : looks a bit ugly. Can't we find an alternative approach?
366 |             input_shape = input_vals[0].shape
367 |             ndarray.reshape(output_val, input_shape)
368 |             input_vals[0].copyto(output_val)
369 |             ndarray.reshape(output_val, node.newshape)
370 | 
371 |     def gradient(self, node, output_grads):
372 |         return [reshape_grad(node.inputs[0], output_grads)]
373 | 
374 |     def infer_shape(self, node, input_shapes):
375 |         assert len(input_shapes) == 1
376 |         return node.newshape
377 | 
378 | 
379 | class ReshapeGradientOp(Op):
380 |     def __call__(self, node_A, node_B):
381 |         new_node = Op.__call__(self)
382 |         new_node.inputs = [node_A, node_B]
383 |         new_node.name = 'ReshapeGradientOp({0:s})'.format(node_A.name)
384 |         return new_node
385 | 
386 |     def compute(self, node, input_vals, output_val, use_numpy=True):
387 |         assert len(input_vals) == 2
388 |         if use_numpy:
389 |             output_val[:] = input_vals[1].reshape(input_vals[0].shape)
390 |         else:
391 |             # TODO: (upul) changing shape is not an expensive operation, but this
392 |             #     : looks a bit ugly. Can't we find an alternative approach?
393 |             ndarray.reshape(output_val, input_vals[0].shape)
394 |             input_vals[1].copyto(output_val)
395 | 
396 |     def gradient(self, node, output_grads):
397 |         raise NotImplementedError('Gradient of ReshapeGradientOp not supported')
398 | 
399 |     def infer_shape(self, node, input_shapes):
400 |         assert len(input_shapes) == 2
401 |         return input_shapes[0]
402 | 
403 | 
404 | class MulOp(Op):
405 |     def __call__(self, node_A, node_B):
406 |         new_node = Op.__call__(self)
407 |         new_node.inputs = [node_A, node_B]
408 |         new_node.name = '({0:s}*{1:s})'.format(node_A.name, node_B.name)
409 |         return new_node
410 | 
411 |     def compute(self, node, input_vals, output_val, use_numpy=True):
412 |         assert len(input_vals) == 2
413 |         if use_numpy:
414 |             output_val[:] = input_vals[0] * input_vals[1]
415 |         else:
416 |             ip_1_shape = input_vals[0].shape
417 |             ip_2_shape = input_vals[1].shape
418 |             if ip_1_shape == ip_2_shape:
419 |                 gpu_op.matrix_elementwise_multiply(input_vals[0], input_vals[1], output_val)
420 |             elif ip_1_shape == (1,):
421 |                 const_val = input_vals[0].asnumpy()[0]
422 |                 gpu_op.matrix_elementwise_multiply_by_const(input_vals[1], const_val, output_val)
423 |             elif ip_2_shape == (1,):
424 |                 const_val = input_vals[1].asnumpy()[0]
425 |                 gpu_op.matrix_elementwise_multiply_by_const(input_vals[0], const_val, output_val)
426 |             else:
427 |                 pass  # TODO (upul) handle ip_1_shape != ip_2_shape
428 | 
429 |     def gradient(self, node, output_grads):
430 |         return [node.inputs[1] * output_grads, node.inputs[0] * output_grads]
431 | 
432 |     def infer_shape(self, node, input_shapes):
433 |         assert len(input_shapes) == 2
434 |         if input_shapes[0] == (1,):
435 |             return input_shapes[1]
436 |         elif input_shapes[1] == (1,):
437 |             return input_shapes[0]
438 |         elif input_shapes[0] == input_shapes[1]:
439 |             return input_shapes[0]
440 |         else:
441 |             stmt = 'Invalid dimensions {0}, {1}'.format(input_shapes[0], input_shapes[1])
442 |             raise RuntimeError(stmt)
443 | 
444 | 
445 | class MulByConstOp(Op):
446 |     def __call__(self, node_A, const_val):
447 |         new_node = Op.__call__(self)
448 |         new_node.inputs = [node_A]
449 |         new_node.const = const_val
450 |         new_node.name = '({0:s}*{1:f})'.format(node_A.name, const_val)
451 |         return new_node
452 | 
453 |     def compute(self, node, input_vals, output_val, use_numpy=True):
454 |         assert len(input_vals) == 1
455 |         if use_numpy:
456 |             output_val[:] = node.const * input_vals[0]
457 |         else:
458 |             gpu_op.matrix_elementwise_multiply_by_const(
459 |                 input_vals[0], node.const, output_val)
460 | 
461 |     def gradient(self, node, output_grads):
462 |         return [node.const * output_grads]
463 | 
464 |     def infer_shape(self, node, input_shapes):
465 |         assert len(input_shapes) == 1
466 |         return input_shapes[0]
467 | 
468 | 
469 | class DivOp(Op):
470 |     def __call__(self, node_A, node_B):
471 |         new_node = Op.__call__(self)
472 |         new_node.inputs = [node_A, node_B]
473 |         new_node.name = '({0:s}/{1:s})'.format(node_A.name, node_B.name)
474 |         return new_node
475 | 
476 |     def compute(self, node, input_vals, output_val, use_numpy=True):
477 |         assert len(input_vals) == 2
478 |         if use_numpy:
479 |             output_val[:] = input_vals[0] / input_vals[1]
480 |         else:
481 |             gpu_op.matrix_elementwise_division(input_vals[0], input_vals[1], output_val)
482 | 
483 |     def gradient(self, node, output_grads):
484 |         grad_A = output_grads / node.inputs[1]
485 |         grad_B = -1.0 * output_grads * node.inputs[0] / (node.inputs[1] * node.inputs[1])
486 |         return [grad_A, grad_B]
487 | 
488 |     def infer_shape(self, node, input_shapes):
489 |         assert len(input_shapes) == 2
490 |         assert input_shapes[0] == input_shapes[1]
491 |         return input_shapes[0]
492 | 
493 | 
494 | class DivByConstOp(Op):
495 |     def __call__(self, node_A, const_val):
496 |         new_node = Op.__call__(self)
497 |         new_node.inputs = [node_A]
498 |         new_node.const = const_val
499 |         new_node.name = '({0:s}/{1:f})'.format(node_A.name, const_val)
500 |         return new_node
501 | 
502 |     def compute(self, node, input_vals, output_val, use_numpy=True):
503 |         assert len(input_vals) == 1
504 |         if use_numpy:
505 |             output_val[:] = input_vals[0] / node.const
506 |         else:
507 |             gpu_op.matrix_elementwise_div_by_const(input_vals[0], node.const, output_val)
508 | 
509 |     def gradient(self, node, output_grads):
510 |         return [output_grads / node.const]
511 | 
512 |     def infer_shape(self, node, input_shapes):
513 |         assert len(input_shapes) == 1
514 |         return input_shapes[0]
515 | 
516 | 
517 | class PlaceholderOp(Op):
518 |     """Op to feed values to nodes."""
519 | 
520 |     def __call__(self):
521 |         """Creates a variable node."""
522 |         new_node = Op.__call__(self)
523 |         return new_node
524 | 
525 |     def compute(self, node, input_vals, output_val, use_numpy=True):
526 |         """No compute function since node value is fed directly in Executor."""
527 |         assert False, "placeholder values provided by feed_dict"
528 | 
529 |     def gradient(self, node, output_grad):
530 |         """No gradient function since node has no inputs."""
531 |         return None
532 | 
533 | 
534 | class ReduceSumOp(Op):
535 |     """
536 |     Op to sum an array over its first (batch) axis
537 |     """
538 | 
539 |     def __call__(self, node_A):
540 |         new_node = Op.__call__(self)
541 |         new_node.inputs = [node_A]
542 |         new_node.name = 'ReduceSum({0:s})'.format(node_A.name)
543 |         return new_node
544 | 
545 |     def compute(self, node, input_vals, output_val, use_numpy=True):
546 |         """
547 |         Sums input_vals[0] over axis 0 into output_val
548 |         :param node:
549 |         :param input_vals:
550 |         :param output_val:
551 |         :param use_numpy:
552 |         :return:
553 |         """
554 |         assert len(input_vals) == 1
55 |         if use_numpy:
556 |             assert isinstance(output_val, np.ndarray)
557 |             output_val[:] = np.sum(input_vals[0], axis=0)
558 |         else:
559 |             gpu_op.reduce_sum_axis_zero(input_vals[0], output_val)
560 | 
561 |     def gradient(self, node, output_grads):
562 |         return [broadcast_to(output_grads, node.inputs[0])]  # expand the gradient back to the input's shape
563 | 
564 |     def infer_shape(self, node, input_shapes):
565 |         assert len(input_shapes) == 1
566 |         if len(input_shapes[0]) == 1:
567 |             return (1,)
568 |         else:
569 |             return tuple(input_shapes[0][i]
570 |                          for i in range(1, len(input_shapes[0])))
571 | 
572 | 
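`reduce_sum` and the `broadcast_to` op below are duals: broadcasting replicates a row across axis 0 and summing over axis 0 undoes it, which is exactly how each serves as the other's gradient. A small CPU-path sketch:

```python
import numpy as np
from aurora.autodiff import Variable, Executor, broadcast_to, reduce_sum

b = Variable(name='b')    # e.g. a bias of shape (3,)
X = Variable(name='X')    # a batch of shape (4, 3)
B = broadcast_to(b, X)    # replicate b across the batch axis -> (4, 3)
S = reduce_sum(B)         # sum over axis 0 -> back to shape (3,)

out, = Executor([S]).run(feed_shapes={b: np.ones(3), X: np.zeros((4, 3))})
# out -> [4., 4., 4.]
```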
573 | class BroadcastToOp(Op):
574 |     def __call__(self, node_A, node_B):
575 |         new_node = Op.__call__(self)
576 |         new_node.inputs = [node_A, node_B]
577 |         new_node.name = 'BroadcastTo({0:s}, {1:s}.shape)'.format(node_A.name, node_B.name)
578 |         return new_node
579 | 
580 |     def compute(self, node, input_vals, output_val, use_numpy=True):
581 |         assert len(input_vals) == 2
582 |         if use_numpy:
583 |             output_val[:] = np.broadcast_to(input_vals[0], input_vals[1].shape)
584 |         else:
585 |             gpu_op.broadcast_to(input_vals[0], output_val)
586 | 
587 |     def gradient(self, node, output_grads):
588 |         grad_A = reduce_sum(output_grads)
589 |         grad_B = zeros_like(node.inputs[1])
590 |         return [grad_A, grad_B]
591 | 
592 |     def infer_shape(self, node, input_shapes):
593 |         assert len(input_shapes) == 2
594 |         return input_shapes[1]
595 | 
596 | 
597 | class MatMulOp(Op):  # TODO: (upul) double check what this class is doing
598 |     def __call__(self, node_A, node_B, trans_A=False, trans_B=False):
599 |         new_node = Op.__call__(self)
600 |         new_node.inputs = [node_A, node_B]
601 |         new_node.trans_A = trans_A
602 |         new_node.trans_B = trans_B
603 |         new_node.name = 'MatMul({0:s}, {1:s})'.format(node_A.name, node_B.name)
604 |         return new_node
605 | 
606 |     def compute(self, node, input_vals, output_val, use_numpy=True):
607 |         assert len(input_vals) == 2
608 |         if use_numpy:
609 |             if node.trans_A:
610 |                 input_vals[0] = input_vals[0].T
611 |             if node.trans_B:
612 |                 input_vals[1] = input_vals[1].T
613 |             output_val[:] = np.dot(input_vals[0], input_vals[1])
614 |         else:
615 |             gpu_op.matrix_multiply(
616 |                 input_vals[0], node.trans_A,
617 |                 input_vals[1], node.trans_B,
618 |                 output_val)
619 | 
620 |     def gradient(self, node, output_grads):
621 |         grad_A = matmul(output_grads, node.inputs[1], trans_A=False, trans_B=True)
622 |         grad_B = matmul(node.inputs[0], output_grads, trans_A=True, trans_B=False)
623 |         return [grad_A, grad_B]
624 | 
625 |     def infer_shape(self, node, input_shapes):
626 |         """Need to handle input_vals[0].shape != input_vals[1].shape"""
627 |         assert len(input_shapes) == 2
628 |         (row_A, col_A) = input_shapes[0]
629 |         if node.trans_A:
630 |             row_A, col_A = col_A, row_A
631 |         (row_B, col_B) = input_shapes[1]
632 |         if node.trans_B:
633 |             row_B, col_B = col_B, row_B
634 | 
635 |         assert col_A == row_B
636 |         return (row_A, col_B)
637 | 
638 | 
639 | def Variable(name):
640 |     """User-defined variables in an expression.
641 |     e.g. x = Variable(name="x")
642 |     """
643 |     placeholder_node = placeholder()
644 |     placeholder_node.name = name
645 |     return placeholder_node
646 | 
647 | 
648 | def Parameter(name, init):
649 |     """
650 |     example: w = Parameter(name='w', init=...)
651 | :param name: 652 | :param init: 653 | :return: 654 | """ 655 | parameter_node = placeholder() 656 | parameter_node.name = name 657 | parameter_node.const = init 658 | return parameter_node 659 | 660 | 661 | # Global singleton operations 662 | add = AddOp() 663 | add_const = AddByConstOp() 664 | sub = SubOp() 665 | sub_const = SubByConstOp() 666 | ref_sub_const = ReflectedSubByConstOp() 667 | mul = MulOp() 668 | mul_const = MulByConstOp() 669 | div = DivOp() 670 | div_const = DivByConstOp() 671 | zeros_like = ZerosLikeOp() 672 | ones_like = OnesLikeOp() 673 | reduce_sum = ReduceSumOp() 674 | broadcast_to = BroadcastToOp() 675 | reshape = ReshapeOp() 676 | reshape_grad = ReshapeGradientOp() 677 | matmul = MatMulOp() 678 | placeholder = PlaceholderOp() 679 | -------------------------------------------------------------------------------- /aurora/autodiff/executor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from aurora.autodiff.autodiff import PlaceholderOp 3 | from .utils import find_topo_sort 4 | try: 5 | from aurora.ndarray import gpu_op, ndarray 6 | except ImportError: 7 | pass 8 | 9 | 10 | class Executor: 11 | """ 12 | 13 | """ 14 | 15 | def __init__(self, eval_list, use_gpu=False): 16 | """ 17 | Executor computes values for a given subset of nodes in a computation graph. 18 | 19 | Parameters: 20 | ----------- 21 | :param eval_list: Values of the nodes of this list need to be computed 22 | """ 23 | self.eval_node_list = eval_list 24 | self.ctx = None 25 | if use_gpu: 26 | self.ctx = ndarray.gpu(0) 27 | 28 | self.topo_order = find_topo_sort(self.eval_node_list) 29 | self.node_to_arr_map = None 30 | self.node_to_shape_map = None 31 | self.feed_shapes = None 32 | 33 | def infer_shape(self, feed_shapes): 34 | """ 35 | Given the shapes of the feed_shapes dictionary, we infer shapes of all nodes in the graph 36 | :param feed_shapes: 37 | :return: 38 | """ 39 | self.node_to_shape_map = {} 40 | for node in self.topo_order: 41 | if node in self.node_to_shape_map: 42 | continue 43 | 44 | # TODO (upul): following if condition looks like a hack. 
Find a better approach
45 |             if isinstance(node.op, PlaceholderOp) and node.const is not None:
46 |                 self.node_to_shape_map[node] = node.const.shape
47 |                 continue
48 | 
49 |             if node in feed_shapes:
50 |                 self.node_to_shape_map[node] = feed_shapes[node]
51 |             else:
52 |                 input_shapes = []
53 |                 for input_node in node.inputs:
54 |                     input_shapes.append(self.node_to_shape_map[input_node])
55 | 
56 |                 self.node_to_shape_map[node] = node.op.infer_shape(node, input_shapes)
57 | 
58 |     def memory_plan(self, feed_shapes):
59 |         """
60 |         Allocates output buffers for every non-placeholder node in the graph
61 |         :param feed_shapes:
62 |         :return:
63 |         """
64 |         # topo_order = find_topo_sort(self.eval_node_list)  # TODO (upul) cache this
65 |         # self.node_to_arr_map = {}
66 |         # for node in topo_order:
67 |         #     self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx)
68 | 
69 |         if self.node_to_arr_map is None:
70 |             self.node_to_arr_map = {}
71 | 
72 |         for node in self.topo_order:
73 |             if node in feed_shapes:
74 |                 continue
75 |             self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx)
76 | 
77 |     def run(self, feed_shapes, convert_to_numpy_ret_vals=False):
78 |         """
79 |         Values of the nodes given in eval_list are evaluated against the supplied feed dictionary
80 | 
81 |         Parameters
82 |         ----------
83 |         :param feed_shapes: A dictionary of nodes whose values are specified by the user
84 | 
85 |         Returns
86 |         -------
87 |         :return: Values of the nodes specified by the eval_list
88 |         """
89 |         def are_feed_shapes_equal(sa, sb):
90 |             if (not isinstance(sa, dict)) or (not isinstance(sb, dict)):
91 |                 return False
92 |             unmatched_item = set(sa.items()) ^ set(sb.items())
93 |             return len(unmatched_item) == 0
94 | 
95 |         # Assume self.ctx is None implies numpy array and numpy ops.
96 |         use_numpy = self.ctx is None
97 |         node_to_val_map = {}
98 |         for node, value in feed_shapes.items():
99 |             if use_numpy:
100 |                 # all values passed in feed_dict must be np.ndarray
101 |                 assert isinstance(value, np.ndarray)
102 |                 node_to_val_map[node] = value
103 |             else:
104 |                 # convert values to ndarray.NDArray if necessary
105 |                 if isinstance(value, np.ndarray):
106 |                     node_to_val_map[node] = ndarray.array(value, ctx=self.ctx)
107 |                 elif isinstance(value, ndarray.NDArray):
108 |                     node_to_val_map[node] = value
109 |                 else:
110 |                     assert False, "feed_dict value type not supported"
111 | 
112 |         # collect shapes for all placeholders
113 |         feed_shapes = {}
114 |         for node in node_to_val_map:
115 |             feed_shapes[node] = node_to_val_map[node].shape
116 | 
117 |         # infer shape if feed_shapes changed since last run
118 |         # e.g. call run() on test data after training
119 |         if (not are_feed_shapes_equal(feed_shapes, self.feed_shapes)):
120 |             self.infer_shape(feed_shapes)
121 |             self.feed_shapes = feed_shapes
122 |             # plan memory if using GPU
123 |             if (not use_numpy):
124 |                 self.memory_plan(feed_shapes)
125 | 
126 |         # Traverse graph in topo order and compute values for all nodes.
127 |         for node in self.topo_order:
128 |             if node in node_to_val_map:
129 |                 # Skip placeholder nodes. Values already provided by feed_dict.
130 |                 continue
131 | 
132 |             # TODO (upul): following if condition looks like a hack. Find a better approach
133 |             if isinstance(node.op, PlaceholderOp) and node.const is not None:
134 |                 node_to_val_map[node] = node.const
135 |                 continue
136 | 
137 |             input_vals = [node_to_val_map[n] for n in node.inputs]
138 |             if use_numpy:
139 |                 node_val = np.empty(shape=self.node_to_shape_map[node])
140 |             else:
141 |                 node_val = self.node_to_arr_map[node]
142 |             # node_val is modified in-place whether np.ndarray or NDArray
143 |             node.op.compute(node, input_vals, node_val, use_numpy)
144 |             node_to_val_map[node] = node_val
145 | 
146 |         # Collect node values.
147 |         if not use_numpy and convert_to_numpy_ret_vals:
148 |             return [node_to_val_map[n].asnumpy() for n in self.eval_node_list]
149 | 
150 |         return [node_to_val_map[n] for n in self.eval_node_list]
151 | 
152 |     @staticmethod
153 |     def _are_feed_shapes_equal(sa, sb):
154 |         if (not isinstance(sa, dict)) or (not isinstance(sb, dict)):
155 |             return False
156 |         unmatched_items = set(sa.items()) ^ set(sb.items())
157 |         return len(unmatched_items) == 0
158 | 
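Because `run()` re-infers shapes (and, on the GPU, re-plans memory) only when the fed shapes change, reusing one `Executor` across a training loop is cheap. A small CPU-path sketch:

```python
import numpy as np
from aurora.autodiff import Variable, Executor

x = Variable(name='x')
y = x + 1.0

executor = Executor([y])                                # pass use_gpu=True to plan GPU buffers instead
out, = executor.run(feed_shapes={x: np.zeros((8, 4))})  # first call: shapes inferred
out, = executor.run(feed_shapes={x: np.ones((8, 4))})   # same shapes: cached shape map reused
out, = executor.run(feed_shapes={x: np.ones((2, 4))})   # new shapes: infer_shape runs again
```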
--------------------------------------------------------------------------------
/aurora/autodiff/gradients.py:
--------------------------------------------------------------------------------
1 | from .utils import sum_node_list
2 | from .utils import find_topo_sort
3 | from .autodiff import ones_like
4 | 
5 | # TODO: (upul) clean up and improve comments
6 | def gradients(output_node, node_list):
7 |     # a map from node to a list of gradient contributions from each output node
8 |     node_to_output_grads_list = {}
9 |     # Special note on initializing gradient of output_node as oneslike_op(output_node):
10 |     # We are really taking a derivative of the scalar reduce_sum(output_node)
11 |     # instead of the vector output_node. But this is the common case for loss functions.
12 |     node_to_output_grads_list[output_node] = [ones_like(output_node)]
13 |     # a map from node to the gradient of that node
14 |     node_to_output_grad = {}
15 |     # Traverse graph in reverse topological order given the output_node that we are taking gradient wrt.
16 |     reverse_topo_order = reversed(find_topo_sort([output_node]))
17 |     for node in reverse_topo_order:
18 |         output_grad = sum_node_list(node_to_output_grads_list[node])
19 |         node_to_output_grad[node] = output_grad
20 | 
21 |         input_grads_list = node.op.gradient(node, output_grad)
22 |         for i in range(len(node.inputs)):
23 |             if node.inputs[i] not in node_to_output_grads_list:
24 |                 node_to_output_grads_list[node.inputs[i]] = []
25 |             node_to_output_grads_list[node.inputs[i]].append(input_grads_list[i])
26 | 
27 |     # Collect results for gradients requested.
28 | grad_node_list = [node_to_output_grad[node] for node in node_list] 29 | return grad_node_list 30 | -------------------------------------------------------------------------------- /aurora/autodiff/math.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from aurora.autodiff.autodiff import Op 4 | 5 | 6 | class TanhOp(Op): 7 | """ 8 | Tanh Activation function 9 | 10 | """ 11 | 12 | def __call__(self, node_A): 13 | new_node = Op.__call__(self) 14 | new_node.inputs = [node_A] 15 | new_node.name = 'Tanh({0:s})'.format(node_A.name) 16 | return new_node 17 | 18 | def compute(self, node, input_vals, output_val, use_numpy=True): 19 | assert len(input_vals) == 1 20 | if use_numpy: 21 | output_val[:] = np.tanh(input_vals[0]) 22 | else: 23 | raise NotImplementedError('GPU version of TanhOp not yet implemented') 24 | 25 | def gradient(self, node, output_grads): 26 | x = node.inputs[0] 27 | g = 1 - (tanh(x) * tanh(x)) 28 | return [g * output_grads] 29 | 30 | def infer_shape(self, node, input_shapes): 31 | assert len(input_shapes) 32 | return input_shapes[0] 33 | 34 | 35 | # Global singleton operations 36 | tanh = TanhOp() 37 | 38 | # TODO: (upul) other basic math functions such as sin, cos, min, max, and etc 39 | -------------------------------------------------------------------------------- /aurora/autodiff/numerical_gradient.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .executor import Executor 3 | 4 | 5 | def eval_numerical_grad(f, feed_dict, wrt, h=1e-5): 6 | wrt_val = feed_dict[wrt] 7 | grad = np.zeros_like(wrt_val) 8 | 9 | it = np.nditer(wrt_val, flags=['multi_index'], op_flags=['readwrite']) 10 | while not it.finished: 11 | ix = it.multi_index 12 | old_val = wrt_val[ix] 13 | wrt_val[ix] = old_val + h 14 | executor = Executor([f]) 15 | feed_dict[wrt] = wrt_val 16 | 17 | result_plus, = executor.run(feed_shapes=feed_dict) 18 | wrt_val[ix] = old_val - h 19 | executor = Executor([f]) 20 | 21 | feed_dict[wrt] = wrt_val 22 | result_minus, = executor.run(feed_shapes=feed_dict) 23 | 24 | grad[ix] = np.sum((result_plus - result_minus) / (2.0 * h)) 25 | 26 | wrt_val[ix] = old_val 27 | feed_dict[wrt] = wrt_val 28 | it.iternext() 29 | return grad 30 | -------------------------------------------------------------------------------- /aurora/autodiff/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def find_topo_sort(node_list): 5 | """ 6 | 7 | :param node_list: 8 | :return: 9 | """ 10 | visited = set() 11 | topo_order = [] 12 | for node in node_list: 13 | depth_first_search(node, visited, topo_order) 14 | return topo_order 15 | 16 | 17 | def depth_first_search(node, visited, topo_order): 18 | """ 19 | 20 | :param node: 21 | :param visited: 22 | :param topo_order: 23 | :return: 24 | """ 25 | if node in visited: 26 | return 27 | visited.add(node) 28 | for n in node.inputs: 29 | depth_first_search(n, visited, topo_order) 30 | topo_order.append(node) 31 | 32 | 33 | def sum_node_list(node_list): 34 | """ 35 | Custom sum function in order to avoid 36 | create redundant nodes in Python sum implementation 37 | :param node_list: 38 | :return: 39 | """ 40 | from operator import add 41 | from functools import reduce 42 | return reduce(add, node_list) 43 | -------------------------------------------------------------------------------- /aurora/datasets/__init__.py: 
--------------------------------------------------------------------------------
1 | from .synthetic import spiral
2 | from .mnist import MNIST
3 | 
4 | __all__ = ['spiral', 'MNIST']
--------------------------------------------------------------------------------
/aurora/datasets/data/mnist.pkl.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/aurora/datasets/data/mnist.pkl.gz
--------------------------------------------------------------------------------
/aurora/datasets/mnist.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gzip
3 | import pickle
4 | import os
5 | 
6 | 
7 | class MNIST:
8 |     def __init__(self, batch_size):
9 |         self.batch_size = batch_size
10 | 
11 |         train, valid, test = self._load_data()
12 |         self.X_train, self.y_train = train[0], train[1]
13 | 
14 |         # encoding y_train using one-hot encoding
15 |         self.y_train_one_hot = np.zeros((self.y_train.shape[0], 10))
16 |         self.y_train_one_hot[np.arange(self.y_train.shape[0]), self.y_train] = 1
17 | 
18 |         self.X_valid, self.y_valid = valid[0], valid[1]
19 |         self.X_test, self.y_test = test[0], test[1]
20 | 
21 |     def train_batch_generator(self):
22 |         while True:
23 |             rand_indices = np.random.choice(self.X_train.shape[0], self.batch_size, False)
24 |             yield self.X_train[rand_indices], self.y_train_one_hot[rand_indices]
25 | 
26 |     def validation(self):
27 |         return self.X_valid, self.y_valid
28 | 
29 |     def testing(self):
30 |         return self.X_test, self.y_test
31 | 
32 |     def num_features(self):
33 |         return self.X_train.shape[1]
34 | 
35 |     def _load_data(self):
36 |         script_dir = os.path.dirname(__file__)
37 |         mnist_file = os.path.join(os.path.join(script_dir, 'data'), 'mnist.pkl.gz')
38 | 
39 |         with gzip.open(mnist_file, 'rb') as mnist_file:
40 |             u = pickle._Unpickler(mnist_file)
41 |             u.encoding = 'latin1'
42 |             train, val, test = u.load()
43 |         return train, val, test
--------------------------------------------------------------------------------
/aurora/datasets/synthetic.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | # Generates `num_cls` interleaved spirals of `points_per_cls` two-dimensional points each.
5 | def spiral(num_cls=3, dim=2, points_per_cls=100, rnd_state=1024):
6 |     np.random.seed(rnd_state)
7 |     X_data = np.zeros((points_per_cls * num_cls, dim))
8 |     y_data = np.zeros(points_per_cls * num_cls, dtype='uint8')
9 |     for j in range(num_cls):
10 |         ix = range(points_per_cls * j, points_per_cls * (j + 1))
11 |         r = np.linspace(0.0, 1, points_per_cls)
12 |         t = np.linspace(j * 4, (j + 1) * 4, points_per_cls) + np.random.randn(points_per_cls) * 0.2  # theta
13 |         X_data[ix] = np.c_[r * np.sin(t), r * np.cos(t)]
14 |         y_data[ix] = j
15 | 
16 |     y_data_encoded = np.zeros((points_per_cls * num_cls, num_cls))
17 |     y_data_encoded[range(points_per_cls * num_cls), y_data] = 1
18 |     return X_data, y_data, y_data_encoded
19 | 
--------------------------------------------------------------------------------
/aurora/ndarray/__init__.py:
--------------------------------------------------------------------------------
1 | from . import ndarray
2 | from . 
import gpu_op 3 | 4 | __all__ = ['ndarray', 'gpu_op'] -------------------------------------------------------------------------------- /aurora/ndarray/_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name 3 | """ ctypes library of dlsys and helper functions """ 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import ctypes 8 | from pathlib import Path 9 | 10 | 11 | def _load_lib(): 12 | """Load libary in build/lib.""" 13 | lib_root = Path(__file__).parents[2] 14 | lib_path = os.path.join(lib_root, 'cuda/build/lib/') 15 | path_to_so_file = os.path.join(lib_path, "libc_runtime_api.so") 16 | lib = ctypes.CDLL(path_to_so_file, ctypes.RTLD_GLOBAL) 17 | return lib 18 | 19 | 20 | # global library instance 21 | try: 22 | _LIB = _load_lib() 23 | except: 24 | # TODO: (upul) Do we need to log the error message? 25 | pass 26 | 27 | 28 | ################## 29 | # Helper Methods # 30 | ################## 31 | 32 | def check_call(ret): 33 | """Check the return value of C API call 34 | 35 | This function will crash when error occurs. 36 | Wrap every API call with this function 37 | 38 | Parameters 39 | ---------- 40 | ret : int 41 | return value from API calls 42 | """ 43 | assert (ret == 0) 44 | 45 | 46 | def c_array(ctype, values): 47 | """Create ctypes array from a python array 48 | 49 | Parameters 50 | ---------- 51 | ctype : ctypes data type 52 | data type of the array we want to convert to 53 | 54 | values : tuple or list 55 | data content 56 | 57 | Returns 58 | ------- 59 | out : ctypes array 60 | Created ctypes array 61 | """ 62 | return (ctype * len(values))(*values) 63 | -------------------------------------------------------------------------------- /aurora/ndarray/gpu_op.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from ._base import _LIB 5 | from . 
import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float(value)) 11 | 12 | 13 | def broadcast_to(in_arr, out_arr): 14 | assert isinstance(in_arr, _nd.NDArray) 15 | assert isinstance(out_arr, _nd.NDArray) 16 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle) 17 | 18 | 19 | def reduce_sum_axis_zero(in_arr, out_arr): 20 | assert isinstance(in_arr, _nd.NDArray) 21 | assert isinstance(out_arr, _nd.NDArray) 22 | _LIB.DLGpuReduceSumAxisZero(in_arr.handle, out_arr.handle) 23 | 24 | 25 | def matrix_elementwise_add(matA, matB, matC): 26 | assert isinstance(matA, _nd.NDArray) 27 | assert isinstance(matB, _nd.NDArray) 28 | assert isinstance(matC, _nd.NDArray) 29 | _LIB.DLGpuMatrixElementwiseAdd(matA.handle, matB.handle, matC.handle) 30 | 31 | 32 | def matrix_elementwise_add_by_const(in_mat, val, out_mat): 33 | assert isinstance(in_mat, _nd.NDArray) 34 | assert isinstance(out_mat, _nd.NDArray) 35 | _LIB.DLGpuMatrixElementwiseAddByConst( 36 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 37 | 38 | 39 | def matrix_elementwise_subtract(matA, matB, matC): 40 | assert isinstance(matA, _nd.NDArray) 41 | assert isinstance(matB, _nd.NDArray) 42 | assert isinstance(matC, _nd.NDArray) 43 | _LIB.DLGpuMatrixElementwiseSubtract(matA.handle, matB.handle, matC.handle) 44 | 45 | 46 | def matrix_elementwise_subtract_by_const(in_mat, val, out_mat): 47 | assert isinstance(in_mat, _nd.NDArray) 48 | assert isinstance(out_mat, _nd.NDArray) 49 | _LIB.DLGpuMatrixElementwiseSubtractByConst( 50 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 51 | 52 | 53 | def matrix_elementwise_multiply(matA, matB, matC): 54 | assert isinstance(matA, _nd.NDArray) 55 | assert isinstance(matB, _nd.NDArray) 56 | assert isinstance(matC, _nd.NDArray) 57 | _LIB.DLGpuMatrixElementwiseMultiply( 58 | matA.handle, matB.handle, matC.handle) 59 | 60 | 61 | def matrix_elementwise_multiply_by_const(in_mat, val, out_mat): 62 | assert isinstance(in_mat, _nd.NDArray) 63 | assert isinstance(out_mat, _nd.NDArray) 64 | _LIB.DLGpuMatrixMultiplyByConst( 65 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 66 | 67 | 68 | def matrix_elementwise_division(matA, matB, matC): 69 | assert isinstance(matA, _nd.NDArray) 70 | assert isinstance(matB, _nd.NDArray) 71 | assert isinstance(matC, _nd.NDArray) 72 | _LIB.DLGpuMatrixElementwiseDiv( 73 | matA.handle, matB.handle, matC.handle) 74 | 75 | 76 | def matrix_elementwise_div_by_const(in_mat, val, out_mat): 77 | assert isinstance(in_mat, _nd.NDArray) 78 | assert isinstance(out_mat, _nd.NDArray) 79 | _LIB.DLGpuMatrixElementwiseDivByConst( 80 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 81 | 82 | 83 | def matrix_elementwise_sqrt(in_mat, out_mat): 84 | assert isinstance(in_mat, _nd.NDArray) 85 | assert isinstance(out_mat, _nd.NDArray) 86 | _LIB.DLGpuMatrixElementwiseSqrt(in_mat.handle, out_mat.handle) 87 | 88 | 89 | def matrix_multiply(matA, transA, matB, transB, matC): 90 | assert isinstance(matA, _nd.NDArray) 91 | assert isinstance(matB, _nd.NDArray) 92 | assert isinstance(matC, _nd.NDArray) 93 | _LIB.DLGpuMatrixMultiply( 94 | matA.handle, transA, matB.handle, transB, matC.handle) 95 | 96 | 97 | def relu(in_arr, out_arr): 98 | assert isinstance(in_arr, _nd.NDArray) 99 | assert isinstance(out_arr, _nd.NDArray) 100 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle) 101 | 102 | 103 | def relu_gradient(in_arr, in_grad_arr, out_arr): 104 | assert isinstance(in_arr, _nd.NDArray) 105 | assert isinstance(in_grad_arr, 
_nd.NDArray)
106 |     assert isinstance(out_arr, _nd.NDArray)
107 |     _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle)
108 |
109 |
110 | def softmax(in_arr, out_arr):
111 |     assert isinstance(in_arr, _nd.NDArray)
112 |     assert isinstance(out_arr, _nd.NDArray)
113 |     _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle)
114 |
115 |
116 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr):
117 |     assert isinstance(in_arr_a, _nd.NDArray)
118 |     assert isinstance(in_arr_b, _nd.NDArray)
119 |     assert isinstance(out_arr, _nd.NDArray)
120 |     _LIB.DLGpuSoftmaxCrossEntropy(
121 |         in_arr_a.handle, in_arr_b.handle, out_arr.handle)
122 |
123 |
124 | def cudnn_relu_forward(in_array, out_array):
125 |     assert isinstance(in_array, _nd.NDArray)
126 |     assert isinstance(out_array, _nd.NDArray)
127 |     _LIB.cudnnReLUForward(in_array.handle, out_array.handle)
128 |
129 |
130 | def cudnn_conv2d_forward(input, filter, bias, stride_height, stride_width, padding_height, padding_width, output):
131 |     assert isinstance(input, _nd.NDArray)
132 |     assert isinstance(filter, _nd.NDArray)
133 |     assert isinstance(bias, _nd.NDArray)
134 |     assert isinstance(stride_height, int)
135 |     assert isinstance(stride_width, int)
136 |     assert isinstance(padding_height, int)
137 |     assert isinstance(padding_width, int)
138 |     assert isinstance(output, _nd.NDArray)
139 |     _LIB.cudnnConv2DForward(input.handle, filter.handle,
140 |                             bias.handle,
141 |                             stride_height, stride_width,
142 |                             padding_height, padding_width,
143 |                             output.handle)
144 |
145 |
146 | def cudnn_pool_forward(input,
147 |                        pooling_height, pooling_width,
148 |                        stride_height, stride_width,
149 |                        mode,
150 |                        output):
151 |     assert isinstance(input, _nd.NDArray)
152 |     assert isinstance(stride_height, int)
153 |     assert isinstance(stride_width, int)
154 |     assert isinstance(pooling_height, int)
155 |     assert isinstance(pooling_width, int)
156 |     assert isinstance(mode, str)
157 |     assert isinstance(output, _nd.NDArray)
158 |
159 |     mode = mode.encode('utf-8')
160 |
161 |     _LIB.cudnnPoolForward(input.handle,
162 |                           pooling_height, pooling_width,  # pooling sizes come before strides, per cudnnPoolForward in c_runtime_api.h
163 |                           stride_height, stride_width,
164 |                           ctypes.c_char_p(mode),
165 |                           output.handle)
166 |
167 |
168 | def cudnn_pool_backward(input,
169 |                         output_grads,
170 |                         output,
171 |                         pooling_height, pooling_width,
172 |                         stride_height, stride_width,
173 |                         mode,
174 |                         pool_grad):
175 |     assert isinstance(input, _nd.NDArray)
176 |     assert isinstance(output_grads, _nd.NDArray)
177 |     assert isinstance(output, _nd.NDArray)
178 |     assert isinstance(pool_grad, _nd.NDArray)
179 |
180 |     assert isinstance(pooling_height, int)
181 |     assert isinstance(pooling_width, int)
182 |     assert isinstance(stride_height, int)
183 |     assert isinstance(stride_width, int)
184 |
185 |     mode = mode.encode('utf-8')
186 |
187 |     _LIB.cudnnPoolBackward(input.handle,
188 |                            output_grads.handle,
189 |                            output.handle,
190 |                            pooling_height, pooling_width,
191 |                            stride_height, stride_width,
192 |                            ctypes.c_char_p(mode),
193 |                            pool_grad.handle)
194 |
195 |
196 | def cudnn_conv2d_backward_filter(input,
197 |                                  output_grads,
198 |                                  stride_height,
199 |                                  stride_width,
200 |                                  padding_height,
201 |                                  padding_width,
202 |                                  filter_grad):
203 |     assert isinstance(input, _nd.NDArray)
204 |     assert isinstance(output_grads, _nd.NDArray)
205 |     assert isinstance(stride_height, int)
206 |     assert isinstance(stride_width, int)
207 |     assert isinstance(padding_height, int)
208 |     assert isinstance(padding_width, int)
209 |     assert isinstance(filter_grad, _nd.NDArray)
210 |     _LIB.cudnnConv2DBackwardFilter(input.handle,
211 |                                    output_grads.handle,
212 |                                    stride_height, stride_width,
213 |                                    padding_height, padding_width,
214 |                                    filter_grad.handle)
215 |
216 |
217 | def cudnn_conv2d_backward_data(filter,
218 |                                output_grad,
219 |                                stride_height,
220 |                                stride_width,
221 |                                padding_height,
222 |                                padding_width,
223 |                                data_grad):
224 |     assert isinstance(filter, _nd.NDArray)
225 |     assert isinstance(output_grad, _nd.NDArray)
226 |     assert isinstance(stride_height, int)
227 |     assert isinstance(stride_width, int)
228 |     assert isinstance(padding_height, int)
229 |     assert isinstance(padding_width, int)
230 |     assert isinstance(data_grad, _nd.NDArray)
231 |     _LIB.cudnnConv2DBackwardData(filter.handle,
232 |                                  output_grad.handle,
233 |                                  stride_height,
234 |                                  stride_width,
235 |                                  padding_height,
236 |                                  padding_width,
237 |                                  data_grad.handle)
238 |
239 |
240 | def cudnn_conv2d_backward_bias(output_grads, bias_grads):
241 |     assert isinstance(output_grads, _nd.NDArray)
242 |     assert isinstance(bias_grads, _nd.NDArray)
243 |     _LIB.cudnnConv2DBackwardBias(output_grads.handle, bias_grads.handle)
244 |
--------------------------------------------------------------------------------
/aurora/ndarray/ndarray.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 |
3 | from ._base import _LIB, check_call, c_array
4 | from . import ndarray as _nd
5 | import ctypes
6 | import numpy as np
7 |
8 |
9 | class DLContext(ctypes.Structure):
10 |     """DL context structure."""
11 |     _fields_ = [("device_id", ctypes.c_int),
12 |                 ("device_type", ctypes.c_int)]
13 |
14 |     MASK2STR = {
15 |         1: 'cpu',
16 |         2: 'gpu',
17 |     }
18 |
19 |     def __init__(self, device_id, device_type):
20 |         super(DLContext, self).__init__()
21 |         self.device_id = device_id
22 |         self.device_type = device_type
23 |
24 |     def __repr__(self):
25 |         return "%s(%d)" % (
26 |             DLContext.MASK2STR[self.device_type], self.device_id)
27 |
28 |
29 | class DLArray(ctypes.Structure):
30 |     """DLArray in C API"""
31 |     _fields_ = [("data", ctypes.c_void_p),
32 |                 ("ctx", DLContext),
33 |                 ("ndim", ctypes.c_int),
34 |                 ("shape", ctypes.POINTER(ctypes.c_int64))]
35 |
36 |
37 | DLArrayHandle = ctypes.POINTER(DLArray)
38 |
39 |
40 | def cpu(dev_id=0):
41 |     """Construct a CPU device
42 |     Parameters
43 |     ----------
44 |     dev_id : int, optional
45 |         The integer device id
46 |     """
47 |     return DLContext(dev_id, 1)
48 |
49 |
50 | def gpu(dev_id=0):
51 |     """Construct a GPU device
52 |     Parameters
53 |     ----------
54 |     dev_id : int, optional
55 |         The integer device id
56 |     """
57 |     return DLContext(dev_id, 2)
58 |
59 |
60 | def is_gpu_ctx(ctx):
61 |     """Return whether the context is a GPU context.
62 |     Parameters
63 |     ----------
64 |     ctx : DLContext
65 |         The query context
66 |     """
67 |     return ctx and ctx.device_type == 2
68 |
69 |
70 | class NDArray(object):
71 |     """Lightweight NDArray class of DL runtime.
72 |     Strictly speaking, this is only an array container (a buffer object);
73 |     no arithmetic operations are defined.
74 |     """
75 |     __slots__ = ["handle"]
76 |
77 |     # pylint: disable=no-member
78 |     def __init__(self, handle):
79 |         """Initialize the function with handle
80 |         Parameters
81 |         ----------
82 |         handle : DLArrayHandle
83 |             the handle to the underlying C++ DLArray
84 |         """
85 |         self.handle = handle
86 |
87 |     def __del__(self):
88 |         check_call(_LIB.DLArrayFree(self.handle))
89 |
90 |     @property
91 |     def shape(self):
92 |         """Shape of this array"""
93 |         return tuple(self.handle.contents.shape[i]
94 |                      for i in range(self.handle.contents.ndim))
95 |
96 |     @property
97 |     def ctx(self):
98 |         """context of this array"""
99 |         return self.handle.contents.ctx
100 |
101 |     def __setitem__(self, in_slice, value):
102 |         """Set ndarray value"""
103 |         if (not isinstance(in_slice, slice) or
104 |                 in_slice.start is not None
105 |                 or in_slice.stop is not None):
106 |             raise ValueError('Array only supports whole-array assignment (arr[:] = value)')
107 |         if isinstance(value, NDArray):
108 |             if value.handle is not self.handle:
109 |                 value.copyto(self)
110 |         elif isinstance(value, (np.ndarray, np.generic)):
111 |             self._sync_copyfrom(value)
112 |         else:
113 |             raise TypeError('type %s not supported' % str(type(value)))
114 |
115 |     def _sync_copyfrom(self, source_array):
116 |         """Perform a synchronous copy from the array.
117 |         Parameters
118 |         ----------
119 |         source_array : array_like
120 |             The data source we would like to copy from.
121 |         """
122 |         if not isinstance(source_array, np.ndarray):
123 |             try:
124 |                 source_array = np.array(source_array, dtype=np.float32)
125 |             except Exception:
126 |                 raise TypeError('array must be an array_like data, '
127 |                                 'type %s is not supported'
128 |                                 % str(type(source_array)))
129 |         source_array = np.ascontiguousarray(source_array, dtype=np.float32)
130 |         if source_array.shape != self.shape:
131 |             raise ValueError('array shape does not match the shape of the NDArray')
132 |         source_arr, shape = NDArray._numpyasarray(source_array)
133 |         check_call(_LIB.DLArrayCopyFromTo(
134 |             ctypes.byref(source_arr), self.handle, None))
135 |         # keep `shape` alive so it is not garbage-collected before the copy completes
136 |         _ = shape
137 |
138 |     @staticmethod
139 |     def _numpyasarray(np_data):
140 |         """Return a DLArray representation of a numpy array."""
141 |         data = np_data
142 |         assert data.flags['C_CONTIGUOUS']
143 |         arr = DLArray()
144 |         shape = c_array(ctypes.c_int64, data.shape)
145 |         arr.data = data.ctypes.data_as(ctypes.c_void_p)
146 |         arr.shape = shape
147 |         arr.ndim = data.ndim
148 |         # CPU device
149 |         arr.ctx = cpu(0)
150 |         return arr, shape
151 |
152 |     def asnumpy(self):
153 |         """Convert this array to numpy array
154 |         Returns
155 |         -------
156 |         np_arr : numpy.ndarray
157 |             The corresponding numpy array.
158 |         """
159 |         np_arr = np.empty(self.shape, dtype=np.float32)
160 |         arr, shape = NDArray._numpyasarray(np_arr)
161 |         check_call(_LIB.DLArrayCopyFromTo(
162 |             self.handle, ctypes.byref(arr), None))
163 |         _ = shape
164 |         return np_arr
165 |
166 |     def copyto(self, target):
167 |         """Copy this array to the target
168 |         Parameters
169 |         ----------
170 |         target : NDArray
171 |             The target array to be copied to; must have the same shape as this array.
172 |         """
173 |         if isinstance(target, DLContext):
174 |             target = empty(self.shape, target)
175 |         if isinstance(target, NDArray):
176 |             check_call(_LIB.DLArrayCopyFromTo(
177 |                 self.handle, target.handle, None))
178 |         else:
179 |             raise ValueError("Unsupported target type %s" % str(type(target)))
180 |         return target
181 |
182 |
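# A minimal usage sketch (illustrative only; `_ndarray_usage_example` is not
# part of Aurora's API). It round-trips a numpy array through NDArray via the
# module-level helpers defined below; the gpu(0) branch assumes the CUDA
# runtime under cuda/ has been built.
def _ndarray_usage_example(check_gpu=False):
    a = np.random.randn(2, 3).astype(np.float32)
    arr = array(a, ctx=cpu(0))               # host numpy -> NDArray copy
    assert arr.shape == (2, 3)
    assert np.allclose(arr.asnumpy(), a)     # NDArray -> host copy
    if check_gpu:
        gpu_arr = arr.copyto(gpu(0))         # host -> device copy
        assert np.allclose(gpu_arr.asnumpy(), a)
    return arr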
183 | def array(arr, ctx=cpu(0)):
184 |     """Create an array from source arr.
185 |     Parameters
186 |     ----------
187 |     arr : numpy.ndarray
188 |         The array to be copied from
189 |     ctx : DLContext, optional
190 |         The device context to create the array
191 |     Returns
192 |     -------
193 |     ret : NDArray
194 |         The created array
195 |     """
196 |     if not isinstance(arr, np.ndarray):
197 |         arr = np.array(arr)
198 |     ret = empty(arr.shape, ctx)
199 |     ret._sync_copyfrom(arr)
200 |     return ret
201 |
202 |
203 | def empty(shape, ctx=cpu(0)):
204 |     """Create an empty array given shape and device
205 |     Parameters
206 |     ----------
207 |     shape : tuple of int
208 |         The shape of the array
209 |     ctx : DLContext
210 |         The context of the array
211 |     Returns
212 |     -------
213 |     arr : NDArray
214 |         The allocated, uninitialized array.
215 |     """
216 |     shape = c_array(ctypes.c_int64, shape)
217 |     ndim = ctypes.c_int(len(shape))
218 |     handle = DLArrayHandle()
219 |     check_call(_LIB.DLArrayAlloc(
220 |         shape, ndim, ctx, ctypes.byref(handle)))
221 |     return NDArray(handle)
222 |
223 |
224 | def reshape(arr, new_shape):
225 |     assert isinstance(arr, _nd.NDArray)
226 |     # TODO (upul): check total number of elements match ...
227 |     shape = c_array(ctypes.c_int64, new_shape)
228 |     new_dim = len(new_shape)
229 |     handle = arr.handle
230 |     check_call(_LIB.DLArrayReshape(handle, shape, new_dim))
--------------------------------------------------------------------------------
/aurora/nn/__init__.py:
--------------------------------------------------------------------------------
1 | from .activations import relu
2 | from .activations import sigmoid
3 | from .activations import softmax
4 | from .loss_functions import softmax_cross_entropy_with_logits
5 | from .utils import softmax_func
6 | from .conv import conv2d
7 | from .pooling import maxPool
8 |
9 | __all__ = ['relu', 'sigmoid', 'softmax', 'softmax_cross_entropy_with_logits',
10 |            'softmax_func', 'conv2d', 'maxPool']
--------------------------------------------------------------------------------
/aurora/nn/activations.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from aurora.autodiff.autodiff import Op
3 | from aurora.nn.utils import softmax_func
4 | try:
5 |     from aurora.ndarray import gpu_op, ndarray
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class ReluOp(Op):
11 |     def __call__(self, node_A):
12 |         new_node = Op.__call__(self)
13 |         new_node.inputs = [node_A]
14 |         new_node.name = "Relu(%s)" % (node_A.name)
15 |         return new_node
16 |
17 |     def compute(self, node, input_vals, output_val, use_numpy=True):
18 |         assert len(input_vals) == 1
19 |         if use_numpy:
20 |             output_val[:] = np.maximum(input_vals[0], 0)
21 |         else:
22 |             gpu_op.relu(input_vals[0], output_val)
23 |
24 |     def gradient(self, node, output_grad):
25 |         return [relu_grad(node.inputs[0], output_grad)]
26 |
27 |     def infer_shape(self, node, input_shapes):
28 |         assert len(input_shapes) == 1
29 |         return input_shapes[0]
30 |
31 |
32 | class ReluGradientOp(Op):
33 |     def __call__(self, node_A, node_B):
34 |         """node_B is output_grad"""
35 |         new_node = Op.__call__(self)
36 |         new_node.inputs = [node_A, node_B]
37 |         new_node.name = "ReluGradient(%s)" % (node_A.name)
38 |         return new_node
39 |
40 |     def compute(self, node, input_vals, output_val, use_numpy=True):
41 |         assert len(input_vals) == 2
42 |         if use_numpy:
43 |             output_val[:] = np.sign(np.maximum(input_vals[0], 0)) * input_vals[1]
44 |         else:
45 |             gpu_op.relu_gradient(input_vals[0], input_vals[1], output_val)
46 |
47 |     def gradient(self, node, output_grad):
48 |         raise NotImplementedError('Gradient of ReluGradientOp not implemented')
49 |
50 |     def infer_shape(self, node, input_shapes):
51 |         assert len(input_shapes) == 2
52 |         assert input_shapes[0] == input_shapes[1]
53 |         return input_shapes[0]
54 |
55 |
56 | class SigmoidOp(Op):
57 |     def __call__(self, node_A):
58 |         new_node = Op.__call__(self)
59 |         new_node.inputs = [node_A]
60 |         new_node.name = 'Sigmoid({0:s})'.format(node_A.name)
61 |         return new_node
62 |
63 |     def compute(self, node, input_vals, output_val, use_numpy=True):
64 |         """
65 |         This function calculates the sigmoid of input_vals[0].
66 |         The naive implementation (1/(1 + exp(-x))) is not numerically stable.
67 |         Hence we use the identity:
68 |             tanh(x) = (exp(x) - exp(-x))/(exp(x) + exp(-x))
69 |                     = 2*sigmoid(2*x) - 1
70 |         and therefore:
71 |             sigmoid(x) = 0.5 + 0.5*tanh(0.5*x)
72 |         :param node:
73 |         :param input_vals:
74 |         :param output_val:
75 |         :param use_numpy:
76 |         :return:
77 |         """
78 |         assert len(input_vals) == 1
79 |         if use_numpy:
80 |             output_val[:] = 0.5 + 0.5*np.tanh(0.5*input_vals[0])
81 |         else:
82 |             raise NotImplementedError('GPU version not yet implemented')
83 |
84 |     def gradient(self, node, output_grads):
85 |         x = node.inputs[0]
86 |         # g = sigmoid(x) * (1 - sigmoid(x))
87 |         # TODO: (upul) the above g failed in unit testing; need to check it.
88 |         g = sigmoid(x) - sigmoid(x) * sigmoid(x)
89 |         return [g * output_grads]
90 |
91 |     def infer_shape(self, node, input_shapes):
92 |         assert len(input_shapes) == 1
93 |         return input_shapes[0]
94 |
95 |
96 | class SoftmaxOp(Op):
97 |     def __call__(self, node_A):
98 |         new_node = Op.__call__(self)
99 |         new_node.inputs = [node_A]
100 |         new_node.name = 'SoftmaxOp({0:s})'.format(node_A.name)
101 |         return new_node
102 |
103 |     def compute(self, node, input_vals, output_val, use_numpy=True):
104 |         assert len(input_vals) == 1
105 |         if use_numpy:
106 |             output_val[:] = softmax_func(input_vals[0])
107 |         else:
108 |             gpu_op.softmax(input_vals[0], output_val)
109 |
110 |     def gradient(self, node, output_grads):
111 |         raise NotImplementedError('Not yet implemented; please use the CrossEntropy operator')
112 |
113 |     def infer_shape(self, node, input_shapes):
114 |         assert len(input_shapes) == 1
115 |         return input_shapes[0]
116 |
117 |
118 | # TODO (upul): Other commonly used activation functions
119 |
120 | # Global singleton operators
121 | relu = ReluOp()
122 | relu_grad = ReluGradientOp()
123 | sigmoid = SigmoidOp()
124 | softmax = SoftmaxOp()
125 |
--------------------------------------------------------------------------------
/aurora/nn/conv.py:
--------------------------------------------------------------------------------
1 | from aurora.autodiff.autodiff import Op
2 | from aurora.nn.pyx.im2col import im2col, col2im
3 | try:
4 |     from aurora.ndarray import gpu_op, ndarray
5 | except ImportError:
6 |     pass
7 |
8 |
9 | # TODO: (upul) In the numpy version of Conv2dOp, X_col is calculated twice:
10 | # once in compute() of Conv2dOp and a second time inside compute() of the
11 | # Conv2dBackwardFilter node. Check the feasibility of caching; see the sketch below.
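# One possible caching scheme (an untested sketch, not part of Aurora):
# memoise X_col on the forward node, keyed by the data buffer identity and
# the convolution geometry, so Conv2dGradientFilter could reuse it instead
# of calling im2col() a second time. `_cached_im2col` and `_im2col_cache`
# are names invented for this sketch only.
def _cached_im2col(node, X, filter_height, filter_width,
                   padding_height, padding_width,
                   stride_height, stride_width):
    key = (id(X), filter_height, filter_width,
           padding_height, padding_width, stride_height, stride_width)
    cached = getattr(node, '_im2col_cache', None)
    if cached is None or cached[0] != key:
        node._im2col_cache = (key, im2col(X, filter_height, filter_width,
                                          padding_height, padding_width,
                                          stride_height, stride_width))
    return node._im2col_cache[1]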
12 |
13 | class Conv2dOp(Op):
14 |     def __call__(self, input, filter, bias, strides=(1, 1), padding=(0, 0)):
15 |         new_node = Op.__call__(self)
16 |         # input: 4-D data, (batch_size, depth, height, width)
17 |         # filter: 4-D kernel (num_filters, depth, kernel_height, kernel_width)
18 |         new_node.inputs = [input, filter, bias]
19 |         new_node.strides = strides
20 |         new_node.padding = padding
21 |         new_node.name = 'Conv2d({0:s}, {1:s})'.format(input.name, filter.name)
22 |         return new_node
23 |
24 |     def compute(self, node, input_vals, output_val, use_numpy=True):
25 |         assert len(input_vals) == 3
26 |
27 |         X = input_vals[0]
28 |         h = X.shape[2]
29 |         w = X.shape[3]
30 |         batch_size = X.shape[0]
31 |
32 |         W = input_vals[1]
33 |         filter_height = W.shape[2]
34 |         filter_width = W.shape[3]
35 |         n_filters = W.shape[0]
36 |
37 |         b = input_vals[2]
38 |
39 |         padding_height = node.padding[0]
40 |         padding_width = node.padding[1]
41 |         stride_height = node.strides[0]
42 |         stride_width = node.strides[1]
43 |
44 |         if use_numpy:
45 |             b = b.reshape(n_filters, -1)
46 |             h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
47 |             w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
48 |             X_col = im2col(X, filter_height, filter_width, padding_height, padding_width,
49 |                            stride_height, stride_width)
50 |             W_col = W.reshape(n_filters, -1)
51 |             out = W_col @ X_col + b
52 |             out = out.reshape(n_filters, h_new, w_new, batch_size)
53 |             output_val[:] = out.transpose(3, 0, 1, 2)
54 |         else:
55 |             gpu_op.cudnn_conv2d_forward(X, W, b, stride_height, stride_width,
56 |                                         padding_height, padding_width, output_val)
57 |
58 |     def gradient(self, node, output_grads):
59 |         # gradients w.r.t. data, filter and bias, in the order of node.inputs
60 |         filter_node = node.inputs[1]
61 |         data_node = node.inputs[0]
62 |         return [conv2dBackData(data_node, filter_node, output_grads, node.strides, node.padding),
63 |                 conv2dBackFilter(data_node, filter_node, output_grads, node.strides, node.padding),
64 |                 conv2dBackBias(output_grads)]
65 |
66 |     def infer_shape(self, node, input_shapes):
67 |         assert len(input_shapes) == 3
68 |
69 |         X_shape = input_shapes[0]
70 |         h = X_shape[2]
71 |         w = X_shape[3]
72 |
73 |         W_shape = input_shapes[1]
74 |         filter_height = W_shape[2]
75 |         filter_width = W_shape[3]
76 |
77 |         padding_height = node.padding[0]
78 |         padding_width = node.padding[1]
79 |         stride_height = node.strides[0]
80 |         stride_width = node.strides[1]
81 |
82 |         h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
83 |         w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
84 |         d_new = W_shape[0]
85 |         batch_size = X_shape[0]
86 |         return batch_size, d_new, h_new, w_new
87 |
88 |
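# Worked example of the shape rule in infer_shape above (illustrative only;
# `_conv2d_output_shape` is a helper invented for this sketch): a
# (32, 3, 28, 28) batch convolved with 8 filters of size 3x3, stride 1 and
# padding 1 keeps the spatial size, since (28 - 3 + 2*1)/1 + 1 = 28, giving
# an output of shape (32, 8, 28, 28).
def _conv2d_output_shape(batch_size, n_filters, h, w, filter_height, filter_width,
                         strides=(1, 1), padding=(0, 0)):
    h_new = (h - filter_height + 2 * padding[0]) // strides[0] + 1
    w_new = (w - filter_width + 2 * padding[1]) // strides[1] + 1
    return batch_size, n_filters, h_new, w_new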
89 | class Conv2dGradientFilter(Op):
90 |     def __call__(self, node_A, node_B, output_grad, strides=(1, 1), padding=(0, 0)):
91 |         new_node = Op.__call__(self)
92 |         new_node.inputs = [node_A, node_B, output_grad]
93 |         new_node.strides = strides
94 |         new_node.padding = padding
95 |         new_node.name = "Conv2dBackwardFilter(%s, %s)" % (node_A.name, node_B.name)
96 |         return new_node
97 |
98 |     def compute(self, node, input_vals, output_val, use_numpy=True):
99 |         assert len(input_vals) == 3
100 |
101 |         X = input_vals[0]  # data
102 |         W = input_vals[1]  # filter
103 |
104 |         assert len(X.shape) == 4
105 |         assert len(W.shape) == 4
106 |
107 |         filter_height = W.shape[2]
108 |         filter_width = W.shape[3]
109 |         n_filters = W.shape[0]
110 |         out_grad = input_vals[2]
111 |
112 |         padding_height = node.padding[0]
113 |         padding_width = node.padding[1]
114 |         stride_height = node.strides[0]
115 |         stride_width = node.strides[1]
116 |
117 |         if use_numpy:
118 |             X_col = im2col(X, filter_height, filter_width, padding_height, padding_width,
119 |                            stride_height, stride_width)
120 |             dout_reshaped = out_grad.transpose(1, 2, 3, 0).reshape(n_filters, -1)
121 |             dW = dout_reshaped @ X_col.T
122 |             output_val[:] = dW.reshape(W.shape)
123 |
124 |         else:
125 |             gpu_op.cudnn_conv2d_backward_filter(X, out_grad, stride_height, stride_width,
126 |                                                 padding_height, padding_width, output_val)
127 |
128 |     def gradient(self, node, output_grads):
129 |         raise NotImplementedError('Gradient of Conv2dGradientFilter not implemented')
130 |
131 |     def infer_shape(self, node, input_shapes):
132 |         assert len(input_shapes) == 3
133 |         W_size = input_shapes[1]
134 |         return W_size
135 |
136 |
137 | class Conv2dGradientData(Op):
138 |     def __call__(self, node_A, node_B, output_grad, strides=(1, 1), padding=(0, 0)):
139 |         new_node = Op.__call__(self)
140 |         new_node.inputs = [node_A, node_B, output_grad]
141 |         new_node.strides = strides
142 |         new_node.padding = padding
143 |         new_node.name = "Conv2dBackwardData(%s, %s)" % (node_A.name, node_B.name)
144 |         return new_node
145 |
146 |     def compute(self, node, input_vals, output_val, use_numpy=True):
147 |         assert len(input_vals) == 3
148 |         X = input_vals[0]  # data
149 |         W = input_vals[1]  # filter
150 |         output_grads = input_vals[2]
151 |
152 |         assert len(X.shape) == 4
153 |         assert len(W.shape) == 4
154 |
155 |         filter_height = W.shape[2]
156 |         filter_width = W.shape[3]
157 |         n_filters = W.shape[0]
158 |
159 |         padding_height, padding_width = node.padding
160 |         stride_height, stride_width = node.strides
161 |
162 |         if use_numpy:
163 |             W_reshape = W.reshape(n_filters, -1)
164 |             dout_reshaped = input_vals[2].transpose(1, 2, 3, 0).reshape(n_filters, -1)
165 |
166 |             dX_col = W_reshape.T @ dout_reshaped
167 |             batch_size, n_channels, img_height, img_width = X.shape
168 |             output_val[:] = col2im(dX_col, batch_size, n_channels,
169 |                                    img_height, img_width, filter_height, filter_width,
170 |                                    padding_height, padding_width,
171 |                                    stride_height, stride_width)
172 |         else:
173 |             gpu_op.cudnn_conv2d_backward_data(W, output_grads, stride_height, stride_width,
174 |                                               padding_height, padding_width, output_val)
175 |
176 |     def gradient(self, node, output_grads):
177 |         raise NotImplementedError('Gradient of Conv2dGradientData not implemented')
178 |
179 |     def infer_shape(self, node, input_shapes):
180 |         assert len(input_shapes) == 3
181 |         X_size = input_shapes[0]
182 |         return X_size
183 |
184 |
185 | class Conv2dGradientBias(Op):
186 |     def __call__(self, node_A):
187 |         new_node = Op.__call__(self)
188 |         new_node.inputs = [node_A]
189 |         new_node.name = "Conv2dBackwardBias(%s)" % (node_A.name)
190 |         return new_node
191 |
192 |     def compute(self, node, input_vals, output_val, use_numpy=True):
193 |         assert len(input_vals) == 1
194 |
195 |         if use_numpy:
196 |             output_val[:] = input_vals[0].sum(axis=(0, 2, 3))
197 |         else:
198 |             gpu_op.cudnn_conv2d_backward_bias(input_vals[0], output_val)
199 |
200 |     def gradient(self, node, output_grads):
201 |         raise NotImplementedError('Gradient of Conv2dGradientBias not implemented')
202 |
203 |     def infer_shape(self, node, input_shapes):
204 |         assert len(input_shapes) == 1
205 |         # input_shapes[0] = (batch_size, num_filters, out_height, out_width)
206 |         return (input_shapes[0][1],)
207 |
208 |
209 | # Global singleton operators
210 | conv2d = Conv2dOp()
211 | conv2dBackFilter = Conv2dGradientFilter()
212 | conv2dBackData = Conv2dGradientData()
213 | conv2dBackBias = Conv2dGradientBias()
214 |
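# Self-contained numpy reference (a testing sketch, not used by the ops
# above): a direct four-loop convolution. The im2col-based forward pass in
# Conv2dOp should agree with this on small inputs, which makes for an easy
# unit test. `_naive_conv2d_reference` is a name invented for this sketch.
def _naive_conv2d_reference(X, W, b, strides=(1, 1), padding=(0, 0)):
    import numpy as np
    batch_size, _, h, w = X.shape
    n_filters, _, fh, fw = W.shape
    sh, sw = strides
    ph, pw = padding
    Xp = np.pad(X, ((0, 0), (0, 0), (ph, ph), (pw, pw)), mode='constant')
    h_new = (h - fh + 2 * ph) // sh + 1
    w_new = (w - fw + 2 * pw) // sw + 1
    out = np.zeros((batch_size, n_filters, h_new, w_new))
    for n in range(batch_size):
        for f in range(n_filters):
            for i in range(h_new):
                for j in range(w_new):
                    patch = Xp[n, :, i * sh:i * sh + fh, j * sw:j * sw + fw]
                    out[n, f, i, j] = np.sum(patch * W[f]) + b[f]
    return out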
--------------------------------------------------------------------------------
/aurora/nn/loss_functions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from aurora.autodiff.autodiff import Op, zeros_like
3 |
4 | from .activations import softmax
5 | from .utils import log_sum_exp
6 |
7 | try:
8 |     from aurora.ndarray import gpu_op, ndarray
9 | except ImportError:
10 |     pass
11 |
12 |
13 | class CrossEntropyOp(Op):
14 |     def __call__(self, node_A, node_B):
15 |         new_node = Op.__call__(self)
16 |         new_node.inputs = [node_A, node_B]
17 |         new_node.name = 'CrossEntropy({0:s}, {1:s})'.format(node_A.name, node_B.name)
18 |         return new_node
19 |
20 |     def compute(self, node, input_vals, output_val, use_numpy=True):
21 |         assert len(input_vals) == 2
22 |         if use_numpy:
23 |             logits = input_vals[0]
24 |             actual = input_vals[1]
25 |             safe_log_softmax = logits - log_sum_exp(logits)
26 |             output_val[:] = np.mean(-np.sum(actual * safe_log_softmax, axis=1), keepdims=True)
27 |         else:
28 |             gpu_op.softmax_cross_entropy(input_vals[0], input_vals[1], output_val)
29 |
30 |     def gradient(self, node, output_grads):
31 |         grad_A = (softmax(node.inputs[0]) + -1 * node.inputs[1]) * output_grads
32 |         grad_B = zeros_like(node.inputs[1])
33 |         return [grad_A, grad_B]
34 |
35 |     def infer_shape(self, node, input_shapes):
36 |         assert len(input_shapes) == 2
37 |         return (1,)
38 |
39 |
40 | # TODO (upul) MSE
41 | # TODO (upul) RMSE
42 | # TODO (upul) sigmoid_cross_entropy_with_logits
43 |
44 | # Global singleton operations
45 | softmax_cross_entropy_with_logits = CrossEntropyOp()
46 |
--------------------------------------------------------------------------------
/aurora/nn/pooling.py:
--------------------------------------------------------------------------------
1 | from aurora.autodiff.autodiff import Op
2 | from aurora.nn.pyx.fast_pooling import max_pool_forward, max_pool_backward
3 |
4 | try:
5 |     from aurora.ndarray import gpu_op
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class MaxPoolOp(Op):
11 |     def __call__(self, input, filter=(2, 2), strides=(2, 2)):
12 |         new_node = Op.__call__(self)
13 |         new_node.inputs = [input]
14 |         new_node.filter = filter
15 |         new_node.strides = strides
16 |         new_node.cache = {}
17 |         new_node.name = 'MaxPoolOp({})'.format(input.name)
18 |         return new_node
19 |
20 |     def compute(self, node, input_vals, output_val, use_numpy=True):
21 |         assert len(input_vals) == 1
22 |
23 |         filter_height = node.filter[0]
24 |         filter_width = node.filter[1]
25 |         stride_height = node.strides[0]
26 |         stride_width = node.strides[1]
27 |
28 |         if use_numpy:
29 |             output_val[:] = max_pool_forward(input_vals[0],
30 |                                              filter_height=filter_height,
31 |                                              filter_width=filter_width,
32 |                                              stride_height=stride_height,
33 |                                              stride_width=stride_width)
34 |         else:
35 |             gpu_op.cudnn_pool_forward(input_vals[0],
36 |                                       filter_height, filter_width,
37 |                                       stride_height, stride_width,
38 |                                       'max',
39 |                                       output_val)
40 |         node.cache['forward'] = output_val
41 |
42 |     def gradient(self, node, output_grads):
43 |         return [maxPoolBack(node.inputs[0], output_grads, filter=node.filter, strides=node.strides, cache=node.cache)]
44 |
45 |     def infer_shape(self, node, input_shapes):
46 |         assert len(input_shapes) == 1
47 |
48 |         filter_height = node.filter[0]
49 |         filter_width = node.filter[1]
50 |         stride_height = node.strides[0]
51 |         stride_width = node.strides[1]
52 |
53 |         input_batch_size = input_shapes[0][0]
54 |         input_n_channels = input_shapes[0][1]
55 |         input_height = input_shapes[0][2]
56 |         input_width = input_shapes[0][3]
57 |
58 |         new_height = int((input_height - filter_height) / stride_height) + 1
59 |         new_width = int((input_width - filter_width) / stride_width) + 1
60 |         return input_batch_size, input_n_channels, new_height, new_width
61 |
62 |
63 | class MaxPoolGradientOp(Op):
64 |     def __call__(self, node_A, node_B, filter=(2, 2), strides=(2, 2), cache=None):
65 |         new_node = Op.__call__(self)
66 |         # node_B is the output_grad
67 |         new_node.inputs = [node_A, node_B]
68 |         new_node.filter = filter
69 |         new_node.strides = strides
70 |         new_node.cache = cache
71 |         new_node.name = 'MaxPoolGradientOp(%s)' % (node_A.name)
72 |         return new_node
73 |
74 |     def compute(self, node, input_vals, output_val, use_numpy=True):
75 |         assert len(input_vals) == 2
76 |
77 |         filter_height = node.filter[0]
78 |         filter_width = node.filter[1]
79 |         stride_height = node.strides[0]
80 |         stride_width = node.strides[1]
81 |
82 |         data = input_vals[0]
83 |         output_grad = input_vals[1]
84 |         if use_numpy:
85 |             output_val[:] = max_pool_backward(output_grad,
86 |                                               data,
87 |                                               filter_height=filter_height,
88 |                                               filter_width=filter_width,
89 |                                               stride_height=stride_height,
90 |                                               stride_width=stride_width
91 |                                               )
92 |         else:
93 |             gpu_op.cudnn_pool_backward(data, output_grad, node.cache['forward'],
94 |                                        filter_height, filter_width,
95 |                                        stride_height, stride_width,
96 |                                        'max',
97 |                                        output_val)
98 |
99 |     def gradient(self, node, output_grads):
100 |         raise NotImplementedError('Gradient of MaxPoolGradientOp is not implemented')
101 |
102 |     def infer_shape(self, node, input_shapes):
103 |         assert len(input_shapes) == 2
104 |         return input_shapes[0]
105 |
106 |
107 | # Global singleton operators
108 | maxPool = MaxPoolOp()
109 | maxPoolBack = MaxPoolGradientOp()
110 |
--------------------------------------------------------------------------------
/aurora/nn/pyx/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/aurora/nn/pyx/__init__.py
--------------------------------------------------------------------------------
/aurora/nn/pyx/fast_pooling.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | import numpy as np
3 | cimport numpy as np
4 |
5 | # TODO: (Upul) We need a better way to represent a big negative number
6 | cdef double BIG_NEGATIVE = -1.0e15
7 |
8 | @cython.boundscheck(False)
9 | @cython.wraparound(False)
10 | def max_pool_forward(np.float64_t[:, :, :, :] data,
11 |                      int filter_height, int filter_width,
12 |                      int stride_height, int stride_width):
13 |     """
14 |     Forward pass of max pooling.
15 |     :param data: 4-D input, (batch_size, channels, height, width)
16 |     :param filter_height:
17 |     :param filter_width:
18 |     :param stride_height:
19 |     :param stride_width:
20 |     :return: the pooled 4-D output
21 |     """
22 |
23 |     cdef int batch_size = data.shape[0]
24 |     cdef int input_channels = data.shape[1]
25 |     cdef int height = data.shape[2]
26 |     cdef int width = data.shape[3]
27 |
28 |     # Define the dimensions of the output
29 |     cdef int n_H = int(1 + (height - filter_height) / stride_height)
30 |     cdef int n_W = int(1 + (width - filter_width) / stride_width)
31 |     cdef int n_C = input_channels
32 |
33 |     # Initialize output matrix
34 |     cdef np.float64_t[:, :, :, :] output = np.zeros((batch_size, n_C, n_H, n_W))
35 |
36 |     cdef int i, c, h, w, vert_start, vert_end, horiz_start, horiz_end, ii, jj
37 |     cdef double max_in_grid
38 |
39 |     for i in range(batch_size):       # loop over the training examples
40 |         for c in range(n_C):          # loop over the channels of the output volume
41 |             for h in range(n_H):      # loop on the vertical axis of the output volume
42 |                 for w in range(n_W):  # loop on the horizontal axis of the output volume
43 |                     # Find the corners of the current "slice"
44 |                     vert_start = h*stride_height
45 |                     vert_end = h*stride_height + filter_height
46 |                     horiz_start = w*stride_width
47 |                     horiz_end = w*stride_width + filter_width
48 |                     # finding the max value within the given grid
49 |                     max_in_grid = BIG_NEGATIVE
50 |                     for ii in range(vert_start, vert_end):
51 |                         for jj in range(horiz_start, horiz_end):
52 |                             if data[i, c, ii, jj] > max_in_grid:
53 |                                 max_in_grid = data[i, c, ii, jj]
54 |                     output[i, c, h, w] = max_in_grid
55 |     return output
56 |
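# Quick sanity check of the forward pass (illustrative; assumes the extension
# has been compiled, e.g. via `python setup.py build_ext`):
#
#     >>> import numpy as np
#     >>> from aurora.nn.pyx.fast_pooling import max_pool_forward
#     >>> x = np.arange(16, dtype=np.float64).reshape(1, 1, 4, 4)
#     >>> np.asarray(max_pool_forward(x, 2, 2, 2, 2))
#     array([[[[ 5.,  7.],
#              [13., 15.]]]])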
57 | @cython.boundscheck(False)
58 | @cython.wraparound(False)
59 | def max_pool_backward(np.float64_t[:, :, :, :] output_grad,
60 |                       np.float64_t[:, :, :, :] input_data,
61 |                       int filter_height=2, int filter_width=2,
62 |                       int stride_height=2, int stride_width=2):
63 |     """
64 |     Backward pass of max pooling.
65 |     :param output_grad: gradient w.r.t. the pooled output
66 |     :param input_data: the input of the forward pass
67 |     :param filter_height:
68 |     :param filter_width:
69 |     :param stride_height:
70 |     :param stride_width:
71 |     :return: gradient w.r.t. input_data
72 |     """
73 |     batch_size = output_grad.shape[0]
74 |     channels = output_grad.shape[1]
75 |     height = output_grad.shape[2]
76 |     width = output_grad.shape[3]
77 |
78 |     return _max_pool_backward_inner(output_grad, input_data,
79 |                                     batch_size, channels, height,
80 |                                     width, filter_height,
81 |                                     filter_width, stride_height,
82 |                                     stride_width)
83 |
84 | @cython.boundscheck(False)
85 | @cython.wraparound(False)
86 | cdef _max_pool_backward_inner(np.float64_t[:, :, :, :] output_grad,
87 |                               np.float64_t[:, :, :, :] input_data,
88 |                               int batch_size,
89 |                               int channels,
90 |                               int height, int width,
91 |                               int filter_height, int filter_width,
92 |                               int stride_height, int stride_width):
93 |     """
94 |     Inner loop of the max-pooling backward pass.
95 |     :param output_grad:
96 |     :param input_data:
97 |     :param batch_size:
98 |     :param channels:
99 |     :param height:
100 |     :param width:
101 |     :param filter_height:
102 |     :param filter_width:
103 |     :param stride_height:
104 |     :param stride_width:
105 |     :return: gradient w.r.t. input_data
106 |     """
107 |
108 |     grad_input = np.zeros_like(input_data)
109 |
110 |     cdef np.float64_t[:, :, :] cct_example
111 |     cdef int i, h, w, c, vert_start, vert_end, horiz_start, horiz_end, slice_height, slice_width, max_i, max_j
112 |     cdef double max_value, cct_value
113 |
114 |     # loop over the training examples
115 |     for i in range(batch_size):
116 |
117 |         # pick the current training example
118 |         cct_example = input_data[i, :, :, :]
119 |
120 |         for h in range(height):            # loop on the vertical axis
121 |             for w in range(width):         # loop on the horizontal axis
122 |                 for c in range(channels):  # loop over the channels (depth)
123 |
124 |                     # Find the corners of the current slice.
125 |                     vert_start = h*stride_height
126 |                     vert_end = h*stride_height + filter_height
127 |                     horiz_start = w*stride_width
128 |                     horiz_end = w*stride_width + filter_width
129 |
130 |                     # Route the incoming gradient to the argmax of this slice.
131 |                     max_value = BIG_NEGATIVE
132 |                     for slice_height in range(vert_start, vert_end):
133 |                         for slice_width in range(horiz_start, horiz_end):
134 |                             cct_value = cct_example[c, slice_height, slice_width]
135 |                             if cct_value > max_value:
136 |                                 max_value = cct_value
137 |                                 max_i = slice_height
138 |                                 max_j = slice_width
139 |                     grad_input[i, c, max_i, max_j] += output_grad[i, c, h, w]
140 |     return grad_input
--------------------------------------------------------------------------------
/aurora/nn/pyx/im2col.pyx:
--------------------------------------------------------------------------------
1 | cimport cython
2 | import numpy as np
3 | cimport numpy as np
4 |
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | cdef im2col_inner(np.float64_t[:, :, :, :] x_padded,
8 |                   np.float64_t[:, :] out,
9 |                   int h_new, int w_new, int C, int M,
10 |                   int filter_height, int filter_width,
11 |                   int stride_height, int stride_width):
12 |
13 |     cdef int itr = 0
14 |     cdef int start_i, end_i, start_j, end_j
15 |     cdef int i, j, m
16 |     cdef int k, c, p_h, p_w
17 |
18 |     for i in range(h_new):
19 |         for j in range(w_new):
20 |             for m in range(M):
21 |                 start_i = stride_height * i
22 |                 end_i = stride_height * i + filter_height  # vertical extent spans filter_height rows
23 |                 start_j = stride_width * j
24 |                 end_j = stride_width * j + filter_width    # horizontal extent spans filter_width columns
25 |
26 |                 k = 0
27 |                 for c in range(C):
28 |                     for p_h in range(start_i, end_i):
29 |                         for p_w in range(start_j, end_j):
30 |                             out[k, itr] = x_padded[m, c, p_h, p_w]
31 |                             k += 1
32 |                 itr += 1
33 |
34 |
35 | @cython.boundscheck(False)
36 | @cython.wraparound(False)
37 | cdef col2img_inner(np.float64_t[:, :] cols,
38 |                    np.float64_t[:, :, :, :] x_padded,
39 |                    int filter_height, int filter_width,
40 |                    int N, int C, int H, int W,
41 |                    int H_padded, int W_padded,
42 |                    int padding_height, int padding_width,
43 |                    int stride_height, int stride_width):
44 |     cdef int idx = 0
45 |     cdef int i, j, m, c, sh, sw
46 |     cdef int start_height, start_width, k
47 |     cdef np.float64_t[:] col
48 |
49 |     cdef int p = H_padded - filter_height + 1
50 |     cdef int q = W_padded - filter_width + 1
51 |     i = 0
52 |     while i < p:
53 |         j = 0
54 |         while j < q:
55 |             for m in range(N):
56 |                 col = cols[:, idx]
57 |                 start_height = i
58 |                 start_width = j
59 |                 k = 0
60 |                 for c in range(C):
61 |                     for sh in range(start_height, start_height + filter_height):
62 |                         for sw in range(start_width, start_width + filter_width):
63 |                             x_padded[m, c, sh, sw] += col[k]
64 |                             k += 1
65 |                 idx += 1
66 |             j += stride_width
67 |         i += stride_height
68 |     if padding_height > 0 or padding_width > 0:
69 |         return x_padded[:, :, padding_height:(H_padded - padding_height), padding_width:(W_padded - padding_width)]
70 |     else:
71 |         return x_padded
72 |
73 |
74 | @cython.boundscheck(False)
75 | @cython.wraparound(False)
76 | def im2col(np.float64_t[:, :, :, :] image,
77 |            int filter_height=3, int filter_width=3,
78 |            int padding_height=0, int padding_width=0,
79 |            int stride_height=1, int stride_width=1):
80 |
81 |     cdef int images_per_batch = image.shape[0]
82 |     cdef int n_channels = image.shape[1]
83 |     cdef int img_h = image.shape[2]
84 |     cdef int img_w = image.shape[3]
85 |
86 |     cdef np.float64_t[:, :, :, :] x_padded = np.pad(image, ((0, 0),
87 |                                                             (0, 0),
88 |                                                             (padding_height, padding_height),
89 |                                                             (padding_width, padding_width)),
90 |                                                     mode='constant')
91 |
92 |     cdef int new_h = int((img_h - filter_height + 2 * padding_height) / stride_height + 1)
93 |     cdef int new_w = int((img_w - filter_width + 2 * padding_width) / stride_width + 1)
94 |
95 |     cdef int col_height = filter_width * filter_height * n_channels
96 | cdef int col_width = images_per_batch * new_h * new_w
97 |
98 |     cdef np.float64_t[:, :] result = np.zeros((col_height, col_width))
99 |
100 |     im2col_inner(x_padded, result, new_h, new_w, n_channels, images_per_batch,
101 |                  filter_height, filter_width, stride_height, stride_width)
102 |
103 |     return result
104 |
105 | @cython.boundscheck(False)
106 | @cython.wraparound(False)
107 | def col2im(np.float64_t[:, :] col2img_converted, int batch_size,
108 |            int no_channels, int image_height, int image_width,
109 |            int filter_height=3, int filter_width=3,
110 |            int padding_height=0, int padding_width=0,
111 |            int stride_height=1, int stride_width=1):
112 |
113 |     cdef int padded_h = image_height + 2 * padding_height
114 |     cdef int padded_w = image_width + 2 * padding_width
115 |     cdef np.float64_t[:, :, :, :] result = np.zeros((batch_size, no_channels, padded_h, padded_w))
116 |
117 |     # col2img_inner strips the padding (if any) before returning
118 |     return col2img_inner(col2img_converted, result, filter_height,
119 |                          filter_width, batch_size, no_channels,
120 |                          image_height, image_width, padded_h,
121 |                          padded_w, padding_height, padding_width,
122 |                          stride_height, stride_width)
--------------------------------------------------------------------------------
/aurora/nn/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def softmax_func(x):
5 |     """
6 |     Numerically stable softmax function. For more details
7 |     about numerical calculations please refer to:
8 |     http://www.deeplearningbook.org/slides/04_numerical.pdf
9 |     :param x:
10 |     :return:
11 |     """
12 |     stable_values = x - np.max(x, axis=1, keepdims=True)
13 |     return np.exp(stable_values) / np.sum(np.exp(stable_values), axis=1, keepdims=True)
14 |
15 |
16 | def log_sum_exp(x):
17 |     """
18 |     log_sum_exp is a very useful function in machine learning.
19 |     It can be seen in many places, including the cross-entropy error.
20 |     However, the naive implementation is numerically unstable.
21 |     Therefore, we use the following implementation. For more details
22 |     please refer to: http://www.deeplearningbook.org/slides/04_numerical.pdf
23 |     :param x:
24 |     :return:
25 |     """
26 |     mx = np.max(x, axis=1, keepdims=True)
27 |     safe = x - mx
28 |     return mx + np.log(np.sum(np.exp(safe), axis=1, keepdims=True))
29 |
30 |
31 | # The following two methods were used in the initial version of the convolution operations.
32 | # Later we introduced fast Cython versions of the `im2col` and `col2im` implementations.
33 | # Hence, these two methods are obsolete.
34 | def im2col(image, filter_size=(3, 3), padding=(0, 0), stride=(1, 1)):
35 |     M, C, h, w = image.shape
36 |     filter_height = filter_size[0]
37 |     filter_width = filter_size[1]
38 |     padding_height = padding[0]
39 |     padding_width = padding[1]
40 |     stride_height = stride[0]
41 |     stride_width = stride[1]
42 |     x_padded = np.pad(image, ((0, 0),
43 |                               (0, 0),
44 |                               (padding_height, padding_height),
45 |                               (padding_width, padding_width)),
46 |                       mode='constant')
47 |     h_new = int((h - filter_height + 2 * padding_height) / stride_height + 1)
48 |     w_new = int((w - filter_width + 2 * padding_width) / stride_width + 1)
49 |
50 |     out = np.zeros((filter_width * filter_height * C, M * h_new * w_new), dtype=image.dtype)
51 |
52 |     itr = 0
53 |     for i in range(h_new):
54 |         for j in range(w_new):
55 |             for m in range(M):
56 |                 start_i = stride_height * i
57 |                 end_i = stride_height * i + filter_height  # vertical extent: filter_height rows
58 |                 start_j = stride_width * j
59 |                 end_j = stride_width * j + filter_width    # horizontal extent: filter_width columns
60 |                 out[:, itr] = x_padded[m, :, start_i:end_i, start_j:end_j].ravel()
61 |                 itr += 1
62 |     return out
63 |
64 |
65 | def col2im(cols, x_shape, filter_size=(3, 3), padding=(0, 0), stride=(1, 1)):
66 |     N, C, H, W = x_shape
67 |     filter_height = filter_size[0]
68 |     filter_width = filter_size[1]
69 |     padding_height = padding[0]
70 |     padding_width = padding[1]
71 |     stride_height = stride[0]
72 |     stride_width = stride[1]
73 |
74 |     H_padded, W_padded = H + 2 * padding_height, W + 2 * padding_width
75 |     x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
76 |
77 |     idx = 0
78 |     for i in range(0, H_padded - filter_height + 1, stride_height):
79 |         for j in range(0, W_padded - filter_width + 1, stride_width):
80 |             for m in range(N):
81 |                 col = cols[:, idx]
82 |                 col = col.reshape((C, filter_height, filter_width))
83 |                 x_padded[m, :, i:i + filter_height, j:j + filter_width] += col
84 |                 idx += 1
85 |     if padding_height > 0 or padding_width > 0:
86 |         return x_padded[:, :, padding_height:H_padded - padding_height, padding_width:W_padded - padding_width]
87 |     else:
88 |         return x_padded
--------------------------------------------------------------------------------
/aurora/optim/__init__.py:
--------------------------------------------------------------------------------
1 | from .sgd import SGD
2 | from .adam import Adam
3 |
4 | __all__ = ['SGD', 'Adam']
--------------------------------------------------------------------------------
/aurora/optim/adam.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .base import Base
3 |
4 | try:
5 |     from aurora.ndarray import gpu_op, ndarray
6 | except ImportError:
7 |     pass
8 |
9 |
10 | class Adam(Base):
11 |     def __init__(self, cost, params, lr=1e-3, beta1=0.9, beta2=0.995, eps=1e-5, use_gpu=False):
12 |         super().__init__(cost, params, lr, use_gpu=use_gpu)
13 |         self.beta1 = beta1
14 |         self.beta2 = beta2
15 |
16 |         if self.use_gpu:
17 |             self.velocity = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
18 |                              for param in params]
19 |             self.momentum = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
20 |                              for param in params]
21 |
22 |             self.vec_hat = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
23 |                             for param in self.params]
24 |             self.mom_hat = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0))
25 |                             for param in self.params]
26 |         else:
27 |             self.velocity = [np.zeros_like(param.const) for param in params]
28 |             self.momentum = [np.zeros_like(param.const) for param in params]
29 |
30 |         self.time = 0
31 |         self.eps = eps
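    # step() below implements the standard bias-corrected Adam update; for a
    # single parameter with gradient g_t at step t it computes:
    #
    #     m_t   = beta1 * m_{t-1} + (1 - beta1) * g_t         # first moment
    #     v_t   = beta2 * v_{t-1} + (1 - beta2) * g_t ** 2    # second moment
    #     m_hat = m_t / (1 - beta1 ** t)                      # bias correction
    #     v_hat = v_t / (1 - beta2 ** t)
    #     param = param - lr * m_hat / (sqrt(v_hat) + eps)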
32 |
33 |     def step(self, feed_dict):
34 |         exe_output = self.executor.run(feed_dict)
35 |         self.time += 1
36 |
37 |         if self.use_gpu:
38 |             # zero out the scratch buffers for the bias-corrected moments
39 |             for i in range(len(self.vec_hat)):
40 |                 gpu_op.matrix_elementwise_multiply_by_const(self.vec_hat[i], 0.0, self.vec_hat[i])
41 |                 gpu_op.matrix_elementwise_multiply_by_const(self.mom_hat[i], 0.0, self.mom_hat[i])
42 |
43 |             for i in range(len(self.params)):
44 |                 gpu_op.matrix_elementwise_multiply_by_const(self.momentum[i], self.beta1, self.momentum[i])
45 |
46 |                 # TODO: (upul) copying dev->host->dev is expensive. We need a better approach.
47 |                 tem_gpu_array = ndarray.array(exe_output[i + 1].asnumpy(), ctx=ndarray.gpu(0))
48 |                 gpu_op.matrix_elementwise_multiply_by_const(exe_output[i + 1], (1 - self.beta1), tem_gpu_array)
49 |                 gpu_op.matrix_elementwise_add(self.momentum[i], tem_gpu_array, self.momentum[i])
50 |                 gpu_op.matrix_elementwise_div_by_const(self.momentum[i], (1 - self.beta1 ** self.time), self.mom_hat[i])
51 |
52 |                 gpu_op.matrix_elementwise_multiply_by_const(self.velocity[i], self.beta2, self.velocity[i])
53 |                 gpu_op.matrix_elementwise_multiply(exe_output[i + 1], exe_output[i + 1], exe_output[i + 1])
54 |                 gpu_op.matrix_elementwise_multiply_by_const(exe_output[i + 1], (1 - self.beta2), exe_output[i + 1])
55 |                 gpu_op.matrix_elementwise_add(self.velocity[i], exe_output[i + 1], self.velocity[i])
56 |                 gpu_op.matrix_elementwise_div_by_const(self.velocity[i], (1 - self.beta2 ** self.time), self.vec_hat[i])
57 |
58 |             for i in range(len(self.params)):
59 |                 gpu_op.matrix_elementwise_sqrt(self.vec_hat[i], self.vec_hat[i])
60 |                 gpu_op.matrix_elementwise_add_by_const(self.vec_hat[i], self.eps, self.vec_hat[i])
61 |
62 |                 gpu_op.matrix_elementwise_multiply_by_const(self.mom_hat[i], -1 * self.lr, self.mom_hat[i])
63 |                 gpu_op.matrix_elementwise_division(self.mom_hat[i], self.vec_hat[i], self.mom_hat[i])
64 |                 gpu_op.matrix_elementwise_add(self.params[i].const, self.mom_hat[i], self.params[i].const)
65 |
66 |         else:
67 |             vec_hat = [np.zeros_like(param.const) for param in self.params]
68 |             mom_hat = [np.zeros_like(param.const) for param in self.params]
69 |
70 |             for i in range(len(self.params)):
71 |                 self.momentum[i] = self.beta1 * self.momentum[i] + (1 - self.beta1) * exe_output[i + 1]
72 |                 mom_hat[i] = self.momentum[i] / (1 - self.beta1 ** self.time)
73 |
74 |                 self.velocity[i] = self.beta2 * self.velocity[i] + (1 - self.beta2) * (exe_output[i + 1] ** 2)
75 |                 vec_hat[i] = self.velocity[i] / (1 - self.beta2 ** self.time)
76 |
77 |             for i in range(len(self.params)):
78 |                 self.params[i].const += -self.lr * mom_hat[i] / (np.sqrt(vec_hat[i]) + self.eps)
79 |
80 |         cost = exe_output[0]
81 |         if self.use_gpu:
82 |             cost = cost.asnumpy()
83 |         return cost
84 |
--------------------------------------------------------------------------------
/aurora/optim/base.py:
--------------------------------------------------------------------------------
1 | import aurora.autodiff as ad
2 | try:
3 |     from aurora.ndarray import ndarray
4 | except ImportError:
5 |     pass
6 |
7 |
8 | class Base:
9 |     def __init__(self, cost, params, lr=0.1, use_gpu=False):
10 |         self.cost = cost
11 |
12 |         # if use_gpu == True, create matrices in GPU
13 |         self.params = self._copy_to_gpu(params) if use_gpu else params
14 |         self.lr = lr
15 |         grads = ad.gradients(cost, params)
16 |         grads.insert(0, cost)
17 |         self.use_gpu = use_gpu
18 |         self.executor = ad.Executor(grads, use_gpu=use_gpu)
19 |
20 |     def step(self, feed_dict):
21 |         raise NotImplementedError('This method should be
implemented by subclasses') 22 | 23 | @staticmethod 24 | def _copy_to_gpu(params): 25 | ctx = ndarray.gpu(0) 26 | gpu_arrays = [] 27 | for param in params: 28 | param.const = ndarray.array(param.const, ctx=ctx) 29 | gpu_arrays.append(param) 30 | return gpu_arrays 31 | -------------------------------------------------------------------------------- /aurora/optim/sgd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .base import Base 3 | try: 4 | from aurora.ndarray import gpu_op, ndarray 5 | except ImportError: 6 | pass 7 | 8 | 9 | class SGD(Base): 10 | def __init__(self, cost, params, lr=0.1, momentum=0.9, use_gpu=False): 11 | super().__init__(cost, params, lr=lr, use_gpu=use_gpu) 12 | self.momentum = momentum 13 | if use_gpu: 14 | self.velocity = [ndarray.array(np.zeros_like(param.const.asnumpy()), ctx=ndarray.gpu(0)) 15 | for param in params] 16 | else: 17 | self.velocity = [np.zeros_like(param.const) for param in params] 18 | 19 | def step(self, feed_dict): 20 | exe_output = self.executor.run(feed_dict) 21 | for i in range(len(self.params)): 22 | if self.use_gpu: 23 | gpu_op.matrix_elementwise_multiply_by_const(self.velocity[i], self.momentum, self.velocity[i]) 24 | gpu_op.matrix_elementwise_multiply_by_const(exe_output[1 + i], -self.lr, exe_output[1 + i]) 25 | gpu_op.matrix_elementwise_add(self.velocity[i], exe_output[1 + i], self.velocity[i]) 26 | 27 | gpu_op.matrix_elementwise_add(self.params[i].const, self.velocity[i], self.params[i].const) 28 | else: 29 | self.velocity[i] = self.momentum * self.velocity[i] - self.lr * exe_output[1 + i] 30 | self.params[i].const += self.velocity[i] 31 | 32 | cost = exe_output[0] 33 | if self.use_gpu: 34 | cost = cost.asnumpy() 35 | return cost 36 | -------------------------------------------------------------------------------- /cuda/Makefile: -------------------------------------------------------------------------------- 1 | CUDA_DIR = /usr/local/cuda 2 | 3 | CC_SRCS := $(wildcard src/*.cc) 4 | CC_OBJS := ${CC_SRCS:src/%.cc=build/obj/%.o} 5 | CUDA_SRCS := $(wildcard src/*.cu) 6 | CUDA_OBJS := ${CUDA_SRCS:src/%.cu=build/obj/%.o} 7 | OBJS := $(CC_OBJS) $(CUDA_OBJS) 8 | 9 | CC = g++ 10 | WARNINGS = -Wall -Wfatal-errors -Wno-unused -Wno-unused-result 11 | CC_FLAGS = -std=c++11 -fPIC $(WARNINGS) -I$(CUDA_DIR)/include 12 | LD_FLAGS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcublas -lcudnn 13 | 14 | NVCC = nvcc 15 | NVCC_FLAGS = -std=c++11 --compiler-options '-fPIC' 16 | ARCH = -gencode arch=compute_30,code=sm_30 \ 17 | -gencode arch=compute_35,code=sm_35 \ 18 | -gencode arch=compute_50,code=[sm_50,compute_50] \ 19 | -gencode arch=compute_52,code=[sm_52,compute_52] 20 | 21 | all: build/lib/libc_runtime_api.so 22 | 23 | build/lib/libc_runtime_api.so: $(OBJS) 24 | @mkdir -p build/lib 25 | $(CC) -shared $^ -o $@ $(LD_FLAGS) 26 | 27 | build/obj/%.o: src/%.cc 28 | @mkdir -p build/obj 29 | $(CC) $(CC_FLAGS) -c $< -o $@ 30 | 31 | build/obj/%.o: src/%.cu 32 | @mkdir -p build/obj 33 | $(NVCC) $(ARCH) $(NVCC_FLAGS) -c $< -o $@ 34 | 35 | clean: 36 | rm -rf build 37 | 38 | .PHONY: clean 39 | -------------------------------------------------------------------------------- /cuda/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | project(assignment2) 3 | 4 | set(CMAKE_CXX_STANDARD 11) 5 | 6 | set(SOURCE_FILES 7 | src/c_runtime_api.cc 8 | src/c_runtime_api.h 9 | src/cpu_device_api.cc 10 | src/cpu_device_api.h 11 | 
src/cuda_device_api.cc
12 |         src/cuda_device_api.h
13 |         src/device_api.h
14 |         src/dlarray.h
15 |         src/runtime_base.h)
16 |
17 | add_executable(assignment2 ${SOURCE_FILES})
--------------------------------------------------------------------------------
/cuda/src/c_runtime_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  *  Copyright (c) 2017 by Contributors
3 |  * \file c_runtime_api.cc
4 |  * \brief Device specific implementations
5 |  */
6 | #include "./c_runtime_api.h"
7 | #include "./cpu_device_api.h"
8 | #include "./cuda_device_api.h"
9 | #include "./runtime_base.h"
10 | #include <algorithm>
11 | #include <array>
12 | #include <cassert>
13 | #include <cstdlib>
14 | #include <cstring>
15 | #include <iostream>
16 | #include <string>
17 | #include <thread>
18 |
19 | namespace dlsys {
20 |     namespace runtime {
21 |
22 |         class DeviceAPIManager {
23 |         public:
24 |             static const int kMaxDeviceAPI = 8;
25 |
26 |             // Get API
27 |             static DeviceAPI *Get(DLContext ctx) {
28 |                 return Global()->GetAPI(ctx.device_type);
29 |             }
30 |
31 |         private:
32 |             std::array<DeviceAPI *, kMaxDeviceAPI> api_;
33 |
34 |             DeviceAPIManager() {
35 |                 std::fill(api_.begin(), api_.end(), nullptr);
36 |                 static CPUDeviceAPI cpu_device_api_inst;
37 |                 static CUDADeviceAPI gpu_device_api_inst;
38 |                 api_[kCPU] = static_cast<DeviceAPI *>(&cpu_device_api_inst);
39 |                 api_[kGPU] = static_cast<DeviceAPI *>(&gpu_device_api_inst);
40 |             }
41 |
42 |             // Get global static variable.
43 |             static DeviceAPIManager *Global() {
44 |                 static DeviceAPIManager inst;
45 |                 return &inst;
46 |             }
47 |
48 |             // Get API.
49 |             DeviceAPI *GetAPI(DLDeviceType type) {
50 |                 if (api_[type] == nullptr) {
51 |                     std::cerr << "Device API not supported" << std::endl;
52 |                     exit(EXIT_FAILURE);
53 |                 }
54 |                 return api_[type];
55 |             }
56 |         };
57 |
58 |         inline DLArray *DLArrayCreate_() {
59 |             DLArray *arr = new DLArray();
60 |             arr->shape = nullptr;
61 |             arr->ndim = 0;
62 |             arr->data = nullptr;
63 |             return arr;
64 |         }
65 |
66 |         inline void DLArrayFree_(DLArray *arr) {
67 |             if (arr != nullptr) {
68 |                 // ok to delete nullptr
69 |                 delete[] arr->shape;
70 |                 if (arr->data != nullptr) {
71 |                     DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(arr->ctx, arr->data);
72 |                 }
73 |             }
74 |             delete arr;
75 |         }
76 |
77 |         inline size_t GetDataSize(DLArray *arr) {
78 |             size_t size = 1;
79 |             for (index_t i = 0; i < arr->ndim; ++i) {
80 |                 size *= arr->shape[i];
81 |             }
82 |             // assume 32-bit float
83 |             size *= 4;
84 |             return size;
85 |         }
86 |
87 |         inline size_t GetDataAlignment(DLArray *arr) {
88 |             // assume 32-bit float
89 |             return 8;
90 |         }
91 |
92 |     } // namespace runtime
93 | } // namespace dlsys
94 |
95 | using namespace dlsys::runtime;
96 |
97 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx,
98 |                  DLArrayHandle *out) {
99 |     DLArray *arr = nullptr;
100 |     API_BEGIN();
101 |     // shape
102 |     arr = DLArrayCreate_();
103 |     // ndim
104 |     arr->ndim = ndim;
105 |     index_t *shape_copy = new index_t[ndim];
106 |     std::copy(shape, shape + ndim, shape_copy);
107 |     arr->shape = shape_copy;
108 |     // ctx
109 |     arr->ctx = ctx;
110 |     size_t size = GetDataSize(arr);
111 |     size_t alignment = GetDataAlignment(arr);
112 |     arr->data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, size, alignment);
113 |     *out = arr;
114 |     API_END_HANDLE_ERROR(DLArrayFree_(arr));
115 | }
116 |
117 | int DLArrayFree(DLArrayHandle handle) {
118 |     API_BEGIN();
119 |     DLArray *arr = handle;
120 |     DLArrayFree_(arr);
121 |     API_END();
122 | }
123 |
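// Usage sketch (an illustrative comment only, not compiled into the library):
// the typical DLArray lifecycle as driven by the ctypes wrappers in
// aurora/ndarray/ndarray.py -- allocate, copy, free. Error handling omitted,
// and the DLContext field names follow the Python ctypes mirror.
//
//     index_t shape[2] = {2, 3};
//     DLArrayHandle arr;
//     DLContext ctx;
//     ctx.device_id = 0;
//     ctx.device_type = kCPU;
//     DLArrayAlloc(shape, 2, ctx, &arr);   // returns 0 on success
//     // ... DLArrayCopyFromTo(host_arr, arr, nullptr); ...
//     DLArrayFree(arr);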
124 | int DLArrayReshape(const DLArrayHandle handle, const index_t *new_shape, index_t new_dim) {
125 |     API_BEGIN();
126 |     DLArray *arr = handle;
127 |     delete[] arr->shape;  // release the old shape buffer before replacing it
128 |     index_t *shape_copy = new index_t[new_dim];
129 |     std::copy(new_shape, new_shape + new_dim, shape_copy);
130 |     arr->shape = shape_copy;
131 |     arr->ndim = new_dim;
132 |     API_END();
133 | }
134 |
135 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to,
136 |                       DLStreamHandle stream) {
137 |     API_BEGIN();
138 |     size_t from_size = GetDataSize(from);
139 |     size_t to_size = GetDataSize(to);
140 |     // The size must exactly match
141 |     assert(from_size == to_size);
142 |     DLContext ctx = from->ctx;
143 |     if (ctx.device_type == kCPU) {
144 |         ctx = to->ctx;
145 |     } else {
146 |         // Cannot copy across different ctx types directly
147 |         assert((to->ctx.device_type == kCPU) ||
148 |                (to->ctx.device_type == from->ctx.device_type));
149 |     }
150 |     DeviceAPIManager::Get(ctx)->CopyDataFromTo(from->data, to->data, from_size,
151 |                                                from->ctx, to->ctx, stream);
152 |     API_END();
153 | }
154 |
--------------------------------------------------------------------------------
/cuda/src/c_runtime_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  *  Copyright (c) 2017 by Contributors
3 |  * \file c_runtime_api.h
4 |  * \brief DL runtime library.
5 |  *
6 |  */
7 |
8 | #ifndef DLSYS_RUNTIME_C_RUNTIME_API_H_
9 | #define DLSYS_RUNTIME_C_RUNTIME_API_H_
10 |
11 | #ifdef __cplusplus
12 | #define DLSYS_EXTERN_C extern "C"
13 | #else
14 | #define DLSYS_EXTERN_C
15 | #endif
16 |
17 | #include "dlarray.h"
18 | #include <stddef.h>
19 | #include <stdint.h>
20 |
21 | DLSYS_EXTERN_C {
22 | /*! \brief type of array index. */
23 | typedef int64_t index_t;
24 |
25 | /*! \brief the array handle */
26 | typedef DLArray *DLArrayHandle;
27 | /*!
28 |  * \brief The stream that is specific to device
29 |  * can be NULL, which indicates the default one.
30 |  */
31 | typedef void *DLStreamHandle;
32 |
33 | // Array related apis for quick prototyping
34 | /*!
35 |  * \brief Allocate a nd-array's memory,
36 |  *  including space of shape, of given spec.
37 |  *
38 |  * \param shape The shape of the array, the data content will be copied to out
39 |  * \param ndim The number of dimension of the array.
40 |  * \param ctx The ctx this array sits on.
41 |  * \param out The output handle.
42 |  * \return 0 when success, -1 when failure happens
43 |  */
44 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx,
45 |                  DLArrayHandle *out);
46 |
47 | /*!
48 |  * \brief Free the DL Array.
49 |  * \param handle The array handle to be freed.
50 |  * \return 0 when success, -1 when failure happens
51 |  */
52 | int DLArrayFree(DLArrayHandle handle);
53 |
54 | /*!
55 |  * \brief Copy the array, both from and to must be valid during the copy.
56 |  * \param from The array to be copied from.
57 |  * \param to The target space.
58 |  * \param stream The stream where the copy happens, can be NULL.
59 |  * \return 0 when success, -1 when failure happens
60 |  */
61 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to,
62 |                       DLStreamHandle stream);
63 |
64 | /*!
65 |  * \brief Set all array elements to given value.
66 |  * \param arr The array to be Set.
67 |  * \param value The target value.
68 |  * \return 0 when success, -1 when failure happens
69 |  */
70 | int DLGpuArraySet(DLArrayHandle arr, float value);
71 |
72 |
73 | int DLArrayReshape(const DLArrayHandle handle, const index_t *new_shape, index_t new_dim);
74 |
75 | /*!
76 |  * \brief Broadcast input array to output array.
77 |  * \param input The input array.
78 |  * \param output The output array.
79 | * \return 0 when success, -1 when failure happens 80 | */ 81 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output); 82 | 83 | /*! 84 | * \brief Reduce sum input array by axis=0 and store to output. 85 | * \param input The input array. 86 | * \param output The output array. 87 | * \return 0 when success, -1 when failure happens 88 | */ 89 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output); 90 | 91 | /*! 92 | * \brief Elementwise add two matrices and store to output. 93 | * \param matA The left input array. 94 | * \param matB The right input array. 95 | * \param output The output array. 96 | * \return 0 when success, -1 when failure happens 97 | */ 98 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 99 | const DLArrayHandle matB, DLArrayHandle output); 100 | 101 | /*! 102 | * \brief Add matrix by const and store to output. 103 | * \param input The input array. 104 | * \param val The constant. 105 | * \param output The output array. 106 | * \return 0 when success, -1 when failure happens 107 | */ 108 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 109 | DLArrayHandle output); 110 | 111 | 112 | int DLGpuMatrixElementwiseSubtract(const DLArrayHandle matA, 113 | const DLArrayHandle matB, DLArrayHandle output); 114 | 115 | int DLGpuMatrixElementwiseSubtractByConst(const DLArrayHandle input, float val, 116 | DLArrayHandle output); 117 | 118 | /*! 119 | * \brief Elementwise multiply two matrices and store to output. 120 | * \param matA The left input array. 121 | * \param matB The right input array. 122 | * \param output The output array. 123 | * \return 0 when success, -1 when failure happens 124 | */ 125 | int DLGpuMatrixElementwiseMultiply( 126 | const DLArrayHandle matA, const DLArrayHandle matB, DLArrayHandle output); 127 | 128 | /*! 129 | * \brief Multiply matrix by const and store to output. 130 | * \param input The input array. 131 | * \param val The constant. 132 | * \param output The output array. 133 | * \return 0 when success, -1 when failure happens 134 | */ 135 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 136 | DLArrayHandle output); 137 | 138 | 139 | // TODO: (upul) documentation 140 | int DLGpuMatrixElementwiseDiv(const DLArrayHandle matA, 141 | const DLArrayHandle matB, 142 | DLArrayHandle output); 143 | 144 | // TODO: (upul) documentation 145 | int DLGpuMatrixElementwiseDivByConst(const DLArrayHandle matA, float val, 146 | DLArrayHandle output); 147 | 148 | /*! 149 | * \brief Matrix multiply two matrices and store to output. 150 | * \param matA The left input array. 151 | * \param transposeA Whether matA needs to be transposed 152 | * \param matB The right input array. 153 | * \param transposeB Whether matB needs to be transposed 154 | * \param output The output array. 155 | * \return 0 when success, -1 when failure happens 156 | */ 157 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 158 | const DLArrayHandle matB, bool transposeB, 159 | DLArrayHandle matC); 160 | 161 | /*! 162 | * \brief Compute relu on all array elements, and store to output. 163 | * \param input The input array. 164 | * \param output The output value. 165 | * \return 0 when success, -1 when failure happens 166 | */ 167 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output); 168 | 169 | /*! 170 | * \brief Compute relu gradient, and store to output. 171 | * \param input The input array. 172 | * \param in_grad The input gradients value. 
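 *  \note in_grad is passed through where the corresponding input element is
 *  positive and zeroed elsewhere (see relu_gradient_kernel in gpu_op.cu).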
173 | * \param output The output array. 174 | * \return 0 when success, -1 when failure happens 175 | */ 176 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 177 | DLArrayHandle output); 178 | 179 | /*! 180 | * \brief Compute softmax on matrix, and store to output. 181 | * \param input The input array. 182 | * \param output The output value. 183 | * \return 0 when success, -1 when failure happens 184 | */ 185 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output); 186 | 187 | /*! 188 | * \brief Compute softmax_cross_entropy. 189 | * np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 190 | * \param input_a The y array. 191 | * \param input_b The y_ array. 192 | * \param output The output value. 193 | * \return 0 when success, -1 when failure happens 194 | */ 195 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 196 | const DLArrayHandle input_b, 197 | DLArrayHandle output); 198 | 199 | int DLGpuMatrixElementwiseSqrt(const DLArrayHandle input_a, DLArrayHandle output); 200 | 201 | /* 202 | * CUDNN.... 203 | */ 204 | int cudnnReLUForward(const DLArrayHandle input, DLArrayHandle output); 205 | 206 | int cudnnConv2DForward(const DLArrayHandle input, 207 | const DLArrayHandle filter, 208 | const DLArrayHandle bias, 209 | const int stride_height, 210 | const int stride_width, 211 | const int padding_height, 212 | const int padding_width, 213 | DLArrayHandle output); 214 | 215 | int cudnnPoolForward(const DLArrayHandle input, 216 | const int pooling_height, 217 | const int pooling_width, 218 | const int stride_height, 219 | const int stride_width, 220 | const char *mode, 221 | DLArrayHandle output); 222 | 223 | int cudnnPoolBackward(const DLArrayHandle input, 224 | const DLArrayHandle output_grads, 225 | const DLArrayHandle output, 226 | const int pooling_height, 227 | const int pooling_width, 228 | const int stride_height, 229 | const int stride_width, 230 | const char *mode, 231 | DLArrayHandle pool_grad); 232 | 233 | int cudnnConv2DBackwardFilter(const DLArrayHandle input, 234 | const DLArrayHandle output_grads, 235 | const int stride_height, 236 | const int stride_width, 237 | const int padding_height, 238 | const int padding_width, 239 | DLArrayHandle filter_grad); 240 | 241 | int cudnnConv2DBackwardData(const DLArrayHandle filter, 242 | const DLArrayHandle output_grads, 243 | const int stride_height, 244 | const int stride_width, 245 | const int padding_height, 246 | const int padding_width, 247 | DLArrayHandle data_grad); 248 | 249 | int cudnnConv2DBackwardBias(const DLArrayHandle output_grads, 250 | DLArrayHandle bias_grads); 251 | 252 | } // DLSYS_EXTERN_C 253 | 254 | #endif // DLSYS_RUNTIME_C_RUNTIME_API_H_ 255 | -------------------------------------------------------------------------------- /cuda/src/cpu_device_api.cc: -------------------------------------------------------------------------------- 1 | /*! 
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cpu_device_api.cc
4 |  */
5 | #include "./cpu_device_api.h"
6 | #include <cstdlib>
7 | #include <cstring>
8 | #include <iostream>
9 |
10 | namespace dlsys {
11 | namespace runtime {
12 |
13 | void *CPUDeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
14 |                                    size_t alignment) {
15 |     // std::cout << "allocating cpu data" << std::endl;
16 |     void *ptr;
17 |     int ret = posix_memalign(&ptr, alignment, size);
18 |     if (ret != 0)
19 |         throw std::bad_alloc();
20 |     return ptr;
21 | }
22 |
23 | void CPUDeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) { free(ptr); }
24 |
25 | void CPUDeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
26 |                                   DLContext ctx_from, DLContext ctx_to,
27 |                                   DLStreamHandle stream) {
28 |     // std::cout << "copying cpu data" << std::endl;
29 |     memcpy(to, from, size);
30 | }
31 |
32 | void CPUDeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {}
33 |
34 | } // namespace runtime
35 | } // namespace dlsys
36 |
--------------------------------------------------------------------------------
/cuda/src/cpu_device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cpu_device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_CPU_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_CPU_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include "device_api.h"
11 | #include <cstring>
12 | #include <string>
13 |
14 | namespace dlsys {
15 | namespace runtime {
16 |
17 | class CPUDeviceAPI : public DeviceAPI {
18 | public:
19 |     void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
20 |
21 |     void FreeDataSpace(DLContext ctx, void *ptr) final;
22 |
23 |     void CopyDataFromTo(const void *from, void *to, size_t size,
24 |                         DLContext ctx_from, DLContext ctx_to, DLStreamHandle stream) final;
25 |
26 |     void StreamSync(DLContext ctx, DLStreamHandle stream) final;
27 | };
28 |
29 | } // namespace runtime
30 | } // namespace dlsys
31 | #endif // DLSYS_RUNTIME_CPU_DEVICE_API_H_
32 |
--------------------------------------------------------------------------------
/cuda/src/cuda_device_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cuda_device_api.cc
4 |  * \brief GPU specific API
5 |  */
6 |
7 | #include "./cuda_device_api.h"
8 | #include <cassert>
9 | #include <cuda_runtime.h>
10 | #include <iostream>
11 |
12 | #define CUDA_CALL(func)                                                    \
13 |     {                                                                      \
14 |         cudaError_t e = (func);                                           \
15 |         assert((e == cudaSuccess) || (e == cudaErrorCudartUnloading));    \
16 |     }
17 |
18 | namespace dlsys {
19 | namespace runtime {
20 |
21 | static void GPUCopy(const void *from, void *to, size_t size,
22 |                     cudaMemcpyKind kind, cudaStream_t stream) {
23 |     if (stream != 0) {
24 |         CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
25 |     } else {
26 |         CUDA_CALL(cudaMemcpy(to, from, size, kind));
27 |     }
28 | }
29 |
30 | void *CUDADeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
31 |                                     size_t alignment) {
32 |     // std::cout << "allocating cuda data" << std::endl;
33 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
34 |     assert((256 % alignment) == 0U); // CUDA space is aligned at 256 bytes
35 |     void *ret;
36 |     CUDA_CALL(cudaMalloc(&ret, size));
37 |     return ret;
38 | }
39 |
40 | void CUDADeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) {
41 |     // std::cout << "releasing cuda data" << std::endl;
42 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
43 |     CUDA_CALL(cudaFree(ptr));
44 | }
45 |
46 | void CUDADeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
47 |                                    DLContext ctx_from, DLContext ctx_to, DLStreamHandle stream) {
48 |     // std::cout << "copying cuda data" << std::endl;
49 |     cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
50 |     if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
51 |         CUDA_CALL(cudaSetDevice(ctx_from.device_id));
52 |         if (ctx_from.device_id == ctx_to.device_id) {
53 |             GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
54 |         } else {
55 |             CUDA_CALL(cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id,
56 |                                           size, cu_stream));
57 |         }
58 |     } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
59 |         CUDA_CALL(cudaSetDevice(ctx_from.device_id));
60 |         GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
61 |     } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
62 |         CUDA_CALL(cudaSetDevice(ctx_to.device_id));
63 |         GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
64 |     } else {
65 |         std::cerr << "expect copy from/to GPU or between GPU" << std::endl;
66 |     }
67 | }
68 |
69 | void CUDADeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {
70 |     CUDA_CALL(cudaSetDevice(ctx.device_id));
71 |     CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
72 | }
73 |
74 | } // namespace runtime
75 | } // namespace dlsys
76 |
--------------------------------------------------------------------------------
/cuda/src/cuda_device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file cuda_device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_CUDA_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_CUDA_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include "device_api.h"
11 | #include <cuda_runtime.h>
12 |
13 | #include <memory>
14 | #include <string>
15 |
16 | namespace dlsys {
17 | namespace runtime {
18 |
19 | class CUDADeviceAPI : public DeviceAPI {
20 | public:
21 |     void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
22 |
23 |     void FreeDataSpace(DLContext ctx, void *ptr) final;
24 |
25 |     void CopyDataFromTo(const void *from, void *to, size_t size,
26 |                         DLContext ctx_from, DLContext ctx_to,
27 |                         DLStreamHandle stream) final;
28 |
29 |     void StreamSync(DLContext ctx, DLStreamHandle stream) final;
30 | };
31 |
32 | } // namespace runtime
33 | } // namespace dlsys
34 | #endif // DLSYS_RUNTIME_CUDA_DEVICE_API_H_
35 |
--------------------------------------------------------------------------------
/cuda/src/device_api.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file device_api.h
4 |  * \brief Device specific API
5 |  */
6 | #ifndef DLSYS_RUNTIME_DEVICE_API_H_
7 | #define DLSYS_RUNTIME_DEVICE_API_H_
8 |
9 | #include "c_runtime_api.h"
10 | #include <cassert>
11 | #include <string>
12 |
13 | namespace dlsys {
14 | namespace runtime {
15 |
16 | class DeviceAPI {
17 | public:
18 |     /*! \brief virtual destructor */
19 |     virtual ~DeviceAPI() {}
20 |
21 |     /*!
22 |      * \brief Allocate a data space on device.
23 |      * \param ctx The device context to perform operation.
24 |      * \param size The size of the memory
25 |      * \param alignment The alignment of the memory.
26 |      * \return The allocated device pointer
27 |      */
28 |     virtual void *AllocDataSpace(DLContext ctx, size_t size,
29 |                                  size_t alignment) = 0;
30 |
31 |     /*!
32 |      * \brief Free a data space on device.
33 |      * \param ctx The device context to perform operation.
34 |      * \param ptr The data space.
35 |      */
36 |     virtual void FreeDataSpace(DLContext ctx, void *ptr) = 0;
37 |
38 |     /*!
39 |      * \brief copy data from one place to another
40 |      * \param from The source array.
41 |      * \param to The target array.
42 |      * \param size The size of the memory
43 |      * \param ctx_from The source context
44 |      * \param ctx_to The target context
45 |      * \param stream The stream where the copy happens, can be NULL
46 |      */
47 |     virtual void CopyDataFromTo(const void *from, void *to, size_t size,
48 |                                 DLContext ctx_from, DLContext ctx_to,
49 |                                 DLStreamHandle stream) = 0;
50 |
51 |     /*!
52 |      * \brief Synchronize the stream
53 |      * \param ctx The context to perform operation.
54 |      * \param stream The stream to be synchronized.
55 |      */
56 |     virtual void StreamSync(DLContext ctx, DLStreamHandle stream) = 0;
57 | };
58 |
59 | } // namespace runtime
60 | } // namespace dlsys
61 | #endif // DLSYS_RUNTIME_DEVICE_API_H_
62 |
--------------------------------------------------------------------------------
/cuda/src/dlarray.h:
--------------------------------------------------------------------------------
1 | /*!
2 |  * Copyright (c) 2017 by Contributors
3 |  * \file dlarray.h
4 |  * \brief Header that defines array struct.
5 |  */
6 | #ifndef DLSYS_H_
7 | #define DLSYS_H_
8 |
9 | #ifdef __cplusplus
10 | #define DLSYS_EXTERN_C extern "C"
11 | #else
12 | #define DLSYS_EXTERN_C
13 | #endif
14 |
15 | #include <stddef.h>
16 | #include <stdint.h>
17 |
18 | DLSYS_EXTERN_C {
19 | /*!
20 |  * \brief The device type in DLContext.
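 *  Note: kCPU (= 1) and kGPU (= 2) also serve as indices into
 *  DeviceAPIManager::api_ in c_runtime_api.cc, so they must stay below
 *  kMaxDeviceAPI.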
21 |  */
22 | typedef enum {
23 |     kCPU = 1,
24 |     kGPU = 2,
25 | } DLDeviceType;
26 |
27 | /*!
28 |  * \brief A Device context for array.
29 |  */
30 | typedef struct {
31 |     /*! \brief The device index */
32 |     int device_id;
33 |     /*! \brief The device type used in the device. */
34 |     DLDeviceType device_type;
35 | } DLContext;
36 |
37 | /*!
38 |  * \brief Plain C Array object, does not manage memory.
39 |  */
40 | typedef struct {
41 |     /*!
42 |      * \brief The opaque data pointer points to the allocated data.
43 |      *  This will be CUDA device pointer or cl_mem handle in OpenCL.
44 |      *  This pointer is always aligned to 256 bytes as in CUDA.
45 |      */
46 |     void *data;
47 |     /*! \brief The device context of the tensor */
48 |     DLContext ctx;
49 |     /*! \brief Number of dimensions */
50 |     int ndim;
51 |     /*! \brief The shape of the tensor */
52 |     int64_t *shape;
53 | } DLArray;
54 |
55 | } // DLSYS_EXTERN_C
56 | #endif // DLSYS_H_
57 |
--------------------------------------------------------------------------------
/cuda/src/gpu_op.cu:
--------------------------------------------------------------------------------
1 | #include "./c_runtime_api.h"
2 | #include <cassert>
3 | #include <cstdio>
4 | #include <cublas_v2.h>
5 | #include <cuda_runtime.h>
6 | #include <math.h>
7 |
8 | /* TODO: Your code here */
9 | /* all your GPU kernel code, e.g. matrix_softmax_cross_entropy_kernel */
10 |
11 | // y = inputs[0], y_ = inputs[1]
12 | // np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True)
13 | __global__ void matrix_softmax_cross_entropy_kernel(int nrow, int ncol,
14 |                                                     const float *input_a, const float *input_b, float *output) {
15 |     // Dynamic shared memory, size provided at kernel launch.
16 |     extern __shared__ float loss_per_row[];
17 |     // Two dimensional thread blocks.
18 |     int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x
19 |             + threadIdx.x;
20 |     if (y >= nrow) {
21 |         return;
22 |     }
23 |     input_a += y * ncol;
24 |     input_b += y * ncol;
25 |     float maxval = *input_a;
26 |     // Find max for a row.
27 |     for (int x = 1; x < ncol; ++x) {
28 |         maxval = max(maxval, input_a[x]);
29 |     }
30 |     // Subtract the row max, and raise to exp.
31 |     float sum = 0;
32 |     for (int x = 0; x < ncol; ++x) {
33 |         sum += exp(input_a[x] - maxval);
34 |     }
35 |     // Compute per-row loss.
36 |     float loss = 0;
37 |     for (int x = 0; x < ncol; ++x) {
38 |         loss -= input_b[x] * log(exp(input_a[x] - maxval) / sum);
39 |     }
40 |     loss_per_row[y] = loss;
41 |     __syncthreads();
42 |     // Compute reduce_mean across rows.
43 |     float mean_loss = 0;
44 |     // Use a single thread to reduce mean across rows.
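    // Correctness note: this serial reduction works only because the kernel is
    // launched from DLGpuSoftmaxCrossEntropy with a single block, so the
    // __syncthreads() above has already published every loss_per_row entry to
    // thread (0, 0). A rough sketch of a parallel alternative (assuming the
    // early return above is removed so all threads reach the barrier, and nrow
    // is a power of two) would be a shared-memory tree reduction:
    //     for (int s = nrow / 2; s > 0; s >>= 1) {
    //         if (y < s) loss_per_row[y] += loss_per_row[y + s];
    //         __syncthreads();
    //     }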
45 | if ((threadIdx.x == 0) && (threadIdx.y == 0)) { 46 | for (int i = 0; i < nrow; ++i) { 47 | mean_loss += loss_per_row[i]; 48 | } 49 | mean_loss /= nrow; 50 | output[0] = mean_loss; 51 | } 52 | } 53 | 54 | 55 | __global__ void array_set_kernel(float *array, float value, int n) { 56 | int index = blockIdx.x * blockDim.x + threadIdx.x; 57 | if (index < n) { 58 | array[index] = value; 59 | } 60 | } 61 | 62 | 63 | int DLGpuArraySet(DLArrayHandle arr, float value) { /* TODO: Your code here */ 64 | int n = 1; 65 | for (int i = 0; i < arr->ndim; i++) { 66 | n = n * arr->shape[i]; 67 | } 68 | 69 | float *array_data = (float *) arr->data; 70 | 71 | int threads_per_block = 1024; 72 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 73 | 74 | array_set_kernel << < num_blocks, threads_per_block >> > (array_data, value, n); 75 | return 0; 76 | } 77 | 78 | 79 | __global__ void broadcast_to_kernel(const float *input_data, 80 | float *output_data, 81 | index_t input_n, 82 | index_t output_n) { 83 | index_t idx = blockDim.x * blockIdx.x + threadIdx.x; 84 | if (idx < output_n) { 85 | output_data[idx] = input_data[idx % input_n]; 86 | } 87 | } 88 | 89 | 90 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output) { 91 | /* TODO: Your code here */ 92 | index_t input_n = 1; 93 | for (int i = 0; i < input->ndim; i++) 94 | input_n *= input->shape[i]; 95 | 96 | index_t output_n = 1; 97 | for (int i = 0; i < output->ndim; i++) 98 | output_n *= output->shape[i]; 99 | 100 | const float *input_data = (const float *) input->data; 101 | float *output_data = (float *) output->data; 102 | 103 | int thread_per_block = 512; 104 | int n_blocks = (output_n + thread_per_block - 1) / thread_per_block; 105 | broadcast_to_kernel << < n_blocks, thread_per_block >> > (input_data, output_data, 106 | input_n, output_n); 107 | return 0; 108 | } 109 | 110 | __global__ void reduced_sum_axis_zero(const float *input_data, float *output_data, int input_n, int output_n) { 111 | int idx = blockDim.x * blockIdx.x + threadIdx.x; 112 | if (idx < output_n) { 113 | output_data[idx] = 0.0; 114 | for (int i = 0; i < input_n / output_n; i++) { 115 | output_data[idx] += input_data[i * output_n + idx]; 116 | } 117 | } 118 | } 119 | 120 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output) { 121 | /* TODO: Your code here */ 122 | int input_n = 1; 123 | for (int i = 0; i < input->ndim; i++) { 124 | input_n *= input->shape[i]; 125 | } 126 | 127 | int output_n = 1; 128 | for (int i = 0; i < output->ndim; i++) { 129 | output_n *= output->shape[i]; 130 | } 131 | 132 | const float *input_data = (const float *) input->data; 133 | float *output_data = (float *) output->data; 134 | 135 | int thread_per_block = 1024; 136 | int n_blocks = (output_n + thread_per_block - 1) / thread_per_block; 137 | 138 | reduced_sum_axis_zero << < n_blocks, thread_per_block >> > (input_data, output_data, input_n, output_n); 139 | return 0; 140 | } 141 | 142 | __global__ void matrix_elementwise_add(const float *a, const float *b, float *c, 143 | int n) { 144 | int index = blockIdx.x * blockDim.x + threadIdx.x; 145 | if (index < n) { 146 | c[index] = a[index] + b[index]; 147 | } 148 | } 149 | 150 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 151 | const DLArrayHandle matB, DLArrayHandle output) { 152 | /* TODO: Your code here */ 153 | int n = 1; 154 | for (int i = 0; i < output->ndim; i++) { 155 | n = n * output->shape[i]; 156 | } 157 | const float *data_A = (const float *) matA->data; 158 | const float *data_B = 
(const float *) matB->data; 159 | float *data_output = (float *) output->data; 160 | 161 | int threads_per_block = 1024; 162 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 163 | 164 | matrix_elementwise_add << < num_blocks, threads_per_block >> > (data_A, data_B, 165 | data_output, n); 166 | return 0; 167 | } 168 | 169 | __global__ 170 | void matrix_elementwise_subtract(const float *a, const float *b, float *c, 171 | int n) { 172 | int index = blockIdx.x * blockDim.x + threadIdx.x; 173 | if (index < n) { 174 | c[index] = a[index] - b[index]; 175 | } 176 | } 177 | 178 | int DLGpuMatrixElementwiseSubtract(const DLArrayHandle matA, 179 | const DLArrayHandle matB, DLArrayHandle output) { 180 | /* TODO: Your code here */ 181 | int n = 1; 182 | for (int i = 0; i < output->ndim; i++) { 183 | n = n * output->shape[i]; 184 | } 185 | const float *data_A = (const float *) matA->data; 186 | const float *data_B = (const float *) matB->data; 187 | float *data_output = (float *) output->data; 188 | 189 | int threads_per_block = 1024; 190 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 191 | 192 | matrix_elementwise_subtract << < num_blocks, threads_per_block >> > (data_A, data_B, 193 | data_output, n); 194 | return 0; 195 | } 196 | 197 | __global__ 198 | void matrix_elementwise_division(const float *a, const float *b, float *result, int n) { 199 | int index = blockIdx.x * blockDim.x + threadIdx.x; 200 | if (index < n) { 201 | result[index] = a[index] / b[index]; 202 | } 203 | } 204 | 205 | int DLGpuMatrixElementwiseDiv(const DLArrayHandle matA, const DLArrayHandle matB, 206 | DLArrayHandle output) { 207 | int n = 1; 208 | for (int i = 0; i < output->ndim; i++) { 209 | n = n * output->shape[i]; 210 | } 211 | const float *data_A = (const float *) matA->data; 212 | const float *data_B = (const float *) matB->data; 213 | float *data_output = (float *) output->data; 214 | 215 | int threads_per_block = 1024; 216 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 217 | 218 | matrix_elementwise_division << < num_blocks, threads_per_block >> > (data_A, data_B, 219 | data_output, n); 220 | return 0; 221 | 222 | } 223 | 224 | __global__ void matrix_elementwise_add_by_const_kernal(const float *d_in, 225 | float *d_out, float val, int n) { 226 | int index = blockIdx.x * blockDim.x + threadIdx.x; 227 | if (index < n) { 228 | d_out[index] = d_in[index] + val; 229 | } 230 | } 231 | 232 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 233 | DLArrayHandle output) { 234 | /* TODO: Your code here */ 235 | int n = 1; 236 | for (int i = 0; i < output->ndim; i++) { 237 | n = n * output->shape[i]; 238 | } 239 | const float *input_data = (const float *) input->data; 240 | float *output_data = (float *) output->data; 241 | int threads_per_block = 1024; 242 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 243 | matrix_elementwise_add_by_const_kernal << < num_blocks, threads_per_block >> > ( 244 | input_data, output_data, val, n); 245 | return 0; 246 | } 247 | 248 | __global__ 249 | void matrix_elementwise_subtract_by_const_kernal(const float *d_in, 250 | float *d_out, float val, int n) { 251 | int index = blockIdx.x * blockDim.x + threadIdx.x; 252 | if (index < n) { 253 | d_out[index] = d_in[index] - val; 254 | } 255 | } 256 | 257 | int DLGpuMatrixElementwiseSubtractByConst(const DLArrayHandle input, float val, 258 | DLArrayHandle output) { 259 | /* TODO: Your code here */ 260 | int n = 1; 261 | for (int i = 0; i < 
output->ndim; i++) { 262 | n = n * output->shape[i]; 263 | } 264 | const float *input_data = (const float *) input->data; 265 | float *output_data = (float *) output->data; 266 | int threads_per_block = 1024; 267 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 268 | matrix_elementwise_subtract_by_const_kernal << < num_blocks, threads_per_block >> > ( 269 | input_data, output_data, val, n); 270 | return 0; 271 | } 272 | 273 | 274 | __global__ void matrix_elementwise_div_by_const_kernal(const float *d_in, 275 | float *d_out, float val, int n) { 276 | int index = blockIdx.x * blockDim.x + threadIdx.x; 277 | if (index < n) { 278 | d_out[index] = d_in[index] / val; 279 | } 280 | } 281 | 282 | int DLGpuMatrixElementwiseDivByConst(const DLArrayHandle input, float val, 283 | DLArrayHandle output) { 284 | /* TODO: Your code here */ 285 | int n = 1; 286 | for (int i = 0; i < output->ndim; i++) { 287 | n = n * output->shape[i]; 288 | } 289 | const float *input_data = (const float *) input->data; 290 | float *output_data = (float *) output->data; 291 | int threads_per_block = 1024; 292 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 293 | matrix_elementwise_div_by_const_kernal << < num_blocks, threads_per_block >> > ( 294 | input_data, output_data, val, n); 295 | return 0; 296 | } 297 | 298 | 299 | __global__ void elementwise_mul_kernel(const float *data_a, const float *data_b, 300 | float *output, int n) { 301 | 302 | int index = blockDim.x * blockIdx.x + threadIdx.x; 303 | if (index < n) { 304 | output[index] = data_a[index] * data_b[index]; 305 | } 306 | } 307 | 308 | int DLGpuMatrixElementwiseMultiply(const DLArrayHandle matA, 309 | const DLArrayHandle matB, DLArrayHandle output) { 310 | /* TODO: Your code here */ 311 | int n = 1; 312 | for (int i = 0; i < output->ndim; i++) { 313 | n = n * output->shape[i]; 314 | } 315 | 316 | int threads_per_block = 1024; 317 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 318 | 319 | const float *mat_a_data = (const float *) matA->data; 320 | const float *mat_b_data = (const float *) matB->data; 321 | float *output_data = (float *) output->data; 322 | 323 | elementwise_mul_kernel << < num_blocks, threads_per_block >> > (mat_a_data, 324 | mat_b_data, output_data, n); 325 | 326 | return 0; 327 | } 328 | 329 | __global__ 330 | void matrix_elementwise_sqrt(const float *d_input, float *d_output, int n) { 331 | int index = blockDim.x * blockIdx.x + threadIdx.x; 332 | if (index < n) { 333 | d_output[index] = sqrt(d_input[index]); 334 | } 335 | } 336 | 337 | int DLGpuMatrixElementwiseSqrt(const DLArrayHandle input, DLArrayHandle output) { 338 | /* TODO: Your code here */ 339 | int n = 1; 340 | for (int i = 0; i < input->ndim; i++) { 341 | n *= input->shape[i]; 342 | } 343 | 344 | const float *input_data = (const float *) input->data; 345 | float *output_data = (float *) output->data; 346 | int threads_per_block = 1024; 347 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 348 | matrix_elementwise_sqrt << < num_blocks, threads_per_block >> > (input_data, output_data, n); 349 | return 0; 350 | } 351 | 352 | 353 | __global__ void marix_multiply_by_const(const float *d_input, float *d_output, 354 | float val, int n) { 355 | int index = blockDim.x * blockIdx.x + threadIdx.x; 356 | if (index < n) { 357 | d_output[index] = d_input[index] * val; 358 | } 359 | } 360 | 361 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 362 | DLArrayHandle output) { 363 | /* TODO: Your code here 
*/ 364 | int n = 1; 365 | for (int i = 0; i < input->ndim; i++) { 366 | n *= input->shape[i]; 367 | } 368 | 369 | const float *input_data = (const float *) input->data; 370 | float *output_data = (float *) output->data; 371 | int threads_per_block = 1024; 372 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 373 | marix_multiply_by_const << < num_blocks, threads_per_block >> > (input_data, 374 | output_data, val, n); 375 | return 0; 376 | } 377 | 378 | // int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 379 | // const DLArrayHandle matB, bool transposeB, DLArrayHandle matC) { 380 | // /* TODO: Your code here */ 381 | // // Hint: use cublas 382 | // // cublas assume matrix is column major 383 | // cublasHandle_t handle; 384 | // cublasStatus_t stat = cublasCreate(&handle); 385 | // if (stat != CUBLAS_STATUS_SUCCESS) 386 | // printf("CUBLAS initialization failed\n"); 387 | 388 | // const float *matA_data = (const float *) matA->data; 389 | // const float *matB_data = (const float *) matB->data; 390 | // float *matC_data = (float *) matC->data; 391 | 392 | // cublasOperation_t transa = transposeA ? CUBLAS_OP_T : CUBLAS_OP_N; 393 | // cublasOperation_t transb = transposeB ? CUBLAS_OP_T : CUBLAS_OP_N; 394 | 395 | // int m = transposeB ? matB->shape[0] : matB->shape[1]; 396 | // int n = transposeA ? matA->shape[1] : matA->shape[0]; 397 | // int k = transposeA ? matA->shape[0] : matA->shape[1]; 398 | 399 | // float alpha = 1.0f; 400 | // float beta = 0.0f; 401 | // stat = cublasSgemm(handle, transb, transa, 402 | // m, n, k, 403 | // &alpha, matB_data, matB->shape[1], 404 | // matA_data, matA->shape[1], 405 | // &beta, matC_data, m); 406 | 407 | // if (stat != CUBLAS_STATUS_SUCCESS) 408 | // printf("CUBLAS kernel execution error.\n"); 409 | 410 | // stat = cublasDestroy(handle); 411 | // if (stat != CUBLAS_STATUS_SUCCESS) 412 | // printf("CUBLAS shutdown error\n"); 413 | 414 | // return 0; 415 | // } 416 | cublasHandle_t cublas_handle = NULL; 417 | 418 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 419 | const DLArrayHandle matB, bool transposeB, 420 | DLArrayHandle matC) { 421 | /* TODO: Your code here */ 422 | // Hint: use cublas 423 | // cublas assume matrix is column major 424 | // op(A) * op(B) = C 425 | // op(B)T * op(A)T = CT 426 | 427 | if (!cublas_handle) { 428 | cublasCreate(&cublas_handle); 429 | } 430 | 431 | float one = 1.0f; 432 | float zero = 0.0f; 433 | int m = matC->shape[1]; 434 | int n = matC->shape[0]; 435 | int k = transposeA ? matA->shape[0] : matA->shape[1]; 436 | 437 | cublasSgemm(cublas_handle, 438 | transposeB ? CUBLAS_OP_T : CUBLAS_OP_N, 439 | transposeA ? CUBLAS_OP_T : CUBLAS_OP_N, 440 | m, n, k, 441 | &one, 442 | (const float *) matB->data, !transposeB ? m : k, 443 | (const float *) matA->data, !transposeA ? 
k : n, 444 | &zero, 445 | (float *) matC->data, m 446 | ); 447 | return 0; 448 | } 449 | 450 | __global__ void relu_kernel(const float *input, float *output, int n) { 451 | int index = blockDim.x * blockIdx.x + threadIdx.x; 452 | if (index < n) { 453 | float element = input[index]; 454 | if (element <= 0) { 455 | output[index] = 0; 456 | } else { 457 | output[index] = element; 458 | } 459 | } 460 | } 461 | 462 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output) { 463 | /* TODO: Your code here */ 464 | int n = 1; 465 | for (int i = 0; i < input->ndim; i++) { 466 | n *= input->shape[i]; 467 | } 468 | 469 | const float *input_data = (const float *) input->data; 470 | float *output_data = (float *) output->data; 471 | int threads_per_block = 1024; 472 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 473 | relu_kernel << < num_blocks, threads_per_block >> > (input_data, output_data, n); 474 | return 0; 475 | } 476 | 477 | __global__ void relu_gradient_kernel(const float *input, float *output, 478 | const float *in_grad, int n) { 479 | int index = blockDim.x * blockIdx.x + threadIdx.x; 480 | if (index < n) { 481 | float element = input[index]; 482 | if (element <= 0) { 483 | output[index] = 0; 484 | } else { 485 | output[index] = in_grad[index]; 486 | } 487 | } 488 | } 489 | 490 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 491 | DLArrayHandle output) { 492 | /* TODO: Your code here */ 493 | int n = 1; 494 | for (int i = 0; i < input->ndim; i++) { 495 | n *= input->shape[i]; 496 | } 497 | 498 | const float *input_data = (const float *) input->data; 499 | float *output_data = (float *) output->data; 500 | const float *in_grad_data = (const float *) in_grad->data; 501 | int threads_per_block = 1024; 502 | int num_blocks = (n + threads_per_block - 1) / threads_per_block; 503 | 504 | relu_gradient_kernel << < num_blocks, threads_per_block >> > (input_data, 505 | output_data, in_grad_data, n); 506 | return 0; 507 | } 508 | 509 | __global__ void softmax_kernel(int64_t nrow, int64_t ncol, 510 | const float *input_data, 511 | float *output_data) { 512 | 513 | // two dimensional thread blocks. 514 | int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; 515 | if (y >= nrow) { 516 | return; 517 | } 518 | // y_th row of input data 519 | input_data += y * ncol; 520 | output_data += y * ncol; 521 | // find max for a row. 522 | float maxval = *input_data; 523 | for (int x = 1; x < ncol; ++x) { 524 | maxval = max(maxval, input_data[x]); 525 | } 526 | // Deduct by max for a row, and raise to exp. 527 | // in case of too large of exp, and the result will not be affected 528 | float sum = 0; 529 | for (int x = 0; x < ncol; ++x) { 530 | sum += exp(input_data[x] - maxval); 531 | } 532 | // Compute per-row softmax. 
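    // softmax(x)_j = exp(x_j) / sum_k exp(x_k); shifting every exponent by the
    // row max leaves each ratio unchanged but keeps exp() from overflowing.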
533 | for (int x = 0; x < ncol; ++x) { 534 | output_data[x] = exp(input_data[x] - maxval) / sum; 535 | } 536 | } 537 | 538 | 539 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output) { 540 | /* TODO: Your code here */ 541 | assert(input->ndim == 2); 542 | assert(output->ndim == 2); 543 | int64_t nrow = input->shape[0]; 544 | int64_t ncol = input->shape[1]; 545 | float *input_data = (float *) input->data; 546 | float *output_data = (float *) output->data; 547 | dim3 threads; 548 | if (nrow < 1024) { 549 | threads.x = nrow; 550 | } else { 551 | threads.x = 1024; 552 | threads.y = (nrow + 1023) / 1024; 553 | } 554 | softmax_kernel << < 1, threads >> > (nrow, ncol, input_data, output_data); 555 | return 0; 556 | } 557 | 558 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 559 | const DLArrayHandle input_b, DLArrayHandle output) { 560 | assert(input_a->ndim == 2); 561 | assert(input_b->ndim == 2); 562 | assert(output->ndim == 1); 563 | assert( 564 | input_a->shape[0] == input_b->shape[0] 565 | && input_a->shape[1] == input_b->shape[1]); 566 | int nrow = input_a->shape[0]; 567 | // Maximum x- or y-dimension of a block = 1024 568 | // But we need 'nrow' shared memory, and max shared memory is 48KB. 569 | // Conservatively allow max 16KB shared memory. 570 | assert(nrow <= 1024 * 4); 571 | int ncol = input_a->shape[1]; 572 | const float *input_data_a = (const float *) input_a->data; 573 | const float *input_data_b = (const float *) input_b->data; 574 | float *output_data = (float *) output->data; 575 | dim3 threads; 576 | if (nrow <= 1024) { 577 | threads.x = nrow; 578 | } else { 579 | threads.x = 1024; 580 | threads.y = (nrow + 1023) / 1024; 581 | } 582 | // 1 block, each block with 'threads' number of threads with 'nrow' shared 583 | // memory size 584 | matrix_softmax_cross_entropy_kernel << < 1, threads, nrow * sizeof(float) >> > ( 585 | nrow, ncol, input_data_a, input_data_b, output_data); 586 | return 0; 587 | } 588 | -------------------------------------------------------------------------------- /cuda/src/runtime_base.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) 2017 by Contributors 3 | * \file runtime_base.h 4 | * \brief Base of all C APIs 5 | */ 6 | #ifndef DLSYS_RUNTIME_RUNTIME_BASE_H_ 7 | #define DLSYS_RUNTIME_RUNTIME_BASE_H_ 8 | 9 | #include "c_runtime_api.h" 10 | #include 11 | 12 | /*! \brief macro to guard beginning and end section of all functions */ 13 | #define API_BEGIN() try { 14 | /*! 15 | * \brief every function starts with API_BEGIN(), and finishes with API_END() 16 | * or API_END_HANDLE_ERROR 17 | */ 18 | #define API_END() \ 19 | } \ 20 | catch (std::runtime_error & _except_) { \ 21 | return DLSYSAPIHandleException(_except_); \ 22 | } \ 23 | return 0; 24 | 25 | /*! 26 | * \brief every function starts with API_BEGIN() and finishes with API_END() or 27 | * API_END_HANDLE_ERROR. The finally clause contains procedure to cleanup states 28 | * when an error happens. 29 | */ 30 | #define API_END_HANDLE_ERROR(Finalize) \ 31 | } \ 32 | catch (std::runtime_error & _except_) { \ 33 | Finalize; \ 34 | return DLSYSAPIHandleException(_except_); \ 35 | } \ 36 | return 0; 37 | 38 | /*! 
39 | * \brief handle exception throwed out 40 | * \param e the exception 41 | * \return the return value of API after exception is handled 42 | */ 43 | inline int DLSYSAPIHandleException(const std::runtime_error &e) { 44 | // TODO 45 | // TVMAPISetLastError(e.what()); 46 | return -1; 47 | } 48 | 49 | #endif // DLSYS_RUNTIME_RUNTIME_BASE_H_ 50 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/examples/__init__.py -------------------------------------------------------------------------------- /examples/mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import aurora as au 3 | import aurora.autodiff as ad 4 | import timeit 5 | import argparse 6 | 7 | 8 | def measure_accuracy(activation, data, use_gpu=False): 9 | X_val, y_val = data 10 | executor = ad.Executor([activation], use_gpu=use_gpu) 11 | prob_val, = executor.run(feed_shapes={X: X_val}) 12 | if use_gpu: 13 | prob_val = prob_val.asnumpy() 14 | 15 | correct = np.sum(np.equal(y_val, np.argmax(prob_val, axis=1))) 16 | percentage = (correct / (y_val.shape[0])) * 100.00 17 | return percentage 18 | 19 | 20 | def build_graph(X, y, input_size, hid_1_size, hid_2_size, output_size): 21 | # Parameter of the model 22 | rand = np.random.RandomState(seed=1024) 23 | W1 = ad.Parameter(name="W1", init=rand.normal(scale=0.1, size=(input_size, hid_1_size))) 24 | b1 = ad.Parameter(name="b1", init=rand.normal(scale=0.1, size=(hid_1_size))) 25 | 26 | W2 = ad.Parameter(name="W2", init=rand.normal(scale=0.1, size=(hid_1_size, hid_2_size))) 27 | b2 = ad.Parameter(name="b2", init=rand.normal(scale=0.1, size=(hid_2_size))) 28 | 29 | W3 = ad.Parameter(name="W3", init=rand.normal(scale=0.1, size=(hid_2_size, output_size))) 30 | b3 = ad.Parameter(name="b3", init=rand.normal(scale=0.1, size=(output_size))) 31 | 32 | # building the NN model 33 | z1 = ad.matmul(X, W1) 34 | hidden_1 = z1 + ad.broadcast_to(b1, z1) 35 | activation_1 = au.nn.relu(hidden_1) 36 | 37 | z2 = ad.matmul(activation_1, W2) 38 | hidden_2 = z2 + ad.broadcast_to(b2, z2) 39 | activation_2 = au.nn.relu(hidden_2) 40 | 41 | z3 = ad.matmul(activation_2, W3) 42 | hidden_3 = z3 + ad.broadcast_to(b3, z3) 43 | loss = au.nn.softmax_cross_entropy_with_logits(hidden_3, y) 44 | return loss, W1, b1, W2, b2, W3, b3, hidden_3 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument('-c', '--exe_context', 50 | help='Choose execution context: numpy, gpu', 51 | default='numpy') 52 | 53 | parser.add_argument('-i', '--num_iter', 54 | help='Choose number of iterations', 55 | default=500) 56 | 57 | args = parser.parse_args() 58 | 59 | use_gpu = False 60 | 61 | if args.exe_context == 'gpu': 62 | use_gpu = True 63 | n_iter = int(args.num_iter) 64 | 65 | start = timeit.default_timer() 66 | # Create an instance of MNIST dataset and 67 | # create a generator for reading training data 68 | data = au.datasets.MNIST(batch_size=128) 69 | batch_generator = data.train_batch_generator() 70 | 71 | input_size = data.num_features() # number of features 72 | hid_1_size = 256 # size of first hidden layer 73 | hid_2_size = 100 # size of the second hidden layer 74 | output_size = 10 # size of the output layer 75 | 76 | lr = 1e-3 # learning rate 77 | 78 | # X and y will be used to input data 79 | X = 
ad.Variable(name="X") 80 | y = ad.Variable(name='y') 81 | 82 | loss, W1, b1, W2, b2, W3, b3, logit = build_graph(X, y, input_size, hid_1_size, hid_2_size, output_size) 83 | # Using Adam optimizer 84 | # optimizer = au.optim.Adam(loss, params=[W1, b1, W2, b2, W3, b3], lr=lr) 85 | optimizer = au.optim.Adam(loss, params=[W1, b1, W2, b2, W3, b3], lr=lr, use_gpu=use_gpu) 86 | # Starts training 87 | for i in range(n_iter): 88 | # read next random batch from the training generator 89 | X_batch, y_batch = next(batch_generator) 90 | # run the optimizer and it will return the cost 91 | # after that iteration 92 | loss_now = optimizer.step(feed_dict={X: X_batch, y: y_batch}) 93 | if i <= 10 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 500 == 0): 94 | fmt_str = 'iter: {0:>5d} cost: {1:>8.5f}' 95 | print(fmt_str.format(i, loss_now[0])) 96 | 97 | # printing validation accuracy 98 | # TODO (upul) optimize hyper-parameters using validation dataset 99 | val_acc = measure_accuracy(logit, data.validation(), use_gpu=use_gpu) 100 | print('Validation accuracy: {:>.2f}'.format(val_acc)) 101 | 102 | # printing testing accuracy 103 | test_acc = measure_accuracy(logit, data.testing(), use_gpu=use_gpu) 104 | print('Testing accuracy: {:>.2f}'.format(test_acc)) 105 | 106 | end = timeit.default_timer() 107 | print('Time taken for training/testing: {0:.3f} seconds'.format(end - start)) 108 | -------------------------------------------------------------------------------- /examples/mnist_cnn.py: -------------------------------------------------------------------------------- 1 | """Trains a simple convnet on the MNIST dataset. 2 | ===================================================================== 3 | Numpy: Gets 99.00 % test accuracy after 3000 iterations with 4 | 64 batch size. 5 | 6 | Running Time: 1197.57 seconds on Intel(R) Core(TM) i7-7700K 7 | CPU @ 4.20GHz 8 Cores. 
8 |
9 |
10 | GPU: Coming soon
11 | """
12 |
13 | import argparse
14 | import timeit
15 |
16 | import aurora as au
17 | import aurora.autodiff as ad
18 | import numpy as np
19 |
20 |
21 | def build_network(image, y, batch_size=32):
22 |     rand = np.random.RandomState(seed=1024)
23 |
24 |     reshaped_images = ad.reshape(image, newshape=(batch_size, 1, 28, 28))
25 |
26 |     # weight in (number_kernels, color_depth, kernel_height, kernel_width)
27 |     W1 = ad.Parameter(name='W1', init=rand.normal(scale=0.1, size=(32, 1, 5, 5)))
28 |     b1 = ad.Parameter(name='b1', init=rand.normal(scale=0.1, size=32))
29 |     conv1 = au.nn.conv2d(input=reshaped_images, filter=W1, bias=b1)
30 |     activation1 = au.nn.relu(conv1)
31 |     # size of activation1: batch_size x 32 x 24 x 24
32 |
33 |     # weight in (number_kernels, number_kernels of previous layer, kernel_height, kernel_width)
34 |     W2 = ad.Parameter(name='W2', init=rand.normal(scale=0.1, size=(64, 32, 5, 5)))
35 |     b2 = ad.Parameter(name='b2', init=rand.normal(scale=0.1, size=64))
36 |     conv2 = au.nn.conv2d(input=activation1, filter=W2, bias=b2)
37 |     activation2 = au.nn.relu(conv2)
38 |     # size of activation2: batch_size x 64 x 20 x 20
39 |
40 |     pooling1 = au.nn.maxPool(activation2, filter=(2, 2), strides=(2, 2))
41 |     # size of pooling1: batch_size x 64 x 10 x 10 = batch_size x 6400
42 |
43 |     flatten = ad.reshape(pooling1, newshape=(batch_size, 6400))
44 |
45 |     W3 = ad.Parameter(name='W3', init=rand.normal(scale=0.1, size=(6400, 512)))
46 |     b3 = ad.Parameter(name='b3', init=rand.normal(scale=0.1, size=512))
47 |     Z3 = ad.matmul(flatten, W3)
48 |     Z3 = Z3 + ad.broadcast_to(b3, Z3)
49 |     activation3 = au.nn.relu(Z3)
50 |
51 |     W4 = ad.Parameter(name='W4', init=rand.normal(scale=0.1, size=(512, 10)))
52 |     b4 = ad.Parameter(name='b4', init=rand.normal(scale=0.1, size=10))
53 |     logits = ad.matmul(activation3, W4)
54 |     logits = logits + ad.broadcast_to(b4, logits)
55 |     loss = au.nn.softmax_cross_entropy_with_logits(logits, y)
56 |
57 |     return loss, W1, b1, W2, b2, W3, b3, W4, b4, logits
58 |
59 |
60 | def measure_accuracy(activation, data, batch_size=32, use_gpu=False):
61 |     X_val, y_val = data
62 |
63 |     executor = ad.Executor([activation], use_gpu=use_gpu)
64 |
65 |     max_val = len(X_val) - len(X_val) % batch_size
66 |     y_val = y_val[0:max_val]
67 |
68 |     prediction = np.zeros(max_val)
69 |     for i in range(0, max_val, batch_size):
70 |         start = i
71 |         end = i + batch_size
72 |
73 |         X_batch, y_batch = X_val[start:end], y_val[start:end]
74 |         prob_val, = executor.run(feed_shapes={images: X_batch})
75 |
76 |         if use_gpu:
77 |             prob_val = prob_val.asnumpy()
78 |         prediction[start:end] = np.argmax(prob_val, axis=1)
79 |
80 |     correct = np.sum(np.equal(y_val, prediction))
81 |     percentage = (correct / len(prediction)) * 100.00
82 |     return percentage
83 |
84 |
85 | if __name__ == '__main__':
86 |     parser = argparse.ArgumentParser()
87 |     parser.add_argument('-c', '--exe_context',
88 |                         help='Choose execution context: numpy, gpu',
89 |                         default='numpy')
90 |
91 |     parser.add_argument('-i', '--num_iter',
92 |                         help='Choose number of iterations',
93 |                         default=500)
94 |
95 |     args = parser.parse_args()
96 |
97 |     use_gpu = False
98 |     if args.exe_context == 'gpu':
99 |         use_gpu = True
100 |
101 |     n_iter = int(args.num_iter)
102 |
103 |     start = timeit.default_timer()
104 |
105 |     data = au.datasets.MNIST(batch_size=128)
106 |     batch_generator = data.train_batch_generator()
107 |
108 |     # images in (batch_size, color_depth, height, width)
109 |     images = ad.Variable(name='images')
110 |     labels = ad.Variable(name='y')
111
| 112 | loss, W1, b1, W2, b2, W3, b3, W4, b4, logits = build_network(images, labels, batch_size=128) 113 | opt_params = [W1, b1, W2, b2, W3, b3, W4, b4] 114 | optimizer = au.optim.Adam(loss, params=opt_params, lr=1e-3, use_gpu=use_gpu) 115 | 116 | cumulative_loss = [] 117 | for i in range(n_iter): 118 | X_batch, y_batch = next(batch_generator) 119 | loss_now = optimizer.step(feed_dict={images: X_batch, labels: y_batch}) 120 | cumulative_loss.append(loss_now[0]) 121 | if i <= 10 or (i <= 100 and i % 10 == 0) or (i <= 1000 and i % 100 == 0) or (i <= 10000 and i % 500 == 0): 122 | fmt_str = 'iter: {0:>5d} avg. cost: {1:>8.5f}' 123 | print(fmt_str.format(i, sum(cumulative_loss)/len(cumulative_loss))) 124 | cumulative_loss.clear() 125 | 126 | # printing validation accuracy 127 | val_acc = measure_accuracy(logits, data.validation(), batch_size=128, use_gpu=use_gpu) 128 | print('Validation accuracy: {:>.2f}'.format(val_acc)) 129 | 130 | # printing testing accuracy 131 | test_acc = measure_accuracy(logits, data.testing(), batch_size=128, use_gpu=use_gpu) 132 | print('Testing accuracy: {:>.2f}'.format(test_acc)) 133 | 134 | end = timeit.default_timer() 135 | print('Time taken for training/testing: {0:.3f} seconds'.format(end - start)) 136 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.13.3 2 | matplotlib==2.1.0 3 | seaborn==0.8 4 | cytoolz==0.8.2 5 | 6 | -------------------------------------------------------------------------------- /resources/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/resources/logo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from distutils.core import setup 3 | from distutils.extension import Extension 4 | from Cython.Build import cythonize 5 | import numpy 6 | 7 | extensions = [ 8 | Extension('aurora.nn.pyx.im2col', ['aurora/nn/pyx/im2col.pyx'], 9 | include_dirs=[numpy.get_include()] 10 | ), 11 | Extension('aurora.nn.pyx.fast_pooling', ['aurora/nn/pyx/fast_pooling.pyx'], 12 | include_dirs=[numpy.get_include()] 13 | ), 14 | ] 15 | 16 | setup( 17 | name='aurora', 18 | version='0.01', 19 | description='Minimal Deep Learning library is written in Python/Numpy and a bit of C++', 20 | url='https://github.com/upul/Aurora', 21 | author='Upul Bandara', 22 | author_email='upulbandara@gmail.com', 23 | license='MIT', 24 | ext_modules=cythonize(extensions), 25 | packages=find_packages(exclude=['Aurora.tests']) 26 | 27 | ) 28 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/__init__.py -------------------------------------------------------------------------------- /tests/nn_primitives/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/nn_primitives/__init__.py -------------------------------------------------------------------------------- /tests/nn_primitives/test_cython.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.testing as npt 3 | from aurora.nn.pyx.fast_pooling import max_pool_forward 4 | from aurora.nn.pyx.fast_pooling import max_pool_backward 5 | from aurora.nn.pyx.im2col import im2col 6 | from aurora.nn.pyx.im2col import col2im 7 | from tests.utils.gradient_check import gradient_check_numpy_expr 8 | 9 | 10 | # Testing Max Pooling Layers 11 | 12 | def test_max_pooling_forward(): 13 | data = np.array([[[[0.12, -1.23, 0.01, 2.45], 14 | [5.00, -10.01, 1.09, 4.66], 15 | [4.56, 6.78, 3.45, 3.33], 16 | [0.01, 1.00, 3.56, 3.39]]]]) 17 | 18 | # Test Case: 1 19 | # filter = (2, 2) stride = (2, 2) 20 | result = max_pool_forward(data, 2, 2, 2, 2) 21 | expected = np.array([[[[5.00, 4.66], 22 | [6.78, 3.56]]]]) 23 | assert result.shape == expected.shape 24 | npt.assert_array_almost_equal(result, expected) 25 | 26 | # Test Case: 2 27 | # filter = (2, 2) stride = (1, 1) 28 | result = max_pool_forward(data, 2, 2, 1, 1) 29 | expected = np.array([[ 30 | [[5.00, 1.09, 4.66], 31 | [6.78, 6.78, 4.66], 32 | [6.78, 6.78, 3.56]]]]) 33 | assert result.shape == expected.shape 34 | npt.assert_array_almost_equal(expected, result) 35 | 36 | # Test Case: 3 37 | # filter = (2, 2), stride = (2, 2) 38 | shape = (2, 3, 4, 4) 39 | data = np.linspace(-0.3, 0.4, num=np.prod(shape)).reshape(shape) 40 | result = max_pool_forward(data, 2, 2, 2, 2) 41 | expected = np.array([[[[-0.26315789, -0.24842105], 42 | [-0.20421053, -0.18947368]], 43 | [[-0.14526316, -0.13052632], 44 | [-0.08631579, -0.07157895]], 45 | [[-0.02736842, -0.01263158], 46 | [0.03157895, 0.04631579]]], 47 | [[[0.09052632, 0.10526316], 48 | [0.14947368, 0.16421053]], 49 | [[0.20842105, 0.22315789], 50 | [0.26736842, 0.28210526]], 51 | [[0.32631579, 0.34105263], 52 | [0.38526316, 0.4]]]]) 53 | npt.assert_array_almost_equal(result, expected) 54 | 55 | 56 | def test_max_pooling_backward(): 57 | data = np.array([[[[0.12, -1.23, 0.01, 2.45], 58 | [5.00, -10.01, 1.09, 4.66], 59 | [4.56, 6.78, 3.45, 3.33], 60 | [0.01, 1.00, 3.56, 3.39]]]]) 61 | output_grad = np.array([[[[1.0, 1.0], 62 | [1.0, 1.0]]]]) 63 | # Test Case: 1 64 | # filter = (2, 2) stride = (2, 2) 65 | expected = np.array([[[[0.0, 0.0, 0.0, 0.0], 66 | [1.0, 0.0, 0.0, 1.0], 67 | [0.0, 1.0, 0.0, 0.0], 68 | [0.0, 0.0, 1.0, 0.0]]]]) 69 | result = max_pool_backward(output_grad, data, 70 | filter_height=2, filter_width=2, 71 | stride_height=2, stride_width=2) 72 | npt.assert_array_almost_equal(result, expected) 73 | 74 | # calculate numerical gradient 75 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 76 | npt.assert_array_almost_equal(numerical, expected, decimal=3) 77 | 78 | # Test Case: 2 79 | # filter = (2, 2) stride = (2, 2) 80 | # different output_grad 81 | output_grad = np.array([[[[0.0, 5.10], 82 | [0.12, 0.20]]]]) 83 | expected = np.array([[[[0.0, 0.0, 0.0, 0.0], 84 | [0.0, 0.0, 0.0, 5.10], 85 | [0.0, 0.12, 0.0, 0.0], 86 | [0.0, 0.0, 0.20, 0.0]]]]) 87 | result = max_pool_backward(output_grad, data, 88 | filter_height=2, filter_width=2, 89 | stride_height=2, stride_width=2) 90 | npt.assert_array_almost_equal(result, expected) 91 | 92 | # calculate numerical gradient 93 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 94 | npt.assert_array_almost_equal(numerical, expected, decimal=2) 95 | 96 | # Test Case: 3 97 | # filter = (2, 2) stride = (1, 1) 98 | output_grad = np.array([[[[1.0, 1.0, 
1.0], 99 | [1.0, 1.0, 1.0], 100 | [1.0, 1.0, 1.0]]]]) 101 | result = max_pool_backward(output_grad, data, 102 | filter_height=2, filter_width=2, 103 | stride_height=1, stride_width=1) 104 | numerical = gradient_check_numpy_expr(lambda x: max_pool_forward(x, 2, 2, 1, 1), data, output_grad) 105 | npt.assert_array_almost_equal(numerical, result, decimal=2) 106 | 107 | # Test Case: 4 108 | # filter = (2, 2) stride = (2, 2) 109 | # input shape = (2, 2, 6, 6) 110 | data = np.random.normal(scale=0.01, size=(2, 2, 6, 6)) 111 | output_grad = np.ones((2, 2, 3, 3)) 112 | result = max_pool_backward(output_grad, data, 113 | filter_height=2, filter_width=2, 114 | stride_height=2, stride_width=2) 115 | numerical = gradient_check_numpy_expr(lambda d: max_pool_forward(d, 2, 2, 2, 2), data, output_grad) 116 | npt.assert_array_almost_equal(numerical, result, decimal=4) 117 | 118 | 119 | # Testing Image to Column operations 120 | def test_im2col(): 121 | data = np.arange(16).reshape((1, 1, 4, 4)).astype(np.float64) 122 | # one image in the batch 2 by 2 kernel with stride = 1 123 | result = im2col(data, filter_height=2, filter_width=2, 124 | padding_height=0, padding_width=0, 125 | stride_height=1, stride_width=1) 126 | 127 | expected = np.array([[0, 1, 2, 4, 5, 6, 8, 9, 10], 128 | [1, 2, 3, 5, 6, 7, 9, 10, 11], 129 | [4, 5, 6, 8, 9, 10, 12, 13, 14], 130 | [5, 6, 7, 9, 10, 11, 13, 14, 15]]).astype(np.float64) 131 | npt.assert_array_almost_equal(result, expected) 132 | 133 | # one image in the batch 2 by 2 kernel with stride = 2 134 | result = im2col(data, filter_height=2, filter_width=2, 135 | padding_height=0, padding_width=0, 136 | stride_height=2, stride_width=2) 137 | expected = np.array([[0, 2, 8, 10], 138 | [1, 3, 9, 11], 139 | [4, 6, 12, 14], 140 | [5, 7, 13, 15]]).astype(np.float64) 141 | npt.assert_array_almost_equal(result, expected) 142 | 143 | # one image in the batche 2 by 2 kernel with stride = 1 and padding = 1 144 | data = np.arange(9).reshape(1, 1, 3, 3).astype(np.float64) 145 | result = im2col(data, filter_height=2, filter_width=2, 146 | padding_height=1, padding_width=1, 147 | stride_height=1, stride_width=1) 148 | expected = np.array([[0, 0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8], 149 | [0, 0, 0, 0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0], 150 | [0, 0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 0, 0, 0], 151 | [0, 1, 2, 0, 3, 4, 5, 0, 6, 7, 8, 0, 0, 0, 0, 0]]).astype(np.float64) 152 | npt.assert_array_almost_equal(result, expected) 153 | 154 | # more than one color channels 155 | # kernel 2 by 2 stride = 1 156 | data = np.arange(18).reshape(1, 2, 3, 3).astype(np.float64) 157 | result = im2col(data, filter_height=2, filter_width=2, 158 | padding_height=0, padding_width=0, 159 | stride_height=1, stride_width=1) 160 | expected = np.array([[0, 1, 3, 4], 161 | [1, 2, 4, 5], 162 | [3, 4, 6, 7], 163 | [4, 5, 7, 8], 164 | [9, 10, 12, 13], 165 | [10, 11, 13, 14], 166 | [12, 13, 15, 16], 167 | [13, 14, 16, 17]]) 168 | npt.assert_array_almost_equal(result, expected) 169 | 170 | # more than one batch and color chennel 171 | # kernel 2 by 2 with stride of 1 172 | data = np.arange(36).reshape(2, 2, 3, 3).astype(np.float64) 173 | result = im2col(data, filter_height=2, filter_width=2, 174 | padding_height=0, padding_width=0, 175 | stride_height=1, stride_width=1) 176 | expected = np.array([[0, 18, 1, 19, 3, 21, 4, 22], 177 | [1, 19, 2, 20, 4, 22, 5, 23], 178 | [3, 21, 4, 22, 6, 24, 7, 25], 179 | [4, 22, 5, 23, 7, 25, 8, 26], 180 | [9, 27, 10, 28, 12, 30, 13, 31], 181 | [10, 28, 11, 29, 13, 31, 14, 32], 182 | [12, 30, 
13, 31, 15, 33, 16, 34], 183 | [13, 31, 14, 32, 16, 34, 17, 35]]).astype(np.float64) 184 | print(np.array(result)) 185 | npt.assert_array_almost_equal(result, expected) 186 | 187 | # TODO: (upul) test several kernel sizes and different stride, kernel size and padding 188 | # : in different directions 189 | 190 | 191 | def test_col2im(): 192 | # batch size 1, color channels 1, 3 by 3 image. Stride 1, filter 2 by 2 and no padding 193 | data = np.arange(9).reshape((1, 1, 3, 3)).astype(np.float64) 194 | i2c_result = im2col(data, filter_height=2, filter_width=2, 195 | padding_height=0, padding_width=0, 196 | stride_height=1, stride_width=1) 197 | result = col2im(i2c_result, 1, 1, 3, 3, 198 | 2, 2, 199 | 0, 0, 200 | 1, 1) 201 | expected = np.array([[[[0., 2., 2.], 202 | [6., 16., 10.], 203 | [6., 14., 8.]]]]).astype(np.float64) 204 | npt.assert_array_almost_equal(result, expected) 205 | 206 | # batch size 1, color channels 1, 4 by 4 image. Stride 2, filter 2 by 2 and no padding 207 | data = np.arange(16).reshape((1, 1, 4, 4)).astype(np.float64) 208 | i2c_result = im2col(data, filter_height=2, filter_width=2, 209 | padding_height=0, padding_width=0, 210 | stride_height=2, stride_width=2) 211 | result = col2im(i2c_result, 212 | 1, 1, # batch size and color channels 213 | 4, 4, # img width and height 214 | 2, 2, # kernel 215 | 0, 0, # padding 216 | 2, 2) # stride 217 | -------------------------------------------------------------------------------- /tests/test_autodiff_cpu.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | 6 | 7 | def test_dummy(): 8 | assert 1 == 1 9 | 10 | 11 | def test_identity(): 12 | x2 = ad.Variable(name="x2") 13 | y = x2 14 | 15 | grad_x2, = ad.gradients(y, [x2]) 16 | 17 | executor = ad.Executor([y, grad_x2]) 18 | x2_val = 2 * np.ones(3) 19 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 20 | 21 | assert isinstance(y, ad.Node) 22 | assert np.array_equal(y_val, x2_val) 23 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 24 | 25 | 26 | def test_add_by_const(): 27 | x2 = ad.Variable(name="x2") 28 | y = 5 + x2 29 | 30 | grad_x2, = ad.gradients(y, [x2]) 31 | 32 | executor = ad.Executor([y, grad_x2]) 33 | x2_val = 2 * np.ones(3) 34 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 35 | 36 | assert isinstance(y, ad.Node) 37 | assert np.array_equal(y_val, x2_val + 5) 38 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 39 | 40 | 41 | def test_mul_by_const(): 42 | x2 = ad.Variable(name='x2') 43 | y = 3 * x2 44 | 45 | grad_x2, = ad.gradients(y, [x2]) 46 | executor = ad.Executor([y, grad_x2]) 47 | x2_val = 2 * np.ones(3) 48 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 49 | 50 | # asserts 51 | assert isinstance(y, ad.Node) 52 | assert np.array_equal(y_val, 3 * x2_val) 53 | assert np.array_equal(grad_x2_val, 3 * np.ones_like(x2_val)) 54 | 55 | 56 | def test_mul_two_var(): 57 | x2 = ad.Variable(name='x2') 58 | x3 = ad.Variable(name='x3') 59 | y = x2 * x3 60 | 61 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 62 | executor = ad.Executor([y, grad_x2, grad_x3]) 63 | x2_val = 2 * np.ones(3) 64 | x3_val = 3 * np.ones(3) 65 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 66 | 67 | # asserts 68 | assert isinstance(y, ad.Node) 69 | assert np.array_equal(y_val, 6 * np.ones(3)) 70 | assert np.array_equal(grad_x2_val, x3_val) 71 | assert 
np.array_equal(grad_x3_val, x2_val) 72 | 73 | 74 | def test_sub_two_vars(): 75 | x2 = ad.Variable(name='x2') 76 | x3 = ad.Variable(name='x3') 77 | y = x2 - x3 78 | 79 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 80 | executor = ad.Executor([y, grad_x2, grad_x3]) 81 | x2_val = 4 * np.ones(3) 82 | x3_val = 3 * np.ones(3) 83 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 84 | 85 | # asserts 86 | assert isinstance(y, ad.Node) 87 | assert np.array_equal(y_val, 1 * np.ones(3)) 88 | assert np.array_equal(grad_x2_val, np.ones(3)) 89 | assert np.array_equal(grad_x3_val, -1 * np.ones(3)) 90 | 91 | 92 | def test_sub_by_const(): 93 | x2 = ad.Variable(name='x2') 94 | y = x2 - 3 95 | 96 | grad_x2, = ad.gradients(y, [x2]) 97 | executor = ad.Executor([y, grad_x2]) 98 | x2_val = 2 * np.ones(3) 99 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 100 | 101 | # asserts 102 | assert isinstance(y, ad.Node) 103 | assert np.array_equal(y_val, -1 * np.ones(3)) 104 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 105 | 106 | 107 | def test_div_two_var(): 108 | x2 = ad.Variable(name='x2') 109 | x3 = ad.Variable(name='x3') 110 | y = x2 / x3 111 | 112 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 113 | executor = ad.Executor([y, grad_x2, grad_x3]) 114 | x2_val = 4 * np.ones(3) 115 | x3_val = 2 * np.ones(3) 116 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 117 | 118 | # asserts 119 | assert isinstance(y, ad.Node) 120 | assert np.array_equal(grad_x2_val, 1.0 / x3_val) 121 | assert np.array_equal(grad_x3_val, -1.0 * x2_val / (x3_val * x3_val)) 122 | 123 | 124 | def test_div_by_const(): 125 | x2 = ad.Variable(name='x2') 126 | y = x2 / 2.0 127 | 128 | grad_x2, = ad.gradients(y, [x2]) 129 | executor = ad.Executor([y, grad_x2]) 130 | x2_val = 2 * np.ones(3) 131 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 132 | 133 | # asserts 134 | assert isinstance(y, ad.Node) 135 | assert np.array_equal(y_val, x2_val / 2.0) 136 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val) / 2.0) 137 | 138 | 139 | def test_reduce_sum(): 140 | x2 = ad.Variable(name='x2') 141 | y = ad.reduce_sum(x2) 142 | 143 | grad_x2, = ad.gradients(y, [x2]) 144 | executor = ad.Executor([y, grad_x2]) 145 | x2_val = np.array([[1, 2, 3], [4, 5, 6]]) 146 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 147 | 148 | # asserts 149 | assert isinstance(y, ad.Node) 150 | assert np.array_equal(y_val, np.array([5, 7, 9])) 151 | assert np.array_equal(grad_x2_val, np.array([1, 1, 1])) 152 | 153 | 154 | def test_broadcast_to(): 155 | x2 = ad.Variable(name='x2') 156 | x3 = ad.Variable(name='x3') 157 | y = ad.broadcast_to(x2, x3) 158 | 159 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 160 | executor = ad.Executor([y, grad_x2, grad_x3]) 161 | x2_val = np.array([[1, 2, 3]]) 162 | x3_val = np.zeros((3, 3)) 163 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val}) 164 | 165 | # asserts 166 | assert isinstance(y, ad.Node) 167 | assert np.array_equal(y_val, np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])) 168 | assert np.array_equal(grad_x2_val, np.array([3, 3, 3])) 169 | 170 | 171 | def test_matmul_two_vars(): 172 | x2 = ad.Variable(name='x2') 173 | x3 = ad.Variable(name='x3') 174 | y = ad.matmul(x2, x3) 175 | 176 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 177 | executor = ad.Executor([y, grad_x2, grad_x3]) 178 | x2_val = np.array([[1, 2], [3, 4], [5, 6]]) # 3x2 179 | x3_val = np.array([[7, 8, 9], [10, 11, 12]]) 
# 2x3
180 | 
181 |     y_val, grad_x2_val, grad_x3_val = executor.run(feed_shapes={x2: x2_val, x3: x3_val})
182 | 
183 |     expected_yval = np.matmul(x2_val, x3_val)
184 |     expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(x3_val))
185 |     expected_grad_x3_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval))
186 | 
187 |     assert isinstance(y, ad.Node)
188 |     assert np.array_equal(y_val, expected_yval)
189 |     assert np.array_equal(grad_x2_val, expected_grad_x2_val)
190 |     assert np.array_equal(grad_x3_val, expected_grad_x3_val)
191 | 
192 | 
193 | def test_relu():
194 |     x2 = ad.Variable(name='x2')
195 |     y = au.nn.relu(x2)
196 | 
197 |     grad_x2, = ad.gradients(y, [x2])
198 |     executor = ad.Executor([y, grad_x2])
199 |     x2_val = np.array([[-1, 2, 3], [1, -2, 0]])
200 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
201 |     expected_y_val = np.array([[0, 2, 3], [1, 0, 0]])
202 |     expected_x2_grad = np.array([[0, 1, 1], [1, 0, 0]])
203 |     assert np.array_equal(y_val, expected_y_val)
204 |     assert np.array_equal(grad_x2_val, expected_x2_grad)
205 | 
206 | 
207 | def test_cross_entropy():
208 |     x2_pred = ad.Variable(name='x2_pred')
209 |     x2_actu = ad.Variable(name='x2_actu')
210 |     y = au.nn.softmax_cross_entropy_with_logits(x2_pred, x2_actu)
211 | 
212 |     x2_pred_grad, x2_actu_grad = ad.gradients(y, [x2_pred, x2_actu])
213 | 
214 |     x2_pred_val = np.array([[0.8, 0.01, 0.5], [0.8, 0.01, 0.5]])
215 |     x2_actu_val = np.array([[1.0, 1.0, 0], [1.0, 1.0, 0]])
216 | 
217 |     executor = ad.Executor([y, x2_pred_grad, x2_actu_grad])
218 |     y_val, x2_pred_grad_val, x2_actu_grad_val = executor.run(feed_shapes={x2_pred: x2_pred_val, x2_actu: x2_actu_val})
219 |     # Smoke test: only verifies that the forward pass and both gradients run without raising.
220 |     assert True
221 | 
222 | 
223 | def test_matmul_var_and_param():
224 |     x2 = ad.Variable(name="x2")
225 |     w2_val = np.array([[7, 8, 9], [10, 11, 12]])  # 2x3
226 |     w2 = ad.Parameter(name="w2", init=w2_val)
227 |     y = ad.matmul(x2, w2)
228 | 
229 |     grad_x2, grad_w2 = ad.gradients(y, [x2, w2])
230 | 
231 |     executor = ad.Executor([y, grad_x2, grad_w2])
232 |     x2_val = np.array([[1, 2], [3, 4], [5, 6]])  # 3x2
233 | 
234 |     y_val, grad_x2_val, grad_w2_val = executor.run(feed_shapes={x2: x2_val})
235 | 
236 |     expected_yval = np.matmul(x2_val, w2_val)
237 |     expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(w2_val))
238 |     expected_grad_w2_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval))
239 | 
240 |     assert isinstance(y, ad.Node)
241 |     # assert np.array_equal(y_val, expected_yval)
242 |     # assert np.array_equal(grad_x2_val, expected_grad_x2_val)
243 |     # assert np.array_equal(grad_w2_val, expected_grad_w2_val)
244 | 
245 | 
246 | def test_sigmoid_activation():
247 |     x2 = ad.Variable(name='x2')
248 |     y = au.nn.sigmoid(x2)
249 | 
250 |     x2_val = np.array([-100, 0, 100])
251 |     grad_x2, = ad.gradients(y, [x2])
252 |     executor = ad.Executor([y, grad_x2])
253 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
254 |     npt.assert_array_almost_equal(np.array([0.000, 0.500, 1.0]), y_val)
255 |     npt.assert_array_almost_equal(np.array([0, 0.25, 0]), grad_x2_val)
256 | 
257 |     # testing with extreme values for numerical stability.
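    # A naive sigmoid, 1 / (1 + np.exp(-x)), overflows in np.exp for large negative
    # inputs. A numerically stable formulation (presumably what au.nn.sigmoid does
    # internally) branches on the sign of x, e.g.
    #     np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
    # so that only non-positive values are ever exponentiated.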
258 |     x2_val = np.array([-9.9e10, 9.9e10]).astype(np.float32)
259 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
260 |     npt.assert_array_almost_equal(np.array([0.0, 1.0]), y_val)
261 |     npt.assert_array_almost_equal(np.array([0.0, 0.0]), grad_x2_val)
262 | 
263 | 
264 | def test_max_pooling():
265 |     x2 = ad.Variable(name='x2')
266 |     y = au.nn.maxPool(x2, filter=(2, 2), strides=(2, 2))
267 | 
268 |     grad_x2, = ad.gradients(y, [x2])
269 |     executor = ad.Executor([y, grad_x2])
270 |     x2_val = np.random.randn(1, 1, 4, 4)
271 | 
272 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
273 | 
274 |     numerical_grad_x2 = ad.eval_numerical_grad(y,
275 |                                                feed_dict={x2: x2_val},
276 |                                                wrt=x2,
277 |                                                h=1e-5)
278 |     assert isinstance(y, ad.Node)
279 |     # TODO: (upul) there appears to be a bug in my eval_numerical_grad implementation,
280 |     #       so the comparison below is only made to two decimal places.
281 |     npt.assert_array_almost_equal(grad_x2_val, numerical_grad_x2, decimal=2)
282 | 
283 | 
284 | def test_conv2d():
285 |     x2 = ad.Variable(name='x2')
286 |     w2 = ad.Variable(name='w2')
287 |     b2 = ad.Variable(name='b2')
288 | 
289 |     y = au.nn.conv2d(input=x2, filter=w2, bias=b2)
290 | 
291 |     grad_x2, grad_w2, grad_b2 = ad.gradients(y, [x2, w2, b2])
292 |     executor = ad.Executor([y, grad_x2, grad_w2, grad_b2])
293 |     x2_val = np.random.randn(1, 2, 4, 4)
294 |     w2_val = np.random.randn(2, 2, 3, 3)
295 |     b2_val = np.random.randn(2, )
296 | 
297 |     y_val, grad_x2_val, grad_w2_val, grad_b2_val = executor.run(feed_shapes={x2: x2_val,
298 |                                                                             w2: w2_val,
299 |                                                                             b2: b2_val})
300 | 
301 |     numerical_grad_w2 = ad.eval_numerical_grad(y,
302 |                                                feed_dict={x2: x2_val,
303 |                                                           w2: w2_val,
304 |                                                           b2: b2_val},
305 |                                                wrt=w2)
306 |     numerical_grad_x2 = ad.eval_numerical_grad(y,
307 |                                                feed_dict={x2: x2_val,
308 |                                                           w2: w2_val,
309 |                                                           b2: b2_val},
310 |                                                wrt=x2)
311 |     numerical_grad_b2 = ad.eval_numerical_grad(y,
312 |                                                feed_dict={x2: x2_val,
313 |                                                           w2: w2_val,
314 |                                                           b2: b2_val},
315 |                                                wrt=b2)
316 | 
317 |     assert isinstance(y, ad.Node)
318 |     npt.assert_array_almost_equal(numerical_grad_w2, grad_w2_val)
319 |     npt.assert_array_almost_equal(numerical_grad_x2, grad_x2_val)
320 |     npt.assert_array_almost_equal(numerical_grad_b2, grad_b2_val)
321 | 
322 |     x2 = ad.Variable(name='x2')
323 |     w2 = ad.Parameter(name='w2', init=w2_val)
324 |     b2 = ad.Parameter(name='b2', init=b2_val)
325 |     y = au.nn.conv2d(x2, w2, b2)
326 | 
327 |     grad_x2, grad_w2, grad_b2 = ad.gradients(y, [x2, w2, b2])
328 |     executor = ad.Executor([y, grad_x2, grad_w2, grad_b2])
329 |     y_val, grad_x2_val, grad_w2_val, grad_b2_val = executor.run(feed_shapes={x2: x2_val})
330 | 
331 |     assert isinstance(y, ad.Node)
332 |     npt.assert_array_almost_equal(numerical_grad_w2, grad_w2_val)
333 |     npt.assert_array_almost_equal(numerical_grad_b2, grad_b2_val)
334 |     npt.assert_array_almost_equal(numerical_grad_x2, grad_x2_val)
335 | 
336 | 
337 | def test_reshape():
338 |     x2 = ad.Variable(name='x2')
339 |     y = ad.reshape(x2, newshape=(1, 4))
340 | 
341 |     grad_x2, = ad.gradients(y, [x2])
342 |     executor = ad.Executor([y, grad_x2])
343 |     x2_val = np.random.randn(2, 2)
344 |     y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
345 | 
346 |     assert isinstance(y, ad.Node)
347 |     assert y_val.shape == (1, 4)
348 |     npt.assert_array_equal(grad_x2_val, np.ones((2, 2)))
349 | 
350 |     # x2 = ad.Variable(name='x2')
351 |     # y = ad.reshape(x2, newshape=(2, 1, 2, 3))
352 |     # grad_x2, = ad.gradients(y, [x2])
353 |     # executor = ad.Executor([y, grad_x2])
354 |     # x2_val = np.random.randn(2, 6)
355 |     # y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val})
356 | # 357 | # assert isinstance(y, ad.Node) 358 | # assert y_val.shape == (2, 1, 2, 3) 359 | # npt.assert_array_equal(grad_x2_val, np.ones((2, 1, 2, 3))) 360 | -------------------------------------------------------------------------------- /tests/test_autodiff_gpu.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | from aurora.ndarray import ndarray, gpu_op 6 | 7 | 8 | def test_identity(): 9 | x2 = ad.Variable(name='x2') 10 | y = x2 11 | 12 | grad_x2, = ad.gradients(y, [x2]) 13 | 14 | executor = ad.Executor([y, grad_x2], use_gpu=True) 15 | x2_val = 2 * np.ones(3) 16 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 17 | 18 | y_val_np = y_val.asnumpy() 19 | grad_x2_val_np = grad_x2_val.asnumpy() 20 | 21 | assert isinstance(y, ad.Node) 22 | assert np.array_equal(y_val_np, x2_val) 23 | assert np.array_equal(grad_x2_val_np, np.ones_like(x2_val)) 24 | 25 | 26 | def test_add_by_const(): 27 | x2 = ad.Variable(name="x2") 28 | y = 5 + x2 29 | 30 | grad_x2, = ad.gradients(y, [x2]) 31 | 32 | executor = ad.Executor([y, grad_x2], use_gpu=True) 33 | x2_val = 2 * np.ones(3) 34 | y_val, grad_x2_val = executor.run(feed_shapes={x2: x2_val}) 35 | 36 | y_val = y_val.asnumpy() 37 | grad_x2_val = grad_x2_val.asnumpy() 38 | 39 | assert isinstance(y, ad.Node) 40 | assert np.array_equal(y_val, x2_val + 5) 41 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 42 | 43 | 44 | def test_softmax(): 45 | shape = (2, 2) 46 | x_val = np.random.uniform(-5, 5, shape).astype(np.float32) 47 | 48 | x2 = ad.Variable(name='x2') 49 | prob = au.nn.softmax(x2) 50 | executor = ad.Executor([prob], use_gpu=True) 51 | y, = executor.run(feed_shapes={x2: x_val}) 52 | y = y.asnumpy() 53 | np.testing.assert_allclose(au.nn.softmax_func(x_val), y, rtol=1e-5) 54 | -------------------------------------------------------------------------------- /tests/test_gpu_operations.py: -------------------------------------------------------------------------------- 1 | import aurora as au 2 | import aurora.autodiff as ad 3 | import numpy as np 4 | import numpy.testing as npt 5 | from aurora.ndarray import ndarray, gpu_op 6 | 7 | 8 | def test_dummy(): 9 | assert 1 == 1 10 | 11 | 12 | def test_array_set(): 13 | ctx = ndarray.gpu(0) 14 | shape = (5000, 2000) 15 | # oneslike 16 | arr_x = ndarray.empty(shape, ctx=ctx) 17 | gpu_op.array_set(arr_x, 1.) 18 | x = arr_x.asnumpy() 19 | np.testing.assert_allclose(np.ones(shape), x) 20 | # zeroslike 21 | gpu_op.array_set(arr_x, 0.) 
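    # array_set fills the already-allocated device buffer in place; copying it
    # back to the host with asnumpy() lets us compare against a NumPy reference.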
22 | x = arr_x.asnumpy() 23 | np.testing.assert_allclose(np.zeros(shape), x) 24 | 25 | 26 | def test_broadcast_to(): 27 | ctx = ndarray.gpu(0) 28 | shape = (200, 300) 29 | to_shape = (130, 200, 300) 30 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 31 | arr_x = ndarray.array(x, ctx=ctx) 32 | arr_y = ndarray.empty(to_shape, ctx=ctx) 33 | gpu_op.broadcast_to(arr_x, arr_y) 34 | y = arr_y.asnumpy() 35 | np.testing.assert_allclose(np.broadcast_to(x, to_shape), y) 36 | 37 | 38 | def test_reduce_sum_axis_zero(): 39 | ctx = ndarray.gpu(0) 40 | shape = (500, 200, 100) 41 | to_shape = (200, 100) 42 | x = np.random.uniform(0, 20, shape).astype(np.float32) 43 | arr_x = ndarray.array(x, ctx=ctx) 44 | arr_y = ndarray.empty(to_shape, ctx=ctx) 45 | gpu_op.reduce_sum_axis_zero(arr_x, arr_y) 46 | y = arr_y.asnumpy() 47 | y_ = np.sum(x, axis=0) 48 | for index, _ in np.ndenumerate(y): 49 | v = y[index] 50 | v_ = y_[index] 51 | if abs((v - v_) / v_) > 1e-4: 52 | print(index, v, v_) 53 | np.testing.assert_allclose(np.sum(x, axis=0), y, rtol=1e-5) 54 | 55 | 56 | def test_matrix_elementwise_add(): 57 | ctx = ndarray.gpu(0) 58 | shape = (5000, 2000) 59 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 60 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 61 | arr_x = ndarray.array(x, ctx=ctx) 62 | arr_y = ndarray.array(y, ctx=ctx) 63 | arr_z = ndarray.empty(shape, ctx=ctx) 64 | gpu_op.matrix_elementwise_add(arr_x, arr_y, arr_z) 65 | z = arr_z.asnumpy() 66 | np.testing.assert_allclose(x + y, z, rtol=1e-5) 67 | 68 | 69 | def test_matrix_elementwise_add_by_const(): 70 | shape = (2000, 3000) 71 | ctx = ndarray.gpu(0) 72 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 73 | val = np.random.uniform(-5, 5) 74 | arr_x = ndarray.array(x, ctx=ctx) 75 | arr_y = ndarray.empty(shape, ctx=ctx) 76 | gpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_y) 77 | y = arr_y.asnumpy() 78 | np.testing.assert_allclose(x + val, y, rtol=1e-5) 79 | 80 | 81 | def test_matrix_elementwise_multiply(): 82 | ctx = ndarray.gpu(0) 83 | shape = (500, 200) 84 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 85 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 86 | arr_x = ndarray.array(x, ctx=ctx) 87 | arr_y = ndarray.array(y, ctx=ctx) 88 | arr_z = ndarray.empty(shape, ctx=ctx) 89 | gpu_op.matrix_elementwise_multiply(arr_x, arr_y, arr_z) 90 | z = arr_z.asnumpy() 91 | np.testing.assert_allclose(x * y, z, rtol=1e-5) 92 | 93 | 94 | def test_matrix_elementwise_sqrt(): 95 | ctx = ndarray.gpu(0) 96 | shape = (500, 200) 97 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 98 | arr_x = ndarray.array(x, ctx=ctx) 99 | gpu_op.matrix_elementwise_sqrt(arr_x, arr_x) 100 | z = arr_x.asnumpy() 101 | np.testing.assert_allclose(np.sqrt(x), z, rtol=1e-5) 102 | 103 | 104 | def test_matrix_elementwise_multiply_by_const(): 105 | shape = (2000, 3000) 106 | ctx = ndarray.gpu(0) 107 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 108 | val = np.random.uniform(-5, 5) 109 | arr_x = ndarray.array(x, ctx=ctx) 110 | arr_y = ndarray.empty(shape, ctx=ctx) 111 | gpu_op.matrix_elementwise_multiply_by_const(arr_x, val, arr_y) 112 | y = arr_y.asnumpy() 113 | np.testing.assert_allclose(x * val, y, rtol=1e-5) 114 | 115 | 116 | def test_matrix_multiply(): 117 | ctx = ndarray.gpu(0) 118 | x = np.random.uniform(0, 10, size=(500, 700)).astype(np.float32) 119 | y = np.random.uniform(0, 10, size=(700, 1000)).astype(np.float32) 120 | arr_x = ndarray.array(x, ctx=ctx) 121 | arr_y = 
ndarray.array(y, ctx=ctx)
122 |     arr_z = ndarray.empty((500, 1000), ctx=ctx)
123 |     gpu_op.matrix_multiply(arr_x, False, arr_y, False, arr_z)
124 |     z = arr_z.asnumpy()
125 |     np.testing.assert_allclose(np.dot(x, y), z, rtol=1e-5)
126 | 
127 |     x = np.random.uniform(0, 10, size=(1000, 500)).astype(np.float32)
128 |     y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
129 |     arr_x = ndarray.array(x, ctx=ctx)
130 |     arr_y = ndarray.array(y, ctx=ctx)
131 |     arr_z = ndarray.empty((1000, 2000), ctx=ctx)
132 |     gpu_op.matrix_multiply(arr_x, False, arr_y, True, arr_z)
133 |     z = arr_z.asnumpy()
134 |     np.testing.assert_allclose(np.dot(x, np.transpose(y)), z, rtol=1e-5)
135 | 
136 |     x = np.random.uniform(0, 10, size=(500, 1000)).astype(np.float32)
137 |     y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32)
138 |     arr_x = ndarray.array(x, ctx=ctx)
139 |     arr_y = ndarray.array(y, ctx=ctx)
140 |     arr_z = ndarray.empty((1000, 2000), ctx=ctx)
141 |     gpu_op.matrix_multiply(arr_x, True, arr_y, True, arr_z)
142 |     z = arr_z.asnumpy()
143 |     np.testing.assert_allclose(np.dot(np.transpose(x), np.transpose(y)), z,
144 |                                rtol=1e-5)
145 | 
146 | 
147 | def test_relu():
148 |     shape = (2000, 2500)
149 |     ctx = ndarray.gpu(0)
150 |     x = np.random.uniform(-1, 1, shape).astype(np.float32)
151 |     arr_x = ndarray.array(x, ctx=ctx)
152 |     arr_y = ndarray.empty(shape, ctx=ctx)
153 |     gpu_op.relu(arr_x, arr_y)
154 |     y = arr_y.asnumpy()
155 |     np.testing.assert_allclose(np.maximum(x, 0).astype(np.float32), y)
156 | 
157 | 
158 | def test_relu_gradient():
159 |     shape = (2000, 2500)
160 |     ctx = ndarray.gpu(0)
161 |     x = np.random.uniform(-1, 1, shape).astype(np.float32)
162 |     grad_x = np.random.uniform(-5, 5, shape).astype(np.float32)
163 |     arr_x = ndarray.array(x, ctx=ctx)
164 |     arr_grad_x = ndarray.array(grad_x, ctx=ctx)
165 |     arr_y = ndarray.empty(shape, ctx=ctx)
166 |     gpu_op.relu_gradient(arr_x, arr_grad_x, arr_y)
167 |     y = arr_y.asnumpy()
168 |     np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y)
169 | 
170 | 
171 | def test_softmax():
172 |     ctx = ndarray.gpu(0)
173 |     shape = (400, 1000)
174 |     x = np.random.uniform(-5, 5, shape).astype(np.float32)
175 |     arr_x = ndarray.array(x, ctx=ctx)
176 |     arr_y = ndarray.empty(shape, ctx=ctx)
177 |     gpu_op.softmax(arr_x, arr_y)
178 |     y = arr_y.asnumpy()
179 |     np.testing.assert_allclose(au.nn.softmax_func(x), y, rtol=1e-5)
180 | 
--------------------------------------------------------------------------------
/tests/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/upul/Aurora/415a80ac5f7083475baca4a2d187cd102ba7a6c5/tests/utils/__init__.py
--------------------------------------------------------------------------------
/tests/utils/gradient_check.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | 
4 | def gradient_check_numpy_expr(func, x, output_gradient, h=1e-5):
5 |     """
6 |     This utility function calculates the numerical gradient of the function
7 |     `func` at `x` using the central-difference approximation.
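    For each element x[idx], the partial derivative is approximated with the
    central difference (func(x + h) - func(x - h)) / (2 * h), perturbing only
    x[idx]. The result is contracted with `output_gradient`, so the returned
    array is the vector-Jacobian product that a backward pass would emit.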
8 |     :param func: the function whose gradient is being checked; maps an ndarray to an ndarray (or memoryview).
9 |     :param x: the point (ndarray) at which the gradient is evaluated; perturbed in place and restored.
10 |     :param output_gradient: gradient of the scalar loss w.r.t. the output of `func`.
11 |     :param h: step size used by the central-difference approximation.
12 |     :return: an ndarray of the same shape as `x` holding the numerical gradient.
13 |     """
14 |     grad = np.zeros_like(x).astype(np.float32)
15 |     it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])  # `it` rather than `iter`, to avoid shadowing the built-in
16 |     while not it.finished:
17 |         idx = it.multi_index
18 |         old_value = x[idx]
19 | 
20 |         # calculate positive value
21 |         x[idx] = old_value + h
22 |         pos = func(x).copy()
23 | 
24 |         # calculate negative value
25 |         x[idx] = old_value - h
26 |         neg = func(x).copy()
27 | 
28 |         # restore
29 |         x[idx] = old_value
30 | 
31 |         # calculate gradient
32 |         # The type of pos and neg will be memoryview if we are testing Cython functions.
33 |         # Therefore, we create numpy arrays by performing the - operation.
34 |         # TODO: is there an alternative that avoids creating numpy arrays from memoryviews?
35 |         grad[idx] = np.sum((np.array(pos) - np.array(neg)) * output_gradient) / (2 * h)
36 |         it.iternext()
37 | 
38 |     return grad
39 | 
--------------------------------------------------------------------------------
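For reference, a minimal sketch of how this checker is used (it mirrors what the
pooling tests above do; the `tests.utils` import path assumes the repository root is
on `sys.path`):

    import numpy as np
    from tests.utils.gradient_check import gradient_check_numpy_expr

    def relu_forward(x):
        # element-wise ReLU; its exact gradient is 1 for positive inputs, 0 otherwise
        return np.maximum(x, 0.0)

    x = np.random.randn(3, 4)
    x[np.abs(x) < 1e-2] += 0.1          # keep entries away from the kink at 0
    output_grad = np.random.randn(3, 4)

    # hand-derived gradient: the upstream gradient flows through positive entries only
    analytical = (x > 0) * output_grad

    # central-difference gradient, contracted with output_grad by the checker
    numerical = gradient_check_numpy_expr(relu_forward, x, output_grad)

    np.testing.assert_array_almost_equal(numerical, analytical, decimal=4)

Because the checker contracts with `output_gradient`, the same call works unchanged
for functions whose output shape differs from their input shape, such as
`max_pool_forward` in the pooling tests above.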