├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── python └── tinyflow │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── _base.cpython-36.pyc │ ├── autodiff.cpython-36.pyc │ ├── gpu_op.cpython-36.pyc │ └── ndarray.cpython-36.pyc │ ├── _base.py │ ├── autodiff.py │ ├── gpu_op.py │ └── ndarray.py ├── src ├── c_runtime_api.cc ├── c_runtime_api.h ├── cpu_device_api.cc ├── cpu_device_api.h ├── cuda_device_api.cc ├── cuda_device_api.h ├── device_api.h ├── dlarray.h ├── gpu_op.cu └── runtime_base.h └── tests ├── autodiff_test.py ├── mnist_dlsys.py └── test_gpu_op.py /.gitignore: -------------------------------------------------------------------------------- 1 | # build directories 2 | build 3 | cmake-build-debug 4 | .pytest_cache 5 | .idea 6 | 7 | *.gz -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.13) 2 | PROJECT(tinyflow LANGUAGES C CXX) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | SET(CUDA_DIR /usr/local/cuda) 5 | 6 | FILE(GLOB CC_SRCS "src/*.cc") 7 | FILE(GLOB CUDA_SRCS "src/*.cu") 8 | FILE(GLOB HEAD_FILES_DIR "src") 9 | 10 | INCLUDE_DIRECTORIES(${CUDA_DIR}/include) 11 | 12 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wall -Wfatal-errors -Wno-unused -Wno-unused-result") 13 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 14 | SET(ARCH "-gencode arch=compute_30,code=sm_30 15 | -gencode arch=compute_35,code=sm_35 16 | -gencode arch=compute_50,code=[sm_50,compute_50] 17 | -gencode arch=compute_52,code=[sm_52,compute_52]") 18 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 --compiler-options '-fPIC' ${ARCH}") 19 | 20 | LINK_DIRECTORIES(${CUDA_DIR}/lib64) 21 | CUDA_ADD_LIBRARY(c_runtime_api SHARED ${CC_SRCS} ${CUDA_SRCS}) 22 | TARGET_LINK_LIBRARIES(c_runtime_api -lcuda -lcudart -lcublas) 23 | 24 | INSTALL(TARGETS c_runtime_api LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/build/lib) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yu Liebing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
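A minimal out-of-source build with the CMakeLists.txt above might look like the following sketch (run from the repository root; it assumes CUDA is installed under `/usr/local/cuda` and that `python3` is on the PATH; the `INSTALL` rule copies the shared library into `build/lib` inside the source tree):

```shell
mkdir build && cd build
cmake ..
make
make install        # copies libc_runtime_api.so into <repo>/build/lib
cd ..
# sanity check: this is the library that python/tinyflow/_base.py loads via ctypes
python3 -c "import ctypes; ctypes.CDLL('build/lib/libc_runtime_api.so', ctypes.RTLD_GLOBAL)"
```

The plain Makefile in the next file is an equivalent alternative that produces the same `build/lib/libc_runtime_api.so` without CMake.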
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CUDA_DIR = /usr/local/cuda 2 | 3 | CC_SRCS := $(wildcard src/*.cc) 4 | CC_OBJS := ${CC_SRCS:src/%.cc=build/obj/%.o} 5 | CUDA_SRCS := $(wildcard src/*.cu) 6 | CUDA_OBJS := ${CUDA_SRCS:src/%.cu=build/obj/%.o} 7 | OBJS := $(CC_OBJS) $(CUDA_OBJS) 8 | 9 | CC = g++ 10 | WARNINGS = -Wall -Wfatal-errors -Wno-unused -Wno-unused-result 11 | CC_FLAGS = -std=c++11 -fPIC $(WARNINGS) -I$(CUDA_DIR)/include 12 | LD_FLAGS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcublas 13 | 14 | NVCC = nvcc 15 | NVCC_FLAGS = -std=c++11 --compiler-options '-fPIC' 16 | ARCH = -gencode arch=compute_30,code=sm_30 \ 17 | -gencode arch=compute_35,code=sm_35 \ 18 | -gencode arch=compute_50,code=[sm_50,compute_50] \ 19 | -gencode arch=compute_52,code=[sm_52,compute_52] 20 | 21 | all: build/lib/libc_runtime_api.so 22 | 23 | build/lib/libc_runtime_api.so: $(OBJS) 24 | @mkdir -p build/lib 25 | $(CC) -shared $^ -o $@ $(LD_FLAGS) 26 | 27 | build/obj/%.o: src/%.cc 28 | @mkdir -p build/obj 29 | $(CC) $(CC_FLAGS) -c $< -o $@ 30 | 31 | build/obj/%.o: src/%.cu 32 | @mkdir -p build/obj 33 | $(NVCC) $(ARCH) $(NVCC_FLAGS) -c $< -o $@ 34 | 35 | clean: 36 | rm -rf build 37 | 38 | .PHONY: clean 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tinyflow is a simple deep learning framework for learning purposes. It supports automatic 2 | differentiation and GPU acceleration. Tinyflow currently provides all the operators needed 3 | to build multilayer perceptron (MLP) models. 4 | 5 | If you want to learn more about the principles behind Tinyflow, the following two blog posts may provide helpful intuition. 6 | + [Automatic Differentiation Based on Computation Graph](https://liebing.org.cn/automatic-differentiation.html) 7 | + [Tinyflow - A Simple Neural Network Framework](https://liebing.org.cn/tinyflow.html) 8 | 9 | # Install 10 | Tinyflow currently only supports running in a 64-bit Linux environment. Requirements: 11 | + gcc >= 4.8; 12 | + cmake >= 3.13 (if you choose to use cmake); 13 | + CUDA 9.0; 14 | + Python 3. 15 | 16 | Download the source code. 17 | ```shell 18 | git clone https://github.com/LB-Yu/tinyflow.git 19 | ``` 20 | 21 | CUDA is usually installed in `/usr/local/cuda`. 22 | If your installation path is different, modify the `CUDA_DIR` variable on the first 23 | line of the Makefile, or the `CUDA_DIR` variable on the 24 | fourth line of CMakeLists.txt, to point to your installation path. 25 | 26 | To compile with the Makefile: 27 | ```shell 28 | cd tinyflow 29 | make 30 | ``` 31 | 32 | To compile with CMake: 33 | ```shell 34 | cd tinyflow 35 | mkdir build && cd build 36 | cmake .. 37 | make 38 | make install 39 | ``` 40 | 41 | # Run the MNIST Example 42 | After compiling the GPU library, we can train an MLP on the MNIST dataset.
43 | ```shell 44 | export PYTHONPATH="/path/to/tinyflow/python:${PYTHONPATH}" 45 | 46 | # see cmd options with 47 | # python tests/mnist_dlsys.py -h 48 | 49 | # run logistic regression on numpy 50 | python tests/mnist_dlsys.py -l -m logreg -c numpy 51 | # run logistic regression on gpu 52 | python tests/mnist_dlsys.py -l -m logreg -c gpu 53 | # run MLP on numpy 54 | python tests/mnist_dlsys.py -l -m mlp -c numpy 55 | # run MLP on gpu 56 | python tests/mnist_dlsys.py -l -m mlp -c gpu 57 | ``` 58 | 59 | # Overview of Module 60 | - python/dlsys/autodiff.py: Implements computation graph, autodiff, GPU/Numpy Executor. 61 | - python/dlsys/gpu_op.py: Exposes Python function to call GPU kernels via ctypes. 62 | - python/dlsys/ndarray.py: Exposes Python GPU array API. 63 | 64 | - src/dlarray.h: header for GPU array. 65 | - src/c_runtime_api.h: C API header for GPU array and GPU kernels. 66 | - src/gpu_op.cu: cuda implementation of kernels 67 | -------------------------------------------------------------------------------- /python/tinyflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__init__.py -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/_base.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/_base.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/autodiff.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/autodiff.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/gpu_op.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/gpu_op.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/ndarray.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/ndarray.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name 3 | """ ctypes library of dlsys and helper functions """ 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import ctypes 8 | 9 | 10 | def _load_lib(): 11 | """Load libary in build/lib.""" 12 | curr_path = 
os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 13 | lib_path = os.path.join(curr_path, '../../build/lib/') 14 | path_to_so_file = os.path.join(lib_path, "libc_runtime_api.so") 15 | lib = ctypes.CDLL(path_to_so_file, ctypes.RTLD_GLOBAL) 16 | return lib 17 | 18 | 19 | # global library instance 20 | _LIB = _load_lib() 21 | 22 | 23 | ################## 24 | # Helper Methods # 25 | ################## 26 | 27 | def check_call(ret): 28 | """Check the return value of C API call 29 | 30 | This function will crash when error occurs. 31 | Wrap every API call with this function 32 | 33 | Parameters 34 | ---------- 35 | ret : int 36 | return value from API calls 37 | """ 38 | assert(ret == 0) 39 | 40 | 41 | def c_array(ctype, values): 42 | """Create ctypes array from a python array 43 | 44 | Parameters 45 | ---------- 46 | ctype : ctypes data type 47 | data type of the array we want to convert to 48 | 49 | values : tuple or list 50 | data content 51 | 52 | Returns 53 | ------- 54 | out : ctypes array 55 | Created ctypes array 56 | """ 57 | return (ctype * len(values))(*values) 58 | -------------------------------------------------------------------------------- /python/tinyflow/autodiff.py: -------------------------------------------------------------------------------- 1 | """ library to take autodiff and execute a computation graph """ 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | from . import ndarray, gpu_op 6 | 7 | 8 | class Node(object): 9 | """Node in a computation graph.""" 10 | def __init__(self): 11 | """Constructor, new node is indirectly created by Op object call method. 12 | 13 | Instance variables 14 | ------------------ 15 | self.inputs: the list of input nodes. 16 | self.op: the associated op object, 17 | e.g. add_op if this node is created by adding two other nodes. 18 | self.const_attr: the add or multiply constant. 19 | e.g. self.const_attr=5 if this node is created by x+5. 20 | self.name: node name for debugging. 21 | """ 22 | self.inputs = [] 23 | self.op = None 24 | self.const_attr = None 25 | self.name = "" 26 | 27 | def __add__(self, other): 28 | """Adding two nodes return a new node.""" 29 | if isinstance(other, Node): 30 | new_node = add_op(self, other) 31 | else: 32 | # Add by a constant stores the constant in new node's const_attr 33 | # 'other' argument is a constant 34 | new_node = add_byconst_op(self, other) 35 | return new_node 36 | 37 | def __mul__(self, other): 38 | """Multiplying two nodes return a new node.""" 39 | if isinstance(other, Node): 40 | new_node = mul_op(self, other) 41 | else: 42 | # Mul by a constant stores the constant in new node's const_attr 43 | # 'other' argument is a constant 44 | new_node = mul_byconst_op(self, other) 45 | return new_node 46 | 47 | # Allow left-hand-side add and multiply. 48 | __radd__ = __add__ 49 | __rmul__ = __mul__ 50 | 51 | def __str__(self): 52 | """Allow print to display node name.""" 53 | return self.name 54 | 55 | 56 | def Variable(name): 57 | """User defined variables in an expression. 58 | e.g. x = Variable(name = "x") 59 | """ 60 | placeholder_node = placeholder_op() 61 | placeholder_node.name = name 62 | return placeholder_node 63 | 64 | 65 | class Op(object): 66 | """Op represents operations performed on nodes.""" 67 | def __call__(self): 68 | """Create a new node and associate the op object with the node. 69 | 70 | Returns 71 | ------- 72 | The new node object. 
73 | """ 74 | new_node = Node() 75 | new_node.op = self 76 | return new_node 77 | 78 | def compute(self, node, input_vals, output_val, use_numpy=True): 79 | """Given values of input nodes, compute the output value. 80 | 81 | Parameters 82 | ---------- 83 | node: node that performs the compute. 84 | input_vals: values of input nodes. 85 | output_val: output value of the node, modified in-place. 86 | use_numpy: bool flag whether to use numpy for compute 87 | """ 88 | raise NotImplementedError 89 | 90 | def gradient(self, node, output_grad): 91 | """Given output gradient, compute partial gradient to each input node. 92 | 93 | Parameters 94 | ---------- 95 | node: node that performs the gradient. 96 | output_grad: output gradient summed from children nodes' contributions 97 | 98 | Returns 99 | ------- 100 | A list of gradient contributions to each input node respectively. 101 | """ 102 | raise NotImplementedError 103 | 104 | def infer_shape(self, node, input_shapes): 105 | """Given shapes of input nodes, compute shape of output node. 106 | 107 | Implementation note: 108 | It's simpler to treat shape of constants as (1,), so that constants can 109 | be stored as a numpy array too and you would need fewer special case 110 | handling. 111 | 112 | Parameters 113 | ---------- 114 | node: node whose shape is being inferred. 115 | input_shapes: shapes of input nodes. 116 | 117 | Returns 118 | ------- 119 | A tuple representing the shape of output node. 120 | """ 121 | raise NotImplementedError 122 | 123 | 124 | class AddOp(Op): 125 | def __call__(self, node_A, node_B): 126 | new_node = Op.__call__(self) 127 | new_node.inputs = [node_A, node_B] 128 | new_node.name = "(%s+%s)" % (node_A.name, node_B.name) 129 | return new_node 130 | 131 | def compute(self, node, input_vals, output_val, use_numpy=True): 132 | assert len(input_vals) == 2 133 | if use_numpy: 134 | # output_val[:] allows modify in-place 135 | output_val[:] = input_vals[0] + input_vals[1] 136 | else: 137 | if input_vals[0].shape == input_vals[1].shape: 138 | gpu_op.matrix_elementwise_add( 139 | input_vals[0], input_vals[1], output_val) 140 | else: 141 | if input_vals[1].shape == (1,): 142 | const_val = input_vals[1].asnumpy()[0] 143 | gpu_op.matrix_elementwise_add_by_const( 144 | input_vals[0], const_val, output_val) 145 | elif input_vals[0].shape == (1,): 146 | const_val = input_vals[0].asnumpy()[0] 147 | gpu_op.matrix_elementwise_add_by_const( 148 | input_vals[1], const_val, output_val) 149 | 150 | def gradient(self, node, output_grad): 151 | return [output_grad, output_grad] 152 | 153 | def infer_shape(self, node, input_shapes): 154 | """Need to handle input_vals[0].shape != input_vals[1].shape""" 155 | if input_shapes[0] == input_shapes[1]: 156 | return input_shapes[0] 157 | elif input_shapes[0] == (1,): 158 | return input_shapes[1] 159 | elif input_shapes[1] == (1,): 160 | return input_shapes[0] 161 | 162 | 163 | class AddByConstOp(Op): 164 | def __call__(self, node_A, const_val): 165 | new_node = Op.__call__(self) 166 | new_node.const_attr = const_val 167 | new_node.inputs = [node_A] 168 | new_node.name = "(%s+%s)" % (node_A.name, str(const_val)) 169 | return new_node 170 | 171 | def compute(self, node, input_vals, output_val, use_numpy=True): 172 | assert len(input_vals) == 1 173 | if use_numpy: 174 | output_val[:] = input_vals[0] + node.const_attr 175 | else: 176 | gpu_op.matrix_elementwise_add_by_const( 177 | input_vals[0], node.const_attr, output_val) 178 | 179 | def gradient(self, node, output_grad): 180 | return [output_grad] 
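    # Illustrative sketch (added; not from the original source): an expression
    # such as
    #     x = Variable(name="x")
    #     y = x + 5
    # goes through Node.__add__ above and yields an AddByConstOp node with
    # const_attr == 5; its gradient simply forwards output_grad to x, since
    # d(x + c)/dx == 1.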
181 | 182 | def infer_shape(self, node, input_shapes): 183 | return input_shapes[0] 184 | 185 | 186 | class MulOp(Op): 187 | def __call__(self, node_A, node_B): 188 | new_node = Op.__call__(self) 189 | new_node.inputs = [node_A, node_B] 190 | new_node.name = "(%s*%s)" % (node_A.name, node_B.name) 191 | return new_node 192 | 193 | def compute(self, node, input_vals, output_val, use_numpy=True): 194 | assert len(input_vals) == 2 195 | if use_numpy: 196 | output_val[:] = input_vals[0] * input_vals[1] 197 | else: 198 | if input_vals[0].shape == input_vals[1].shape: 199 | gpu_op.matrix_elementwise_multiply( 200 | input_vals[0], input_vals[1], output_val) 201 | else: 202 | if input_vals[1].shape == (1,): 203 | const_val = input_vals[1].asnumpy()[0] 204 | gpu_op.matrix_elementwise_multiply_by_const( 205 | input_vals[0], const_val, output_val) 206 | elif input_vals[0].shape == (1,): 207 | const_val = input_vals[0].asnumpy()[0] 208 | gpu_op.matrix_elementwise_multiply_by_const( 209 | input_vals[1], const_val, output_val) 210 | 211 | def gradient(self, node, output_grad): 212 | return [node.inputs[1] * output_grad, node.inputs[0] * output_grad] 213 | 214 | def infer_shape(self, node, input_shapes): 215 | """Need to handle input_vals[0].shape != input_vals[1].shape""" 216 | if input_shapes[0] == input_shapes[1]: 217 | return input_shapes[0] 218 | elif input_shapes[0] == (1,): 219 | return input_shapes[1] 220 | elif input_shapes[1] == (1,): 221 | return input_shapes[0] 222 | 223 | 224 | class MulByConstOp(Op): 225 | def __call__(self, node_A, const_val): 226 | new_node = Op.__call__(self) 227 | new_node.const_attr = const_val 228 | new_node.inputs = [node_A] 229 | new_node.name = "(%s*%s)" % (node_A.name, str(const_val)) 230 | return new_node 231 | 232 | def compute(self, node, input_vals, output_val, use_numpy=True): 233 | assert len(input_vals) == 1 234 | if use_numpy: 235 | output_val[:] = input_vals[0] * node.const_attr 236 | else: 237 | gpu_op.matrix_elementwise_multiply_by_const( 238 | input_vals[0], node.const_attr, output_val) 239 | 240 | def gradient(self, node, output_grad): 241 | return [node.const_attr * output_grad] 242 | 243 | def infer_shape(self, node, input_shapes): 244 | return input_shapes[0] 245 | 246 | 247 | class MatMulOp(Op): 248 | def __call__(self, node_A, node_B, trans_A=False, trans_B=False): 249 | new_node = Op.__call__(self) 250 | new_node.matmul_attr_trans_A = trans_A 251 | new_node.matmul_attr_trans_B = trans_B 252 | new_node.inputs = [node_A, node_B] 253 | new_node.name = "MatMul(%s,%s,%s,%s)" % ( 254 | node_A.name, node_B.name, str(trans_A), str(trans_B)) 255 | return new_node 256 | 257 | def compute(self, node, input_vals, output_val, use_numpy=True): 258 | if use_numpy: 259 | if ((node.matmul_attr_trans_A is False) and 260 | (node.matmul_attr_trans_B is False)): 261 | output_val[:] = np.matmul(input_vals[0], input_vals[1]) 262 | elif ((node.matmul_attr_trans_A is True) and 263 | (node.matmul_attr_trans_B is False)): 264 | output_val[:] = np.matmul( 265 | np.transpose(input_vals[0]), input_vals[1]) 266 | elif ((node.matmul_attr_trans_A is False) and 267 | (node.matmul_attr_trans_B is True)): 268 | output_val[:] = np.matmul( 269 | input_vals[0], np.transpose(input_vals[1])) 270 | elif ((node.matmul_attr_trans_A is True) and 271 | (node.matmul_attr_trans_B is True)): 272 | output_val[:] = np.matmul( 273 | np.transpose(input_vals[0]), np.transpose(input_vals[1])) 274 | else: 275 | gpu_op.matrix_multiply( 276 | input_vals[0], node.matmul_attr_trans_A, 277 | input_vals[1], 
node.matmul_attr_trans_B, 278 | output_val) 279 | 280 | def gradient(self, node, output_grad): 281 | if ((node.matmul_attr_trans_A is False) and 282 | (node.matmul_attr_trans_B is False)): 283 | # if Y=AB, then dA=dY B^T, dB=A^T dY 284 | lhs_grad = matmul_op( 285 | output_grad, node.inputs[1], trans_A=False, trans_B=True) 286 | rhs_grad = matmul_op( 287 | node.inputs[0], output_grad, trans_A=True, trans_B=False) 288 | elif ((node.matmul_attr_trans_A is True) and 289 | (node.matmul_attr_trans_B is False)): 290 | # if Y=A^T B, then dA=(dY B^T)^T=B dY^T, dB=A^T dY 291 | lhs_grad = matmul_op( 292 | node.inputs[1], output_grad, trans_A=False, trans_B=True) 293 | rhs_grad = matmul_op( 294 | node.inputs[0], output_grad, trans_A=True, trans_B=False) 295 | elif ((node.matmul_attr_trans_A is False) and 296 | (node.matmul_attr_trans_B is True)): 297 | # if Y=A B^T, then dA=dY B^T, dB=(A^T dY)^T=dY^T A 298 | lhs_grad = matmul_op( 299 | output_grad, node.inputs[1], trans_A=False, trans_B=True) 300 | rhs_grad = matmul_op( 301 | output_grad, node.inputs[0], trans_A=True, trans_B=False) 302 | elif ((node.matmul_attr_trans_A is True) and 303 | (node.matmul_attr_trans_B is True)): 304 | # if Y=A^T B^T, then dA=(dY B^T)^T=B dY^T, dB=(A^T dY)^T=dY^T A 305 | lhs_grad = matmul_op( 306 | node.inputs[1], output_grad, trans_A=False, trans_B=True) 307 | rhs_grad = matmul_op( 308 | output_grad, node.inputs[0], trans_A=True, trans_B=False) 309 | return [lhs_grad, rhs_grad] 310 | 311 | def infer_shape(self, node, input_shapes): 312 | if node.matmul_attr_trans_A is False and node.matmul_attr_trans_B is False: 313 | return input_shapes[0][0], input_shapes[1][1] 314 | elif node.matmul_attr_trans_A is False and node.matmul_attr_trans_B is True: 315 | return input_shapes[0][0], input_shapes[1][0] 316 | elif node.matmul_attr_trans_A is True and node.matmul_attr_trans_B is False: 317 | return input_shapes[0][1], input_shapes[1][1] 318 | else: 319 | return input_shapes[0][1], input_shapes[1][0] 320 | 321 | 322 | class PlaceholderOp(Op): 323 | def __call__(self): 324 | """Creates a variable node.""" 325 | new_node = Op.__call__(self) 326 | return new_node 327 | 328 | def compute(self, node, input_vals, output_val, use_numpy=True): 329 | assert False, "placeholder %s values provided by feed_dict" % node.name 330 | 331 | def gradient(self, node, output_grad): 332 | return None 333 | 334 | def infer_shape(self, node, input_shapes): 335 | assert False, "placeholder %s shape provided by feed_shape" % node.name 336 | 337 | 338 | class ZerosLikeOp(Op): 339 | def __call__(self, node_A): 340 | """Creates a node that represents np.zeros(node_A.shape).""" 341 | new_node = Op.__call__(self) 342 | new_node.inputs = [node_A] 343 | new_node.name = "Zeroslike(%s)" % node_A.name 344 | return new_node 345 | 346 | def compute(self, node, input_vals, output_val, use_numpy=True): 347 | assert len(input_vals) == 1 348 | if use_numpy: 349 | output_val[:] = np.zeros(input_vals[0].shape) 350 | else: 351 | gpu_op.array_set(output_val, 0) 352 | 353 | def gradient(self, node, output_grad): 354 | return [zeroslike_op(node.inputs[0])] 355 | 356 | def infer_shape(self, node, input_shapes): 357 | """If input_shape is a vector, simpler to return (1,)""" 358 | return input_shapes[0] 359 | 360 | 361 | class OnesLikeOp(Op): 362 | def __call__(self, node_A): 363 | """Creates a node that represents np.ones(node_A.shape).""" 364 | new_node = Op.__call__(self) 365 | new_node.inputs = [node_A] 366 | new_node.name = "Oneslike(%s)" % node_A.name 367 | return new_node 
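    # Illustrative note (added; not from the original source): gradients()
    # later in this file seeds reverse-mode autodiff with
    # oneslike_op(output_node), i.e. the adjoint of the output with respect to
    # itself is an all-ones array of the same shape as the output.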
368 | 369 | def compute(self, node, input_vals, output_val, use_numpy=True): 370 | assert len(input_vals) == 1 371 | if use_numpy: 372 | output_val[:] = np.ones(input_vals[0].shape) 373 | else: 374 | gpu_op.array_set(output_val, 1) 375 | 376 | def gradient(self, node, output_grad): 377 | return [zeroslike_op(node.inputs[0])] 378 | 379 | def infer_shape(self, node, input_shapes): 380 | """If input_shape is a vector, simpler to return (1,)""" 381 | return input_shapes[0] 382 | 383 | 384 | class ReduceSumAxisZeroOp(Op): 385 | def __call__(self, node_A): 386 | """Creates a node that represents np.sum(node_A, axis=0). 387 | Only support common-case axis=0 reduction for simplicity of gradient. 388 | """ 389 | new_node = Op.__call__(self) 390 | new_node.inputs = [node_A] 391 | new_node.name = "ReduceSumAxisZero(%s)" % (node_A.name) 392 | return new_node 393 | 394 | def compute(self, node, input_vals, output_val, use_numpy=True): 395 | assert len(input_vals) == 1 396 | if use_numpy: 397 | assert(isinstance(input_vals[0], np.ndarray)) 398 | output_val[:] = np.sum(input_vals[0], axis=0) 399 | else: 400 | gpu_op.reduce_sum_axis_zero(input_vals[0], output_val) 401 | 402 | def gradient(self, node, output_grad): 403 | return [broadcastto_op(output_grad, node.inputs[0])] 404 | 405 | def infer_shape(self, node, input_shapes): 406 | """summation reduction axis = 0 407 | e.g. (3,4,5)->(4,5) 408 | for vector, simpler to do (3,)->(1,) 409 | """ 410 | assert len(input_shapes) == 1 411 | if len(input_shapes[0]) == 1: 412 | return (1,) 413 | return input_shapes[0][1:] 414 | 415 | 416 | class BroadcastToOp(Op): 417 | def __call__(self, node_A, node_B): 418 | """Creates a node that represents np.broadcast_to(node_A, node_B.shape). 419 | Only support axis=0. e.g. (3,4)->(2,3,4) to make gradient simple. 
420 | """ 421 | new_node = Op.__call__(self) 422 | new_node.inputs = [node_A, node_B] 423 | new_node.name = "BroadcastTo(%s,%s.shape)" % (node_A.name, node_B.name) 424 | return new_node 425 | 426 | def compute(self, node, input_vals, output_val, use_numpy=True): 427 | assert(len(input_vals) == 2) 428 | if use_numpy: 429 | output_val[:] = np.broadcast_to(input_vals[0], input_vals[1].shape) 430 | else: 431 | gpu_op.broadcast_to(input_vals[0], output_val) 432 | 433 | def gradient(self, node, output_grad): 434 | grad_A = reducesumaxiszero_op(output_grad) 435 | grad_B = zeroslike_op(node.inputs[1]) 436 | return [grad_A, grad_B] 437 | 438 | def infer_shape(self, node, input_shapes): 439 | return input_shapes[1] 440 | 441 | 442 | def softmax_func(y): 443 | """Numerically stable softmax.""" 444 | b = y - np.max(y, axis=1, keepdims=True) 445 | expb = np.exp(b) 446 | softmax = expb / np.sum(expb, axis=1, keepdims=True) 447 | return softmax 448 | 449 | 450 | class SoftmaxCrossEntropyOp(Op): 451 | def __call__(self, node_A, node_B): 452 | new_node = Op.__call__(self) 453 | new_node.inputs = [node_A, node_B] 454 | new_node.name = "SoftmaxXEntropy(%s,%s)" % (node_A.name, node_B.name) 455 | return new_node 456 | 457 | def compute(self, node, input_vals, output_val, use_numpy=True): 458 | assert len(input_vals) == 2 459 | y = input_vals[0] 460 | y_ = input_vals[1] 461 | if use_numpy: 462 | softmax = softmax_func(y) 463 | cross_entropy = np.mean( 464 | -np.sum(y_ * np.log(softmax), axis=1), keepdims=True) 465 | output_val[:] = cross_entropy 466 | else: 467 | gpu_op.softmax_cross_entropy(y, y_, output_val) 468 | 469 | def gradient(self, node, output_grad): 470 | grad_A = (softmax_op(node.inputs[0]) + -1 * node.inputs[1])*output_grad 471 | grad_B = zeroslike_op(node.inputs[1]) 472 | return [grad_A, grad_B] 473 | 474 | def infer_shape(self, node, input_shapes): 475 | return (1,) 476 | 477 | 478 | class SoftmaxOp(Op): 479 | def __call__(self, node_A): 480 | new_node = Op.__call__(self) 481 | new_node.inputs = [node_A] 482 | new_node.name = "Softmax(%s)" % (node_A.name) 483 | return new_node 484 | 485 | def compute(self, node, input_vals, output_val, use_numpy=True): 486 | assert len(input_vals) == 1 487 | if use_numpy: 488 | output_val[:] = softmax_func(input_vals[0]) 489 | else: 490 | gpu_op.softmax(input_vals[0], output_val) 491 | 492 | def gradient(self, node, output_grad): 493 | # Do not directly use SoftmaxOp, use SoftmaxCrossEntropyOp instead. 494 | # Not allowing taking 2nd derivative of SoftmaxCrossEntropyOp. 
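        # Note: SoftmaxCrossEntropyOp.gradient above already uses the closed
        # form softmax(y) - y_, so a standalone softmax gradient is not needed
        # for training.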
495 | raise NotImplementedError 496 | 497 | def infer_shape(self, node, input_shapes): 498 | return input_shapes[0] 499 | 500 | 501 | class ReluOp(Op): 502 | def __call__(self, node_A): 503 | new_node = Op.__call__(self) 504 | new_node.inputs = [node_A] 505 | new_node.name = "Relu(%s)" % (node_A.name) 506 | return new_node 507 | 508 | def compute(self, node, input_vals, output_val, use_numpy=True): 509 | assert len(input_vals) == 1 510 | if use_numpy: 511 | output_val[:] = np.maximum(input_vals[0], 0) 512 | else: 513 | gpu_op.relu(input_vals[0], output_val) 514 | 515 | def gradient(self, node, output_grad): 516 | return [relu_gradient_op(node.inputs[0], output_grad)] 517 | 518 | def infer_shape(self, node, input_shapes): 519 | return input_shapes[0] 520 | 521 | 522 | class ReluGradientOp(Op): 523 | def __call__(self, node_A, node_B): 524 | """node_B is output_grad""" 525 | new_node = Op.__call__(self) 526 | new_node.inputs = [node_A, node_B] 527 | new_node.name = "ReluGradient(%s)" % (node_A.name) 528 | return new_node 529 | 530 | def compute(self, node, input_vals, output_val, use_numpy=True): 531 | assert len(input_vals) == 2 532 | if use_numpy: 533 | # heaviside function, 0.5 at x=0 534 | output_val[:] = (np.sign(input_vals[0]) + 1) * 0.5 * input_vals[1] 535 | else: 536 | gpu_op.relu_gradient(input_vals[0], input_vals[1], output_val) 537 | 538 | def gradient(self, node, output_grad): 539 | raise NotImplementedError 540 | 541 | def infer_shape(self, node, input_shapes): 542 | return input_shapes[0] 543 | 544 | 545 | # Create global singletons of operators. 546 | add_op = AddOp() 547 | mul_op = MulOp() 548 | add_byconst_op = AddByConstOp() 549 | mul_byconst_op = MulByConstOp() 550 | matmul_op = MatMulOp() 551 | placeholder_op = PlaceholderOp() 552 | oneslike_op = OnesLikeOp() 553 | zeroslike_op = ZerosLikeOp() 554 | reducesumaxiszero_op = ReduceSumAxisZeroOp() 555 | broadcastto_op = BroadcastToOp() 556 | softmaxcrossentropy_op = SoftmaxCrossEntropyOp() 557 | softmax_op = SoftmaxOp() 558 | relu_op = ReluOp() 559 | relu_gradient_op = ReluGradientOp() 560 | 561 | 562 | class Executor(object): 563 | """Executor computes values for given set of nodes in computation graph.""" 564 | def __init__(self, eval_node_list, ctx=None): 565 | """ 566 | Parameters 567 | ---------- 568 | eval_node_list: list of nodes whose values need to be computed. 569 | ctx: runtime DLContext, default is None which means np.ndarray on cpu 570 | topo_order: list of nodes in topological order 571 | node_to_shape_map: dict from node to shape of the node 572 | node_to_arr_map: dict from node to ndarray.NDArray allocated for node 573 | feed_shapes: shapes of feed_dict from last run(...) 574 | """ 575 | self.eval_node_list = eval_node_list 576 | self.ctx = ctx 577 | self.topo_order = find_topo_sort(self.eval_node_list) 578 | self.node_to_shape_map = None 579 | self.node_to_arr_map = None 580 | self.feed_shapes = None 581 | 582 | def infer_shape(self, feed_shapes): 583 | """Given shapes of feed_dict nodes, infer shape for all nodes in graph. 584 | 585 | Implementation note: 586 | Iteratively calls node.op.infer_shape to infer shapes. 587 | Node shapes stored in self.node_to_shape_map. 588 | 589 | Parameters 590 | ---------- 591 | feed_shapes: node->shapes mapping for feed_dict nodes. 
592 | """ 593 | self.node_to_shape_map = dict(feed_shapes) 594 | for node in self.topo_order: 595 | if node in self.node_to_shape_map: 596 | continue 597 | input_shapes = [self.node_to_shape_map[i] for i in node.inputs] 598 | self.node_to_shape_map[node] = node.op.infer_shape(node, input_shapes) 599 | 600 | def memory_plan(self, feed_shapes): 601 | """Allocates ndarray.NDArray for every node except feed_dict nodes. 602 | 603 | Implementation note: 604 | Option 1: Alloc a ndarray.NDArray per node that persists across run() 605 | Option 2: Implement a memory pool to reuse memory for nodes of same 606 | shapes. More details see Lecture 7. 607 | 608 | For both options, self.node_to_arr_map stores node->NDArray mapping to 609 | allow mapping to persist across multiple executor.run(). 610 | 611 | Hint: use ndarray.empty(shape, ctx=self.ctx) to allocate NDArray. 612 | 613 | Parameters 614 | ---------- 615 | feed_shapes: node->shapes mapping for feed_dict nodes. 616 | """ 617 | self.node_to_arr_map = {} 618 | for node in self.topo_order: 619 | self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx) 620 | 621 | def run(self, feed_dict, convert_to_numpy_ret_vals=False): 622 | """ 623 | Parameters 624 | ---------- 625 | feed_dict: a dictionary of node->np.ndarray supplied by user. 626 | convert_to_numpy_ret_vals: whether to convert ret vals to np.array 627 | 628 | Returns 629 | ------- 630 | A list of values for nodes in eval_node_list. NDArray or np.ndarray. 631 | """ 632 | def are_feed_shapes_equal(sa, sb): 633 | if (not isinstance(sa, dict)) or (not isinstance(sb, dict)): 634 | return False 635 | unmatched_item = set(sa.items()) ^ set(sb.items()) 636 | return len(unmatched_item) == 0 637 | 638 | # Assume self.ctx is None implies numpy array and numpy ops. 639 | use_numpy = self.ctx is None 640 | node_to_val_map = {} 641 | for node, value in feed_dict.items(): 642 | if use_numpy: 643 | # all values passed in feed_dict must be np.ndarray 644 | assert isinstance(value, np.ndarray) 645 | node_to_val_map[node] = value 646 | else: 647 | # convert values to ndarray.NDArray if necessary 648 | if isinstance(value, np.ndarray): 649 | node_to_val_map[node] = ndarray.array(value, ctx=self.ctx) 650 | elif isinstance(value, ndarray.NDArray): 651 | node_to_val_map[node] = value 652 | else: 653 | assert False, "feed_dict value type not supported" 654 | 655 | # collect shapes for all placeholders 656 | feed_shapes = {} 657 | for node in node_to_val_map: 658 | feed_shapes[node] = node_to_val_map[node].shape 659 | 660 | # infer shape if feed_shapes changed since last run 661 | # e.g. call run() on test data after trainng 662 | if (not are_feed_shapes_equal(feed_shapes, self.feed_shapes)): 663 | self.infer_shape(feed_shapes) 664 | self.feed_shapes = feed_shapes 665 | # plan memory if using GPU 666 | if (not use_numpy): 667 | self.memory_plan(feed_shapes) 668 | 669 | # Traverse graph in topo order and compute values for all nodes. 670 | for node in self.topo_order: 671 | if node in node_to_val_map: 672 | # Skip placeholder nodes. Values already provided by feed_dict. 673 | continue 674 | input_vals = [node_to_val_map[n] for n in node.inputs] 675 | if use_numpy: 676 | node_val = np.empty(shape=self.node_to_shape_map[node]) 677 | else: 678 | node_val = self.node_to_arr_map[node] 679 | # node_val is modified in-place whether np.ndarray or NDArray 680 | node.op.compute(node, input_vals, node_val, use_numpy) 681 | node_to_val_map[node] = node_val 682 | 683 | # Collect node values. 
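        # Note: with a GPU context the results below stay on the device as
        # NDArrays unless convert_to_numpy_ret_vals is set, in which case they
        # are copied back to host memory via asnumpy().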
684 | if not use_numpy and convert_to_numpy_ret_vals: 685 | return [node_to_val_map[n].asnumpy() for n in self.eval_node_list] 686 | return [node_to_val_map[n] for n in self.eval_node_list] 687 | 688 | 689 | def gradients(output_node, node_list): 690 | """Take gradient of output node with respect to each node in node_list. 691 | 692 | Parameters 693 | ---------- 694 | output_node: output node that we are taking derivative of. 695 | node_list: list of nodes that we are taking derivative wrt. 696 | 697 | Returns 698 | ------- 699 | A list of gradient values, one for each node in node_list respectively. 700 | 701 | """ 702 | node_to_output_grads_list = {} 703 | node_to_output_grads_list[output_node] = [oneslike_op(output_node)] 704 | node_to_output_grad = {} 705 | # Traverse forward graph in reverse topological order 706 | reverse_topo_order = reversed(find_topo_sort([output_node])) 707 | for node in reverse_topo_order: 708 | output_grad = sum_node_list(node_to_output_grads_list[node]) 709 | node_to_output_grad[node] = output_grad 710 | input_grads_list = node.op.gradient(node, output_grad) 711 | for i in range(len(node.inputs)): 712 | if node.inputs[i] not in node_to_output_grads_list: 713 | node_to_output_grads_list[node.inputs[i]] = [] 714 | # Calculate partial adjoint for input nodes. 715 | node_to_output_grads_list[node.inputs[i]].append( 716 | input_grads_list[i]) 717 | 718 | grad_node_list = [node_to_output_grad[node] for node in node_list] 719 | return grad_node_list 720 | 721 | ################## 722 | # Helper Methods # 723 | ################## 724 | 725 | 726 | def find_topo_sort(node_list): 727 | """Given a list of nodes, return a topo ordering of nodes ending in them. 728 | 729 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 730 | going backwards based on input edges. Since a node is added to the ordering 731 | after all its predecessors are traversed due to post-order DFS, we get a 732 | topological sort. 733 | 734 | """ 735 | visited = set() 736 | topo_order = [] 737 | for node in node_list: 738 | topo_sort_dfs(node, visited, topo_order) 739 | return topo_order 740 | 741 | 742 | def topo_sort_dfs(node, visited, topo_order): 743 | """Post-order DFS""" 744 | if node in visited: 745 | return 746 | visited.add(node) 747 | for n in node.inputs: 748 | topo_sort_dfs(n, visited, topo_order) 749 | topo_order.append(node) 750 | 751 | 752 | def sum_node_list(node_list): 753 | """Custom sum func to avoid creating redundant nodes in Python sum func.""" 754 | from operator import add 755 | from functools import reduce 756 | return reduce(add, node_list) 757 | 758 | 759 | def broadcast_rule(shape_a, shape_b): 760 | """Return output shape of broadcast shape_a, shape_b. 761 | e.g. 
broadcast_rule((3,2), (4,3,2)) 762 | returns output_shape = (4,3,2) 763 | 764 | Check out explanations and more examples at 765 | https://docs.scipy.org/doc/numpy-1.10.0/user/basics.broadcasting.html 766 | http://eli.thegreenplace.net/2015/broadcasting-arrays-in-numpy/ 767 | """ 768 | assert(isinstance(shape_a, tuple)) 769 | assert(isinstance(shape_b, tuple)) 770 | if len(shape_a) > len(shape_b): 771 | longer_shape, shorter_shape = shape_a, shape_b 772 | else: 773 | longer_shape, shorter_shape = shape_b, shape_a 774 | len_diff = len(longer_shape) - len(shorter_shape) 775 | for i in range(len_diff): 776 | # pad with leading 1s 777 | shorter_shape = (1,) + shorter_shape 778 | assert len(shorter_shape) == len(longer_shape) 779 | output_shape = list(longer_shape) 780 | for i in range(len(output_shape)): 781 | assert (shorter_shape[i] == longer_shape[i]) \ 782 | or (shorter_shape[i] == 1) \ 783 | or (longer_shape[i] == 1) 784 | output_shape[i] = max(shorter_shape[i], longer_shape[i]) 785 | return tuple(output_shape) 786 | -------------------------------------------------------------------------------- /python/tinyflow/gpu_op.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from ._base import _LIB 5 | from . import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float(value)) 11 | 12 | 13 | def broadcast_to(in_arr, out_arr): 14 | assert isinstance(in_arr, _nd.NDArray) 15 | assert isinstance(out_arr, _nd.NDArray) 16 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle) 17 | 18 | 19 | def reduce_sum_axis_zero(in_arr, out_arr): 20 | assert isinstance(in_arr, _nd.NDArray) 21 | assert isinstance(out_arr, _nd.NDArray) 22 | _LIB.DLGpuReduceSumAxisZero(in_arr.handle, out_arr.handle) 23 | 24 | 25 | def matrix_elementwise_add(matA, matB, matC): 26 | assert isinstance(matA, _nd.NDArray) 27 | assert isinstance(matB, _nd.NDArray) 28 | assert isinstance(matC, _nd.NDArray) 29 | _LIB.DLGpuMatrixElementwiseAdd(matA.handle, matB.handle, matC.handle) 30 | 31 | 32 | def matrix_elementwise_add_by_const(in_mat, val, out_mat): 33 | assert isinstance(in_mat, _nd.NDArray) 34 | assert isinstance(out_mat, _nd.NDArray) 35 | _LIB.DLGpuMatrixElementwiseAddByConst( 36 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 37 | 38 | 39 | def matrix_elementwise_multiply(matA, matB, matC): 40 | assert isinstance(matA, _nd.NDArray) 41 | assert isinstance(matB, _nd.NDArray) 42 | assert isinstance(matC, _nd.NDArray) 43 | _LIB.DLGpuMatrixElementwiseMultiply( 44 | matA.handle, matB.handle, matC.handle) 45 | 46 | 47 | def matrix_elementwise_multiply_by_const(in_mat, val, out_mat): 48 | assert isinstance(in_mat, _nd.NDArray) 49 | assert isinstance(out_mat, _nd.NDArray) 50 | _LIB.DLGpuMatrixMultiplyByConst( 51 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 52 | 53 | 54 | def matrix_multiply(matA, transA, matB, transB, matC): 55 | assert isinstance(matA, _nd.NDArray) 56 | assert isinstance(matB, _nd.NDArray) 57 | assert isinstance(matC, _nd.NDArray) 58 | _LIB.DLGpuMatrixMultiply( 59 | matA.handle, transA, matB.handle, transB, matC.handle) 60 | 61 | 62 | def relu(in_arr, out_arr): 63 | assert isinstance(in_arr, _nd.NDArray) 64 | assert isinstance(out_arr, _nd.NDArray) 65 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle) 66 | 67 | 68 | def relu_gradient(in_arr, in_grad_arr, out_arr): 69 | assert isinstance(in_arr, _nd.NDArray) 70 | assert 
isinstance(in_grad_arr, _nd.NDArray) 71 | assert isinstance(out_arr, _nd.NDArray) 72 | _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle) 73 | 74 | 75 | def softmax(in_arr, out_arr): 76 | assert isinstance(in_arr, _nd.NDArray) 77 | assert isinstance(out_arr, _nd.NDArray) 78 | _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle) 79 | 80 | 81 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr): 82 | assert isinstance(in_arr_a, _nd.NDArray) 83 | assert isinstance(in_arr_b, _nd.NDArray) 84 | assert isinstance(out_arr, _nd.NDArray) 85 | _LIB.DLGpuSoftmaxCrossEntropy( 86 | in_arr_a.handle, in_arr_b.handle, out_arr.handle) 87 | -------------------------------------------------------------------------------- /python/tinyflow/ndarray.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ._base import _LIB, check_call, c_array 4 | import ctypes 5 | import numpy as np 6 | 7 | 8 | class DLContext(ctypes.Structure): 9 | """DL context structure.""" 10 | _fields_ = [("device_id", ctypes.c_int), 11 | ("device_type", ctypes.c_int)] 12 | 13 | MASK2STR = { 14 | 1: 'cpu', 15 | 2: 'gpu', 16 | } 17 | 18 | def __init__(self, device_id, device_type): 19 | super(DLContext, self).__init__() 20 | self.device_id = device_id 21 | self.device_type = device_type 22 | 23 | def __repr__(self): 24 | return "%s(%d)" % ( 25 | DLContext.MASK2STR[self.device_type], self.device_id) 26 | 27 | 28 | class DLArray(ctypes.Structure): 29 | """DLArray in C API""" 30 | _fields_ = [("data", ctypes.c_void_p), 31 | ("ctx", DLContext), 32 | ("ndim", ctypes.c_int), 33 | ("shape", ctypes.POINTER(ctypes.c_int64))] 34 | 35 | 36 | DLArrayHandle = ctypes.POINTER(DLArray) 37 | 38 | 39 | def cpu(dev_id=0): 40 | """Construct a CPU device 41 | Parameters 42 | ---------- 43 | dev_id : int, optional 44 | The integer device id 45 | """ 46 | return DLContext(dev_id, 1) 47 | 48 | 49 | def gpu(dev_id=0): 50 | """Construct a GPU device 51 | Parameters 52 | ---------- 53 | dev_id : int, optional 54 | The integer device id 55 | """ 56 | return DLContext(dev_id, 2) 57 | 58 | 59 | def is_gpu_ctx(ctx): 60 | """Return whether the context is a GPU context. 61 | Parameters 62 | ---------- 63 | ctx : DLContext 64 | The query context 65 | """ 66 | return ctx and ctx.device_type == 2 67 | 68 | 69 | class NDArray(object): 70 | """Lightweight NDArray class of DL runtime. 71 | Strictly speaking, this is only an array container (a buffer object). 72 | No arithmetic operations are defined.
73 | """ 74 | __slots__ = ["handle"] 75 | 76 | # pylint: disable=no-member 77 | def __init__(self, handle): 78 | """Initialize the function with handle 79 | Parameters 80 | ---------- 81 | handle : DLArrayHandle 82 | the handle to the underlying C++ DLArray 83 | """ 84 | self.handle = handle 85 | 86 | def __del__(self): 87 | check_call(_LIB.DLArrayFree(self.handle)) 88 | 89 | @property 90 | def shape(self): 91 | """Shape of this array""" 92 | return tuple(self.handle.contents.shape[i] 93 | for i in range(self.handle.contents.ndim)) 94 | 95 | @property 96 | def ctx(self): 97 | """context of this array""" 98 | return self.handle.contents.ctx 99 | 100 | def __setitem__(self, in_slice, value): 101 | """Set ndarray value""" 102 | if (not isinstance(in_slice, slice) or 103 | in_slice.start is not None 104 | or in_slice.stop is not None): 105 | raise ValueError('Array only support set from numpy array') 106 | if isinstance(value, NDArray): 107 | if value.handle is not self.handle: 108 | value.copyto(self) 109 | elif isinstance(value, (np.ndarray, np.generic)): 110 | self._sync_copyfrom(value) 111 | else: 112 | raise TypeError('type %s not supported' % str(type(value))) 113 | 114 | def _sync_copyfrom(self, source_array): 115 | """Peform an synchronize copy from the array. 116 | Parameters 117 | ---------- 118 | source_array : array_like 119 | The data source we should like to copy from. 120 | """ 121 | if not isinstance(source_array, np.ndarray): 122 | try: 123 | source_array = np.array(source_array, dtype=np.float32) 124 | except: 125 | raise TypeError('array must be an array_like data,' + 126 | 'type %s is not supported' 127 | % str(type(source_array))) 128 | source_array = np.ascontiguousarray(source_array, dtype=np.float32) 129 | if source_array.shape != self.shape: 130 | raise ValueError('array shape do not match the shape of NDArray') 131 | source_arr, shape = NDArray._numpyasarray(source_array) 132 | check_call(_LIB.DLArrayCopyFromTo( 133 | ctypes.byref(source_arr), self.handle, None)) 134 | # de-allocate shape until now 135 | _ = shape 136 | 137 | @staticmethod 138 | def _numpyasarray(np_data): 139 | """Return a DLArray representation of a numpy array.""" 140 | data = np_data 141 | assert data.flags['C_CONTIGUOUS'] 142 | arr = DLArray() 143 | shape = c_array(ctypes.c_int64, data.shape) 144 | arr.data = data.ctypes.data_as(ctypes.c_void_p) 145 | arr.shape = shape 146 | arr.ndim = data.ndim 147 | # CPU device 148 | arr.ctx = cpu(0) 149 | return arr, shape 150 | 151 | def asnumpy(self): 152 | """Convert this array to numpy array 153 | Returns 154 | ------- 155 | np_arr : numpy.ndarray 156 | The corresponding numpy array. 157 | """ 158 | np_arr = np.empty(self.shape, dtype=np.float32) 159 | arr, shape = NDArray._numpyasarray(np_arr) 160 | check_call(_LIB.DLArrayCopyFromTo( 161 | self.handle, ctypes.byref(arr), None)) 162 | _ = shape 163 | return np_arr 164 | 165 | def copyto(self, target): 166 | """Copy array to target 167 | Parameters 168 | ---------- 169 | target : NDArray 170 | The target array to be copied, must have same shape as this array. 171 | """ 172 | if isinstance(target, DLContext): 173 | target = empty(self.shape, target) 174 | if isinstance(target, NDArray): 175 | check_call(_LIB.DLArrayCopyFromTo( 176 | self.handle, target.handle, None)) 177 | else: 178 | raise ValueError("Unsupported target type %s" % str(type(target))) 179 | return target 180 | 181 | 182 | def array(arr, ctx=cpu(0)): 183 | """Create an array from source arr. 
184 | Parameters 185 | ---------- 186 | arr : numpy.ndarray 187 | The array to be copied from 188 | ctx : DLContext, optional 189 | The device context to create the array 190 | Returns 191 | ------- 192 | ret : NDArray 193 | The created array 194 | """ 195 | if not isinstance(arr, np.ndarray): 196 | arr = np.array(arr) 197 | ret = empty(arr.shape, ctx) 198 | ret._sync_copyfrom(arr) 199 | return ret 200 | 201 | 202 | def empty(shape, ctx=cpu(0)): 203 | """Create an empty array given shape and device 204 | Parameters 205 | ---------- 206 | shape : tuple of int 207 | The shape of the array 208 | ctx : DLContext 209 | The context of the array 210 | Returns 211 | ------- 212 | arr : ndarray 213 | The array dlsys supported. 214 | """ 215 | shape = c_array(ctypes.c_int64, shape) 216 | ndim = ctypes.c_int(len(shape)) 217 | handle = DLArrayHandle() 218 | check_call(_LIB.DLArrayAlloc( 219 | shape, ndim, ctx, ctypes.byref(handle))) 220 | return NDArray(handle) 221 | -------------------------------------------------------------------------------- /src/c_runtime_api.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file c_runtime_api.cc 3 | * \brief Device specific implementations 4 | */ 5 | #include "./c_runtime_api.h" 6 | #include "./cpu_device_api.h" 7 | #include "./cuda_device_api.h" 8 | #include "./runtime_base.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace tinyflow { 19 | namespace runtime { 20 | 21 | class DeviceAPIManager { 22 | public: 23 | static const int kMaxDeviceAPI = 8; 24 | // Get API 25 | static DeviceAPI *Get(DLContext ctx) { 26 | return Global()->GetAPI(ctx.device_type); 27 | } 28 | 29 | private: 30 | std::array api_; 31 | DeviceAPIManager() { 32 | std::fill(api_.begin(), api_.end(), nullptr); 33 | static CPUDeviceAPI cpu_device_api_inst; 34 | static CUDADeviceAPI gpu_device_api_inst; 35 | api_[kCPU] = static_cast(&cpu_device_api_inst); 36 | api_[kGPU] = static_cast(&gpu_device_api_inst); 37 | } 38 | // Get global static variable. 39 | static DeviceAPIManager *Global() { 40 | static DeviceAPIManager inst; 41 | return &inst; 42 | } 43 | // Get API. 
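  // Note: returns the DeviceAPI instance registered for the requested device
  // type in the constructor above; aborts if no backend was registered.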
44 | DeviceAPI *GetAPI(DLDeviceType type) { 45 | if (api_[type] == nullptr) { 46 | std::cerr << "Device API not supported" << std::endl; 47 | exit(EXIT_FAILURE); 48 | } 49 | return api_[type]; 50 | } 51 | }; 52 | 53 | inline DLArray *DLArrayCreate_() { 54 | DLArray *arr = new DLArray(); 55 | arr->shape = nullptr; 56 | arr->ndim = 0; 57 | arr->data = nullptr; 58 | return arr; 59 | } 60 | 61 | inline void DLArrayFree_(DLArray *arr) { 62 | if (arr != nullptr) { 63 | // ok to delete nullptr 64 | delete[] arr->shape; 65 | if (arr->data != nullptr) { 66 | DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(arr->ctx, arr->data); 67 | } 68 | } 69 | delete arr; 70 | } 71 | 72 | inline size_t GetDataSize(DLArray *arr) { 73 | size_t size = 1; 74 | for (index_t i = 0; i < arr->ndim; ++i) { 75 | size *= arr->shape[i]; 76 | } 77 | // assume 32-bit float 78 | size *= 4; 79 | return size; 80 | } 81 | 82 | inline size_t GetDataAlignment(DLArray *arr) { 83 | // assume 32-bit float 84 | return 8; 85 | } 86 | 87 | } // namespace runtime 88 | } // namespace tinyflow 89 | 90 | using namespace tinyflow::runtime; 91 | 92 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx, 93 | DLArrayHandle *out) { 94 | DLArray *arr = nullptr; 95 | API_BEGIN(); 96 | // shape 97 | arr = DLArrayCreate_(); 98 | // ndim 99 | arr->ndim = ndim; 100 | index_t *shape_copy = new index_t[ndim]; 101 | std::copy(shape, shape + ndim, shape_copy); 102 | arr->shape = shape_copy; 103 | // ctx 104 | arr->ctx = ctx; 105 | size_t size = GetDataSize(arr); 106 | size_t alignment = GetDataAlignment(arr); 107 | arr->data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, size, alignment); 108 | *out = arr; 109 | API_END_HANDLE_ERROR(DLArrayFree_(arr)); 110 | } 111 | 112 | int DLArrayFree(DLArrayHandle handle) { 113 | API_BEGIN(); 114 | DLArray *arr = handle; 115 | DLArrayFree_(arr); 116 | API_END(); 117 | } 118 | 119 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to, 120 | DLStreamHandle stream) { 121 | API_BEGIN(); 122 | size_t from_size = GetDataSize(from); 123 | size_t to_size = GetDataSize(to); 124 | // The size must exactly match 125 | assert(from_size == to_size); 126 | DLContext ctx = from->ctx; 127 | if (ctx.device_type == kCPU) { 128 | ctx = to->ctx; 129 | } else { 130 | // Can not copy across different ctx types directly 131 | assert((to->ctx.device_type == kCPU) || 132 | (to->ctx.device_type == from->ctx.device_type)); 133 | } 134 | DeviceAPIManager::Get(ctx)->CopyDataFromTo(from->data, to->data, from_size, 135 | from->ctx, to->ctx, stream); 136 | API_END(); 137 | } 138 | -------------------------------------------------------------------------------- /src/c_runtime_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file c_runtime_api.h 3 | * \brief DL runtime library. 4 | * 5 | */ 6 | 7 | #ifndef TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 8 | #define TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 9 | 10 | #ifdef __cplusplus 11 | #define TINYFLOW_EXTERN_C extern "C" 12 | #else 13 | #define TINYFLOW_EXTERN_C 14 | #endif 15 | 16 | #include "dlarray.h" 17 | #include 18 | #include 19 | 20 | TINYFLOW_EXTERN_C { 21 | /*! \brief type of array index. */ 22 | typedef int64_t index_t; 23 | 24 | /*! \brief the array handle */ 25 | typedef DLArray *DLArrayHandle; 26 | /*! 27 | * \brief The stream that is specific to device 28 | * can be NULL, which indicates the default one. 29 | */ 30 | typedef void *DLStreamHandle; 31 | 32 | // Array related apis for quick proptying 33 | /*! 
34 | * \brief Allocate a nd-array's memory, 35 | * including space of shape, of given spec. 36 | * 37 | * \param shape The shape of the array, the data content will be copied to out 38 | * \param ndim The number of dimension of the array. 39 | * \param ctx The ctx this array sits on. 40 | * \param out The output handle. 41 | * \return 0 when success, -1 when failure happens 42 | */ 43 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx, 44 | DLArrayHandle *out); 45 | 46 | /*! 47 | * \brief Free the DL Array. 48 | * \param handle The array handle to be freed. 49 | * \return 0 when success, -1 when failure happens 50 | */ 51 | int DLArrayFree(DLArrayHandle handle); 52 | 53 | /*! 54 | * \brief Copy the array, both from and to must be valid during the copy. 55 | * \param from The array to be copied from. 56 | * \param to The target space. 57 | * \param stream The stream where the copy happens, can be NULL. 58 | * \return 0 when success, -1 when failure happens 59 | */ 60 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to, 61 | DLStreamHandle stream); 62 | 63 | /*! 64 | * \brief Set all array elements to given value. 65 | * \param arr The array to be Set. 66 | * \param value The target value. 67 | * \return 0 when success, -1 when failure happens 68 | */ 69 | int DLGpuArraySet(DLArrayHandle arr, float value); 70 | 71 | /*! 72 | * \brief Broadcast input array to output array. 73 | * \param input The input array. 74 | * \param output The output array. 75 | * \return 0 when success, -1 when failure happens 76 | */ 77 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output); 78 | 79 | /*! 80 | * \brief Reduce sum input array by axis=0 and store to output. 81 | * \param input The input array. 82 | * \param output The output array. 83 | * \return 0 when success, -1 when failure happens 84 | */ 85 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output); 86 | 87 | /*! 88 | * \brief Elementwise add two matrices and store to output. 89 | * \param matA The left input array. 90 | * \param matB The right input array. 91 | * \param output The output array. 92 | * \return 0 when success, -1 when failure happens 93 | */ 94 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 95 | const DLArrayHandle matB, DLArrayHandle output); 96 | 97 | /*! 98 | * \brief Add matrix by const and store to output. 99 | * \param input The input array. 100 | * \param val The constant. 101 | * \param output The output array. 102 | * \return 0 when success, -1 when failure happens 103 | */ 104 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 105 | DLArrayHandle output); 106 | 107 | /*! 108 | * \brief Elementwise multiply two matrices and store to output. 109 | * \param matA The left input array. 110 | * \param matB The right input array. 111 | * \param output The output array. 112 | * \return 0 when success, -1 when failure happens 113 | */ 114 | int DLGpuMatrixElementwiseMultiply( 115 | const DLArrayHandle matA, const DLArrayHandle matB, DLArrayHandle output); 116 | 117 | /*! 118 | * \brief Multiply matrix by const and store to output. 119 | * \param input The input array. 120 | * \param val The constant. 121 | * \param output The output array. 122 | * \return 0 when success, -1 when failure happens 123 | */ 124 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 125 | DLArrayHandle output); 126 | 127 | /*! 128 | * \brief Matrix multiply two matrices and store to output. 129 | * \param matA The left input array. 
130 | * \param transposeA Whether matA needs to be transposed 131 | * \param matB The right input array. 132 | * \param transposeB Whether matB needs to be transposed 133 | * \param output The output array. 134 | * \return 0 when success, -1 when failure happens 135 | */ 136 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 137 | const DLArrayHandle matB, bool transposeB, 138 | DLArrayHandle matC); 139 | 140 | /*! 141 | * \brief Compute relu on all array elements, and store to output. 142 | * \param input The input array. 143 | * \param output The output value. 144 | * \return 0 when success, -1 when failure happens 145 | */ 146 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output); 147 | 148 | /*! 149 | * \brief Compute relu gradient, and store to output. 150 | * \param input The input array. 151 | * \param in_grad The input gradients value. 152 | * \param output The output array. 153 | * \return 0 when success, -1 when failure happens 154 | */ 155 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 156 | DLArrayHandle output); 157 | 158 | /*! 159 | * \brief Compute softmax on matrix, and store to output. 160 | * \param input The input array. 161 | * \param output The output value. 162 | * \return 0 when success, -1 when failure happens 163 | */ 164 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output); 165 | 166 | /*! 167 | * \brief Compute softmax_cross_entropy. 168 | * np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 169 | * \param input_a The y array. 170 | * \param input_b The y_ array. 171 | * \param output The output value. 172 | * \return 0 when success, -1 when failure happens 173 | */ 174 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 175 | const DLArrayHandle input_b, 176 | DLArrayHandle output); 177 | } // TINYFLOW_EXTERN_C 178 | 179 | #endif // TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 180 | -------------------------------------------------------------------------------- /src/cpu_device_api.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file cpu_device_api.cc 3 | */ 4 | #include "./cpu_device_api.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace tinyflow { 10 | namespace runtime { 11 | 12 | void *CPUDeviceAPI::AllocDataSpace(DLContext ctx, size_t size, 13 | size_t alignment) { 14 | // std::cout << "allocating cpu data" << std::endl; 15 | void *ptr; 16 | int ret = posix_memalign(&ptr, alignment, size); 17 | if (ret != 0) 18 | throw std::bad_alloc(); 19 | return ptr; 20 | } 21 | 22 | void CPUDeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) { free(ptr); } 23 | 24 | void CPUDeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size, 25 | DLContext ctx_from, DLContext ctx_to, 26 | DLStreamHandle stream) { 27 | // std::cout << "copying cpu data" << std::endl; 28 | memcpy(to, from, size); 29 | } 30 | 31 | void CPUDeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {} 32 | 33 | } // namespace runtime 34 | } // namespace tinyflow 35 | -------------------------------------------------------------------------------- /src/cpu_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 
2 |  * \file cpu_device_api.h
3 |  * \brief Device specific API
4 |  */
5 | #ifndef TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
6 | #define TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
7 | 
8 | #include "c_runtime_api.h"
9 | #include "device_api.h"
10 | #include
11 | #include
12 | 
13 | namespace tinyflow {
14 | namespace runtime {
15 | 
16 | class CPUDeviceAPI : public DeviceAPI {
17 | public:
18 |   void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
19 | 
20 |   void FreeDataSpace(DLContext ctx, void *ptr) final;
21 | 
22 |   void CopyDataFromTo(const void *from, void *to, size_t size,
23 |                       DLContext ctx_from, DLContext ctx_to,
24 |                       DLStreamHandle stream) final;
25 | 
26 |   void StreamSync(DLContext ctx, DLStreamHandle stream) final;
27 | };
28 | 
29 | } // namespace runtime
30 | } // namespace tinyflow
31 | #endif // TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
32 | 
--------------------------------------------------------------------------------
/src/cuda_device_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  * \file cuda_device_api.cc
3 |  * \brief GPU specific API
4 |  */
5 | 
6 | #include "./cuda_device_api.h"
7 | #include <cassert>
8 | #include <cuda_runtime.h>
9 | #include <iostream>
10 | 
11 | #define CUDA_CALL(func) \
12 |   { \
13 |     cudaError_t e = (func); \
14 |     assert((e == cudaSuccess) || (e == cudaErrorCudartUnloading)); \
15 |   }
16 | 
17 | namespace tinyflow {
18 | namespace runtime {
19 | 
20 | static void GPUCopy(const void *from, void *to, size_t size,
21 |                     cudaMemcpyKind kind, cudaStream_t stream) {
22 |   if (stream != 0) {
23 |     CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
24 |   } else {
25 |     CUDA_CALL(cudaMemcpy(to, from, size, kind));
26 |   }
27 | }
28 | 
29 | void *CUDADeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
30 |                                     size_t alignment) {
31 |   // std::cout << "allocating cuda data" << std::endl;
32 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
33 |   assert((256 % alignment) == 0U); // << "CUDA space is aligned at 256 bytes";
34 |   void *ret;
35 |   CUDA_CALL(cudaMalloc(&ret, size));
36 |   return ret;
37 | }
38 | 
39 | void CUDADeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) {
40 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
41 |   CUDA_CALL(cudaFree(ptr));
42 | }
43 | 
44 | void CUDADeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
45 |                                    DLContext ctx_from, DLContext ctx_to,
46 |                                    DLStreamHandle stream) {
47 |   // std::cout << "copying cuda data" << std::endl;
48 |   cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
49 |   if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
50 |     CUDA_CALL(cudaSetDevice(ctx_from.device_id));
51 |     if (ctx_from.device_id == ctx_to.device_id) {
52 |       GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
53 |     } else {
54 |       cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size,
55 |                           cu_stream);
56 |     }
57 |   } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
58 |     CUDA_CALL(cudaSetDevice(ctx_from.device_id));
59 |     GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
60 |   } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
61 |     CUDA_CALL(cudaSetDevice(ctx_to.device_id));
62 |     GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
63 |   } else {
64 |     std::cerr << "expect copy from/to GPU or between GPU" << std::endl;
65 |   }
66 | }
67 | 
68 | void CUDADeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {
69 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
70 |   CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
71 | }
72 | 
73 | } // namespace runtime
74 | }
// namespace tinyflow 75 | -------------------------------------------------------------------------------- /src/cuda_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file device_api.h 3 | * \brief Device specific API 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 6 | #define TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include "device_api.h" 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace tinyflow { 16 | namespace runtime { 17 | 18 | class CUDADeviceAPI : public DeviceAPI { 19 | public: 20 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 21 | 22 | void FreeDataSpace(DLContext ctx, void *ptr) final; 23 | 24 | void CopyDataFromTo(const void *from, void *to, size_t size, 25 | DLContext ctx_from, DLContext ctx_to, 26 | DLStreamHandle stream) final; 27 | 28 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 29 | }; 30 | 31 | } // namespace runtime 32 | } // namespace tinyflow 33 | #endif // TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 34 | -------------------------------------------------------------------------------- /src/device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file device_api.h 3 | * \brief Device specific API 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_DEVICE_API_H_ 6 | #define TINYFLOW_RUNTIME_DEVICE_API_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include 10 | #include 11 | 12 | namespace tinyflow { 13 | namespace runtime { 14 | 15 | class DeviceAPI { 16 | public: 17 | /*! \brief virtual destructor */ 18 | virtual ~DeviceAPI() {} 19 | /*! 20 | * \brief Allocate a data space on device. 21 | * \param ctx The device context to perform operation. 22 | * \param size The size of the memory 23 | * \param alignment The alignment of the memory. 24 | * \return The allocated device pointer 25 | */ 26 | virtual void *AllocDataSpace(DLContext ctx, size_t size, 27 | size_t alignment) = 0; 28 | /*! 29 | * \brief Free a data space on device. 30 | * \param ctx The device context to perform operation. 31 | * \param ptr The data space. 32 | * \tparam xpu The device mask. 33 | */ 34 | virtual void FreeDataSpace(DLContext ctx, void *ptr) = 0; 35 | /*! 36 | * \brief copy data from one place to another 37 | * \param dev The device to perform operation. 38 | * \param from The source array. 39 | * \param to The target array. 40 | * \param size The size of the memory 41 | * \param ctx_from The source context 42 | * \param ctx_to The target context 43 | */ 44 | virtual void CopyDataFromTo(const void *from, void *to, size_t size, 45 | DLContext ctx_from, DLContext ctx_to, 46 | DLStreamHandle stream) = 0; 47 | /*! 48 | * \brief Synchronize the stream 49 | * \param ctx The context to perform operation. 50 | * \param stream The stream to be sync. 51 | */ 52 | virtual void StreamSync(DLContext ctx, DLStreamHandle stream) = 0; 53 | }; 54 | 55 | } // namespace runtime 56 | } // namespace tinyflow 57 | #endif // TINYFLOW_RUNTIME_DEVICE_API_H_ 58 | -------------------------------------------------------------------------------- /src/dlarray.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file dlarray.h 3 | * \brief Header that defines array struct. 
4 | */ 5 | #ifndef TINYFLOW_H_ 6 | #define TINYFLOW_H_ 7 | 8 | #ifdef __cplusplus 9 | #define TINYFLOW_EXTERN_C extern "C" 10 | #else 11 | #define TINYFLOW_EXTERN_C 12 | #endif 13 | 14 | #include 15 | #include 16 | 17 | TINYFLOW_EXTERN_C { 18 | /*! 19 | * \brief The device type in DLContext. 20 | */ 21 | typedef enum { 22 | kCPU = 1, 23 | kGPU = 2, 24 | } DLDeviceType; 25 | 26 | /*! 27 | * \brief A Device context for array. 28 | */ 29 | typedef struct { 30 | /*! \brief The device index */ 31 | int device_id; 32 | /*! \brief The device type used in the device. */ 33 | DLDeviceType device_type; 34 | } DLContext; 35 | 36 | /*! 37 | * \brief Plain C Array object, does not manage memory. 38 | */ 39 | typedef struct { 40 | /*! 41 | * \brief The opaque data pointer points to the allocated data. 42 | * This will be CUDA device pointer or cl_mem handle in OpenCL. 43 | * This pointer is always aligns to 256 bytes as in CUDA. 44 | */ 45 | void *data; 46 | /*! \brief The device context of the tensor */ 47 | DLContext ctx; 48 | /*! \brief Number of dimensions */ 49 | int ndim; 50 | /*! \brief The shape of the tensor */ 51 | int64_t *shape; 52 | } DLArray; 53 | 54 | } // TINYFLOW_EXTERN_C 55 | #endif // TINYFLOW_H_ 56 | -------------------------------------------------------------------------------- /src/gpu_op.cu: -------------------------------------------------------------------------------- 1 | #include "./c_runtime_api.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAX_THREADS_NUM 512 10 | #define MAX_BLOCKS_NUM 4096 11 | #define BLOCK_NUM(count) min(((count + MAX_THREADS_NUM - 1) / MAX_THREADS_NUM), MAX_BLOCKS_NUM) 12 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 13 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ 14 | i += blockDim.x * gridDim.x) 15 | 16 | __global__ void matrix_array_set_kernel(int count, 17 | float *arr, 18 | float value) { 19 | CUDA_1D_KERNEL_LOOP(index, count) { 20 | arr[index] = value; 21 | } 22 | } 23 | 24 | __global__ void matrix_broadcast_to_kernel(int inputCount, float* inputArr, 25 | int outputCount, float* outputArr) { 26 | CUDA_1D_KERNEL_LOOP(index, outputCount) { 27 | outputArr[index] = inputArr[index % inputCount]; 28 | } 29 | } 30 | 31 | __global__ void matrix_reduce_sum_axis_zero_kernel(float* inputArr, 32 | int outputCount, float* outputArr, 33 | int zeroDim) { 34 | CUDA_1D_KERNEL_LOOP(index, outputCount) { 35 | float sum = 0; 36 | for (int i = 0; i < zeroDim; ++i) { 37 | sum += inputArr[index + i * outputCount]; 38 | } 39 | outputArr[index] = sum; 40 | } 41 | } 42 | 43 | __global__ void matrix_elementwise_add_kernel(float* matAData, float* matBData, 44 | float* outputData, int count) { 45 | CUDA_1D_KERNEL_LOOP(index, count) { 46 | outputData[index] = matAData[index] + matBData[index]; 47 | } 48 | } 49 | 50 | __global__ void matrix_elementwise_add_by_const_kernel(float* inputArr, float val, 51 | float* outputArr, int count) { 52 | CUDA_1D_KERNEL_LOOP(index, count) { 53 | outputArr[index] = inputArr[index] + val; 54 | } 55 | } 56 | 57 | __global__ void matrix_elementwise_multiply_kernel(float* matAData, float* matBData, 58 | float* outputData, int count) { 59 | CUDA_1D_KERNEL_LOOP(index, count) { 60 | outputData[index] = matAData[index] * matBData[index]; 61 | } 62 | } 63 | 64 | __global__ void matrix_elementwise_multipy_by_const_kernel(float* inputArr, float val, 65 | float* outputArr, int count) { 66 | CUDA_1D_KERNEL_LOOP(index, count) { 67 | outputArr[index] = inputArr[index] * val; 68 | } 69 | } 70 | 
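// The elementwise kernels above (and below) all share the same launch
// convention: at most MAX_BLOCKS_NUM blocks of MAX_THREADS_NUM threads, with
// CUDA_1D_KERNEL_LOOP acting as a grid-stride loop so arrays larger than
// blockDim.x * gridDim.x are still fully covered. A minimal host-side launch
// sketch (illustrative only; `count` and `dev_ptr` are hypothetical names,
// not part of this file):
//
//   int count = 1 << 20;                  // number of float elements
//   matrix_array_set_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
//       count, dev_ptr, 0.f);             // dev_ptr: a float* from cudaMalloc
//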
71 | __global__ void matrix_relu_kernel(float* inputArr, float* outputArr, int count) { 72 | CUDA_1D_KERNEL_LOOP(index, count) { 73 | outputArr[index] = inputArr[index]; 74 | if (inputArr[index] < 0) { 75 | outputArr[index] = 0.f; 76 | } 77 | } 78 | } 79 | 80 | __global__ void matrix_relu_gradient_kernel(const float* inputArr, const float* gradArr, 81 | float* outputArr, int count) { 82 | CUDA_1D_KERNEL_LOOP(index, count) { 83 | outputArr[index] = inputArr[index] > 0 ? gradArr[index] : 0; 84 | } 85 | } 86 | 87 | __global__ void matrix_softmax_kernel(int nRow, int nCol, float* inputArr, float* outputArr) { 88 | int y = blockIdx.x * blockDim.x + threadIdx.x; 89 | if (y >= nRow) return; 90 | 91 | float* input = inputArr + y * nCol; 92 | float* output = outputArr + y * nCol; 93 | 94 | float maxval = *input; 95 | for (int i = 1; i < nCol; ++i) { 96 | maxval = max(input[i], maxval); 97 | } 98 | float sum = 0; 99 | for (int i = 0; i < nCol; ++i) { 100 | sum += expf(input[i] - maxval); 101 | } 102 | for (int i = 0; i < nCol; ++i) { 103 | output[i] = expf(input[i] - maxval) / sum; 104 | } 105 | } 106 | 107 | /* all your GPU kernel code, e.g. matrix_softmax_cross_entropy_kernel */ 108 | 109 | // y = inputs[0], y_ = inputs[1] 110 | // np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 111 | __global__ void matrix_softmax_cross_entropy_kernel(int nrow, int ncol, 112 | const float *input_a, 113 | const float *input_b, 114 | float *output) { 115 | // Dynamic shared memory, size provided at kernel launch. 116 | extern __shared__ float loss_per_row[]; 117 | // Two dimensional thread blocks. 118 | int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + 119 | threadIdx.x; 120 | if (y >= nrow) { 121 | return; 122 | } 123 | input_a += y * ncol; 124 | input_b += y * ncol; 125 | float maxval = *input_a; 126 | // Find max for a row. 127 | for (int x = 1; x < ncol; ++x) { 128 | maxval = max(maxval, input_a[x]); 129 | } 130 | // Deduct by max for a row, and raise to exp. 131 | float sum = 0; 132 | for (int x = 0; x < ncol; ++x) { 133 | sum += exp(input_a[x] - maxval); 134 | } 135 | // Compute per-row loss. 136 | float loss = 0; 137 | for (int x = 0; x < ncol; ++x) { 138 | loss -= input_b[x] * log(exp(input_a[x] - maxval) / sum); 139 | } 140 | loss_per_row[y] = loss; 141 | __syncthreads(); 142 | // Compute reduce_mean across rows. 143 | float mean_loss = 0; 144 | // Use a single thread to reduce mean across rows. 
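  // This single-thread reduction is only correct because the host wrapper
  // below (DLGpuSoftmaxCrossEntropy) launches the kernel with exactly one
  // thread block and nrow * sizeof(float) bytes of dynamic shared memory,
  // so the __syncthreads() above has already made every row's loss visible
  // to thread (0, 0) before it averages them into output[0].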
145 |   if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
146 |     for (int i = 0; i < nrow; ++i) {
147 |       mean_loss += loss_per_row[i];
148 |     }
149 |     mean_loss /= nrow;
150 |     output[0] = mean_loss;
151 |   }
152 | }
153 | 
154 | int DLGpuArraySet(DLArrayHandle arr, float value) {
155 |   int count = 1;
156 |   for (int i = 0; i < arr->ndim; ++i) {
157 |     count *= arr->shape[i];
158 |   }
159 |   float *arr_data = (float *)arr->data;
160 |   matrix_array_set_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
161 |       count, arr_data, value);
162 |   return 0;
163 | }
164 | 
165 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output) {
166 |   assert(input->ndim + 1 == output->ndim);
167 |   int inputCount = 1, outputCount = output->shape[0];
168 |   for (int i = 0; i < input->ndim; ++i) {
169 |     assert(input->shape[i] == output->shape[i + 1]);
170 |     inputCount *= input->shape[i];
171 |     outputCount *= output->shape[i + 1];
172 |   }
173 |   float* inputArr = (float*) input->data;
174 |   float* outputArr = (float*) output->data;
175 |   matrix_broadcast_to_kernel<<<BLOCK_NUM(outputCount), MAX_THREADS_NUM>>>(
176 |       inputCount, inputArr, outputCount, outputArr);
177 |   return 0;
178 | }
179 | 
180 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output) {
181 |   assert(input->ndim == output->ndim + 1);
182 |   int zeroDim = input->shape[0], outputCount = 1;
183 |   for (int i = 0; i < output->ndim; ++i) {
184 |     assert(input->shape[i+1] == output->shape[i]);
185 |     outputCount *= output->shape[i];
186 |   }
187 |   float* inputArr = (float*) input->data;
188 |   float* outputArr = (float*) output->data;
189 |   matrix_reduce_sum_axis_zero_kernel<<<BLOCK_NUM(outputCount), MAX_THREADS_NUM>>>(
190 |       inputArr, outputCount, outputArr, zeroDim);
191 |   return 0;
192 | }
193 | 
194 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA,
195 |                               const DLArrayHandle matB, DLArrayHandle output) {
196 |   assert(matA->ndim == output->ndim);
197 |   assert(matB->ndim == output->ndim);
198 |   int count = 1;
199 |   for (int i = 0; i < matA->ndim; ++i) {
200 |     assert(matA->shape[i] == output->shape[i]);
201 |     assert(matB->shape[i] == output->shape[i]);
202 |     count *= matA->shape[i];
203 |   }
204 |   float* matAData = (float*) matA->data;
205 |   float* matBData = (float*) matB->data;
206 |   float* outputData = (float*) output->data;
207 |   matrix_elementwise_add_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
208 |       matAData, matBData, outputData, count);
209 |   return 0;
210 | }
211 | 
212 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val,
213 |                                      DLArrayHandle output) {
214 |   assert(input->ndim == output->ndim);
215 |   int count = 1;
216 |   for (int i = 0; i < input->ndim; ++i) {
217 |     assert(input->shape[i] == output->shape[i]);
218 |     count *= input->shape[i];
219 |   }
220 |   float* inputArr = (float*) input->data;
221 |   float* outputArr = (float*) output->data;
222 |   matrix_elementwise_add_by_const_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
223 |       inputArr, val, outputArr, count);
224 |   return 0;
225 | }
226 | 
227 | int DLGpuMatrixElementwiseMultiply(const DLArrayHandle matA,
228 |                                    const DLArrayHandle matB,
229 |                                    DLArrayHandle output) {
230 |   assert(matA->ndim == output->ndim);
231 |   assert(matB->ndim == output->ndim);
232 |   int count = 1;
233 |   for (int i = 0; i < matA->ndim; ++i) {
234 |     assert(matA->shape[i] == output->shape[i]);
235 |     assert(matB->shape[i] == output->shape[i]);
236 |     count *= matA->shape[i];
237 |   }
238 |   float* matAData = (float*) matA->data;
239 |   float* matBData = (float*) matB->data;
240 |   float* outputData = (float*) output->data;
241 |   matrix_elementwise_multiply_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
242 |       matAData, matBData, outputData, count);
243 |   return 0;
244 | }
245 | 
246 | int
DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val,
247 |                            DLArrayHandle output) {
248 |   assert(input->ndim == output->ndim);
249 |   int count = 1;
250 |   for (int i = 0; i < input->ndim; ++i) {
251 |     assert(input->shape[i] == output->shape[i]);
252 |     count *= input->shape[i];
253 |   }
254 |   float* inputArr = (float*) input->data;
255 |   float* outputArr = (float*) output->data;
256 |   matrix_elementwise_multipy_by_const_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
257 |       inputArr, val, outputArr, count);
258 |   return 0;
259 | }
260 | 
261 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA,
262 |                         const DLArrayHandle matB, bool transposeB,
263 |                         DLArrayHandle matC) {
264 |   // Hint: use cublas
265 |   // cublas assume matrix is column major
266 |   assert(matA->ndim == 2);
267 |   assert(matB->ndim == 2);
268 |   assert(matC->ndim == 2);
269 |   assert(matA->shape[transposeA ? 0 : 1] == matB->shape[transposeB ? 1 : 0]);
270 |   assert(matA->shape[transposeA ? 1 : 0] == matC->shape[0]);
271 |   assert(matB->shape[transposeB ? 0 : 1] == matC->shape[1]);
272 | 
273 |   cublasHandle_t handle;
274 |   cublasCreate(&handle);
275 |   const float* matAData = (const float*) matA->data;
276 |   const float* matBData = (const float*) matB->data;
277 |   float* matCData = (float*) matC->data;
278 |   float alpha = 1, beta = 0;
279 | 
280 |   cublasSgemm(handle,
281 |               (transposeB ? CUBLAS_OP_T : CUBLAS_OP_N),
282 |               (transposeA ? CUBLAS_OP_T : CUBLAS_OP_N),
283 |               (transposeB ? matB->shape[0] : matB->shape[1]),
284 |               (transposeA ? matA->shape[1] : matA->shape[0]),
285 |               (transposeB ? matB->shape[1] : matB->shape[0]),
286 |               &alpha,
287 |               matBData, matB->shape[1],
288 |               matAData, matA->shape[1],
289 |               &beta,
290 |               matCData, (transposeB ? matB->shape[0] : matB->shape[1]));
291 | 
292 |   return 0;
293 | }
294 | 
295 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output) {
296 |   assert(input->ndim == output->ndim);
297 |   int count = 1;
298 |   for (int i = 0; i < input->ndim; ++i) {
299 |     assert(input->shape[i] == output->shape[i]);
300 |     count *= input->shape[i];
301 |   }
302 |   float* inputArr = (float*) input->data;
303 |   float* outputArr = (float*) output->data;
304 |   matrix_relu_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
305 |       inputArr, outputArr, count);
306 |   return 0;
307 | }
308 | 
309 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad,
310 |                       DLArrayHandle output) {
311 |   assert(input->ndim == in_grad->ndim);
312 |   assert(input->ndim == output->ndim);
313 |   int count = 1;
314 |   for (int i = 0; i < input->ndim; ++i) {
315 |     assert(input->shape[i] == in_grad->shape[i]);
316 |     assert(input->shape[i] == output->shape[i]);
317 |     count *= input->shape[i];
318 |   }
319 |   const float* inputArr = (const float*) input->data;
320 |   const float* gradArr = (const float*) in_grad->data;
321 |   float* outputArr = (float*) output->data;
322 |   matrix_relu_gradient_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
323 |       inputArr, gradArr, outputArr, count);
324 |   return 0;
325 | }
326 | 
327 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output) {
328 |   assert(input->ndim == 2);
329 |   assert(output->ndim == 2);
330 |   assert(input->shape[0] == output->shape[0]);
331 |   assert(input->shape[1] == output->shape[1]);
332 | 
333 |   int nRow = input->shape[0];
334 |   int nCol = input->shape[1];
335 | 
336 |   dim3 block(MAX_THREADS_NUM);
337 |   dim3 grid((nRow + block.x - 1) / block.x);
338 | 
339 |   float* inputArr = (float*) input->data;
340 |   float* outputArr = (float*) output->data;
341 | 
342 |   matrix_softmax_kernel<<<grid, block>>>(nRow, nCol, inputArr, outputArr);
343 | 
344 |   return 0;
345 | }
346 | 
347 | int
DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 348 | const DLArrayHandle input_b, 349 | DLArrayHandle output) { 350 | assert(input_a->ndim == 2); 351 | assert(input_b->ndim == 2); 352 | assert(output->ndim == 1); 353 | assert(input_a->shape[0] == input_b->shape[0] && 354 | input_a->shape[1] == input_b->shape[1]); 355 | int nrow = input_a->shape[0]; 356 | // Maximum x- or y-dimension of a block = 1024 357 | // But we need 'nrow' shared memory, and max shared memory is 48KB. 358 | // Conservatively allow max 16KB shared memory. 359 | assert(nrow <= 1024 * 4); 360 | int ncol = input_a->shape[1]; 361 | const float *input_data_a = (const float *)input_a->data; 362 | const float *input_data_b = (const float *)input_b->data; 363 | float *output_data = (float *)output->data; 364 | dim3 threads; 365 | if (nrow <= 1024) { 366 | threads.x = nrow; 367 | } else { 368 | threads.x = 1024; 369 | threads.y = (nrow + 1023) / 1024; 370 | } 371 | // 1 block, each block with 'threads' number of threads with 'nrow' shared 372 | // memory size 373 | matrix_softmax_cross_entropy_kernel<<<1, threads, nrow * sizeof(float)>>>( 374 | nrow, ncol, input_data_a, input_data_b, output_data); 375 | return 0; 376 | } 377 | -------------------------------------------------------------------------------- /src/runtime_base.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file runtime_base.h 3 | * \brief Base of all C APIs 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 6 | #define TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include 10 | 11 | /*! \brief macro to guard beginning and end section of all functions */ 12 | #define API_BEGIN() try { 13 | /*! 14 | * \brief every function starts with API_BEGIN(), and finishes with API_END() 15 | * or API_END_HANDLE_ERROR 16 | */ 17 | #define API_END() \ 18 | } \ 19 | catch (std::runtime_error & _except_) { \ 20 | return TINYFLOWAPIHandleException(_except_); \ 21 | } \ 22 | return 0; 23 | 24 | /*! 25 | * \brief every function starts with API_BEGIN() and finishes with API_END() or 26 | * API_END_HANDLE_ERROR. The finally clause contains procedure to cleanup states 27 | * when an error happens. 28 | */ 29 | #define API_END_HANDLE_ERROR(Finalize) \ 30 | } \ 31 | catch (std::runtime_error & _except_) { \ 32 | Finalize; \ 33 | return TINYFLOWAPIHandleException(_except_); \ 34 | } \ 35 | return 0; 36 | 37 | /*! 
38 | * \brief handle exception throwed out 39 | * \param e the exception 40 | * \return the return value of API after exception is handled 41 | */ 42 | inline int TINYFLOWAPIHandleException(const std::runtime_error &e) { 43 | // TODO 44 | // TVMAPISetLastError(e.what()); 45 | return -1; 46 | } 47 | 48 | #endif // TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 49 | -------------------------------------------------------------------------------- /tests/autodiff_test.py: -------------------------------------------------------------------------------- 1 | from tinyflow import autodiff as ad 2 | import numpy as np 3 | 4 | 5 | def test_identity(): 6 | x2 = ad.Variable(name="x2") 7 | y = x2 8 | 9 | grad_x2, = ad.gradients(y, [x2]) 10 | 11 | executor = ad.Executor([y, grad_x2]) 12 | x2_val = 2 * np.ones(3) 13 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 14 | 15 | assert isinstance(y, ad.Node) 16 | assert np.array_equal(y_val, x2_val) 17 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 18 | 19 | 20 | def test_add_by_const(): 21 | x2 = ad.Variable(name="x2") 22 | y = 5 + x2 23 | 24 | grad_x2, = ad.gradients(y, [x2]) 25 | 26 | executor = ad.Executor([y, grad_x2]) 27 | x2_val = 2 * np.ones(3) 28 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 29 | 30 | assert isinstance(y, ad.Node) 31 | assert np.array_equal(y_val, x2_val + 5) 32 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 33 | 34 | 35 | def test_mul_by_const(): 36 | x2 = ad.Variable(name="x2") 37 | y = 5 * x2 38 | 39 | grad_x2, = ad.gradients(y, [x2]) 40 | 41 | executor = ad.Executor([y, grad_x2]) 42 | x2_val = 2 * np.ones(3) 43 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 44 | 45 | assert isinstance(y, ad.Node) 46 | assert np.array_equal(y_val, x2_val * 5) 47 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val) * 5) 48 | 49 | 50 | def test_add_two_vars(): 51 | x2 = ad.Variable(name="x2") 52 | x3 = ad.Variable(name="x3") 53 | y = x2 + x3 54 | 55 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 56 | 57 | executor = ad.Executor([y, grad_x2, grad_x3]) 58 | x2_val = 2 * np.ones(3) 59 | x3_val = 3 * np.ones(3) 60 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 61 | 62 | assert isinstance(y, ad.Node) 63 | assert np.array_equal(y_val, x2_val + x3_val) 64 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 65 | assert np.array_equal(grad_x3_val, np.ones_like(x3_val)) 66 | 67 | 68 | def test_mul_two_vars(): 69 | x2 = ad.Variable(name="x2") 70 | x3 = ad.Variable(name="x3") 71 | y = x2 * x3 72 | 73 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 74 | 75 | executor = ad.Executor([y, grad_x2, grad_x3]) 76 | x2_val = 2 * np.ones(3) 77 | x3_val = 3 * np.ones(3) 78 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 79 | 80 | assert isinstance(y, ad.Node) 81 | assert np.array_equal(y_val, x2_val * x3_val) 82 | assert np.array_equal(grad_x2_val, x3_val) 83 | assert np.array_equal(grad_x3_val, x2_val) 84 | 85 | 86 | def test_add_mul_mix_1(): 87 | x1 = ad.Variable(name="x1") 88 | x2 = ad.Variable(name="x2") 89 | x3 = ad.Variable(name="x3") 90 | y = x1 + x2 * x3 * x1 91 | 92 | grad_x1, grad_x2, grad_x3 = ad.gradients(y, [x1, x2, x3]) 93 | 94 | executor = ad.Executor([y, grad_x1, grad_x2, grad_x3]) 95 | x1_val = 1 * np.ones(3) 96 | x2_val = 2 * np.ones(3) 97 | x3_val = 3 * np.ones(3) 98 | y_val, grad_x1_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x1: x1_val, x2: x2_val, x3: x3_val}) 99 | 100 | assert isinstance(y, ad.Node) 101 | 
assert np.array_equal(y_val, x1_val + x2_val * x3_val) 102 | assert np.array_equal(grad_x1_val, np.ones_like(x1_val) + x2_val * x3_val) 103 | assert np.array_equal(grad_x2_val, x3_val * x1_val) 104 | assert np.array_equal(grad_x3_val, x2_val * x1_val) 105 | 106 | 107 | def test_add_mul_mix_2(): 108 | x1 = ad.Variable(name="x1") 109 | x2 = ad.Variable(name="x2") 110 | x3 = ad.Variable(name="x3") 111 | x4 = ad.Variable(name="x4") 112 | y = x1 + x2 * x3 * x4 113 | 114 | grad_x1, grad_x2, grad_x3, grad_x4 = ad.gradients(y, [x1, x2, x3, x4]) 115 | 116 | executor = ad.Executor([y, grad_x1, grad_x2, grad_x3, grad_x4]) 117 | x1_val = 1 * np.ones(3) 118 | x2_val = 2 * np.ones(3) 119 | x3_val = 3 * np.ones(3) 120 | x4_val = 4 * np.ones(3) 121 | y_val, grad_x1_val, grad_x2_val, grad_x3_val, grad_x4_val = executor.run( 122 | feed_dict={x1: x1_val, x2: x2_val, x3: x3_val, x4: x4_val} 123 | ) 124 | 125 | assert isinstance(y, ad.Node) 126 | assert np.array_equal(y_val, x1_val + x2_val * x3_val * x4_val) 127 | assert np.array_equal(grad_x1_val, np.ones_like(x1_val)) 128 | assert np.array_equal(grad_x2_val, x3_val * x4_val) 129 | assert np.array_equal(grad_x3_val, x2_val * x4_val) 130 | assert np.array_equal(grad_x4_val, x2_val * x3_val) 131 | 132 | 133 | def test_add_mul_mix_3(): 134 | x2 = ad.Variable(name="x2") 135 | x3 = ad.Variable(name="x3") 136 | z = x2 * x2 + x2 + x3 + 3 137 | y = z * z + x3 138 | 139 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 140 | 141 | executor = ad.Executor([y, grad_x2, grad_x3]) 142 | x2_val = 2 * np.ones(3) 143 | x3_val = 3 * np.ones(3) 144 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 145 | 146 | z_val = x2_val * x2_val + x2_val + x3_val + 3 147 | expected_yval = z_val * z_val + x3_val 148 | expected_grad_x2_val = 2 * (x2_val * x2_val + x2_val + x3_val + 3) * (2 * x2_val + 1) 149 | expected_grad_x3_val = 2 * (x2_val * x2_val + x2_val + x3_val + 3) + 1 150 | assert isinstance(y, ad.Node) 151 | assert np.array_equal(y_val, expected_yval) 152 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 153 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 154 | 155 | 156 | def test_grad_of_grad(): 157 | x2 = ad.Variable(name="x2") 158 | x3 = ad.Variable(name="x3") 159 | y = x2 * x2 + x2 * x3 160 | 161 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 162 | grad_x2_x2, grad_x2_x3 = ad.gradients(grad_x2, [x2, x3]) 163 | 164 | executor = ad.Executor([y, grad_x2, grad_x3, grad_x2_x2, grad_x2_x3]) 165 | x2_val = 2 * np.ones(3) 166 | x3_val = 3 * np.ones(3) 167 | y_val, grad_x2_val, grad_x3_val, grad_x2_x2_val, grad_x2_x3_val = executor.run( 168 | feed_dict={x2: x2_val, x3: x3_val} 169 | ) 170 | 171 | expected_yval = x2_val * x2_val + x2_val * x3_val 172 | expected_grad_x2_val = 2 * x2_val + x3_val 173 | expected_grad_x3_val = x2_val 174 | expected_grad_x2_x2_val = 2 * np.ones_like(x2_val) 175 | expected_grad_x2_x3_val = 1 * np.ones_like(x2_val) 176 | 177 | assert isinstance(y, ad.Node) 178 | assert np.array_equal(y_val, expected_yval) 179 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 180 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 181 | assert np.array_equal(grad_x2_x2_val, expected_grad_x2_x2_val) 182 | assert np.array_equal(grad_x2_x3_val, expected_grad_x2_x3_val) 183 | 184 | 185 | def test_matmul_two_vars(): 186 | x2 = ad.Variable(name="x2") 187 | x3 = ad.Variable(name="x3") 188 | y = ad.matmul_op(x2, x3) 189 | 190 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 191 | 192 | executor = ad.Executor([y, 
grad_x2, grad_x3]) 193 | x2_val = np.array([[1, 2], [3, 4], [5, 6]]) # 3x2 194 | x3_val = np.array([[7, 8, 9], [10, 11, 12]]) # 2x3 195 | 196 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 197 | 198 | expected_yval = np.matmul(x2_val, x3_val) 199 | expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(x3_val)) 200 | expected_grad_x3_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval)) 201 | 202 | assert isinstance(y, ad.Node) 203 | assert np.array_equal(y_val, expected_yval) 204 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 205 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 206 | 207 | 208 | def test_exp(): 209 | x1 = ad.Variable("x1") 210 | x2 = ad.exp_op(x1) 211 | x3 = x2 + 1 212 | x4 = x2 * x3 213 | 214 | x1_grad, = ad.gradients(x4, [x1]) 215 | 216 | executor = ad.Executor([x4]) 217 | x1_val = 1 218 | x4_val, x1_grad = executor.run(feed_dict={x1: x1_val}) 219 | print(x4_val) 220 | print(x1_grad) 221 | 222 | 223 | def test_exp_grad(): 224 | x = ad.Variable("x") 225 | y = ad.exp_op(x) 226 | 227 | x_grad, = ad.gradients(y, [x]) 228 | 229 | executor = ad.Executor([y, x_grad]) 230 | x_val = 1 231 | y_val, x_grad_val = executor.run(feed_dict={x: x_val}) 232 | print(y_val) 233 | print(x_grad_val) 234 | 235 | 236 | def test_lr(): 237 | W = ad.Variable(name="W") 238 | b = ad.Variable(name="b") 239 | X = ad.Variable(name="X") 240 | y_ = ad.Variable(name="y_") 241 | 242 | z = ad.matmul_op(X, W) + b 243 | loss = ad.sigmoidcrossentropy_op(z, y_) 244 | 245 | grad_W, grad_b = ad.gradients(loss, [W, b]) -------------------------------------------------------------------------------- /tests/mnist_dlsys.py: -------------------------------------------------------------------------------- 1 | from tinyflow import autodiff as ad 2 | from tinyflow import ndarray, gpu_op 3 | import numpy as np 4 | 5 | import argparse 6 | import six.moves.cPickle as pickle 7 | import gzip 8 | import os 9 | 10 | 11 | def load_mnist_data(dataset): 12 | """ Load the dataset 13 | Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py 14 | 15 | :type dataset: string 16 | :param dataset: the path to the dataset (here MNIST) 17 | """ 18 | # Download the MNIST dataset if it is not present 19 | data_dir, data_file = os.path.split(dataset) 20 | if data_dir == "" and not os.path.isfile(dataset): 21 | # Check if dataset is in the data directory. 22 | new_path = os.path.join( 23 | os.path.split(__file__)[0], 24 | dataset 25 | ) 26 | if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': 27 | dataset = new_path 28 | 29 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 30 | from six.moves import urllib 31 | origin = ( 32 | 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 33 | ) 34 | print('Downloading data from %s' % origin) 35 | urllib.request.urlretrieve(origin, dataset) 36 | 37 | print('Loading data...') 38 | 39 | # Load the dataset 40 | with gzip.open(dataset, 'rb') as f: 41 | try: 42 | train_set, valid_set, test_set = pickle.load(f, encoding='latin1') 43 | except: 44 | train_set, valid_set, test_set = pickle.load(f) 45 | # train_set, valid_set, test_set format: tuple(input, target) 46 | # input is a numpy.ndarray of 2 dimensions (a matrix), np.float32 47 | # where each row corresponds to an example. target is a 48 | # numpy.ndarray of 1 dimension (vector), np.int64 that has the same length 49 | # as the number of rows in the input. 
It should give the target 50 | # to the example with the same index in the input. 51 | return train_set, valid_set, test_set 52 | 53 | 54 | def convert_to_one_hot(vals): 55 | """Helper method to convert label array to one-hot array.""" 56 | one_hot_vals = np.zeros((vals.size, vals.max()+1)) 57 | one_hot_vals[np.arange(vals.size), vals] = 1 58 | return one_hot_vals 59 | 60 | 61 | def sgd_update_gpu(param, grad_param, learning_rate): 62 | """Helper GPU SGD update method. Avoids copying NDArray to cpu.""" 63 | assert isinstance(param, ndarray.NDArray) 64 | assert isinstance(grad_param, ndarray.NDArray) 65 | gpu_op.matrix_elementwise_multiply_by_const( 66 | grad_param, -learning_rate, grad_param) 67 | gpu_op.matrix_elementwise_add(param, grad_param, param) 68 | 69 | 70 | def mnist_logreg(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False): 71 | print("Build logistic regression model...") 72 | 73 | W1 = ad.Variable(name="W1") 74 | b1 = ad.Variable(name="b1") 75 | X = ad.Variable(name="X") 76 | y_ = ad.Variable(name="y_") 77 | 78 | z1 = ad.matmul_op(X, W1) 79 | y = z1 + ad.broadcastto_op(b1, z1) 80 | 81 | loss = ad.softmaxcrossentropy_op(y, y_) 82 | 83 | grad_W1, grad_b1 = ad.gradients(loss, [W1, b1]) 84 | executor = ad.Executor([loss, grad_W1, grad_b1, y], ctx=executor_ctx) 85 | 86 | # Read input data 87 | datasets = load_mnist_data("mnist.pkl.gz") 88 | train_set_x, train_set_y = datasets[0] 89 | valid_set_x, valid_set_y = datasets[1] 90 | test_set_x, test_set_y = datasets[2] 91 | 92 | # Set up minibatch 93 | batch_size = 1000 94 | n_train_batches = train_set_x.shape[0] // batch_size 95 | n_valid_batches = valid_set_x.shape[0] // batch_size 96 | 97 | print("Start training loop...") 98 | 99 | # Initialize parameters 100 | W1_val = np.zeros((784, 10)) 101 | b1_val = np.zeros((10)) 102 | X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 103 | y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 104 | valid_X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 105 | valid_y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 106 | if ndarray.is_gpu_ctx(executor_ctx): 107 | W1_val = ndarray.array(W1_val, ctx=executor_ctx) 108 | b1_val = ndarray.array(b1_val, ctx=executor_ctx) 109 | X_val = ndarray.array(X_val, ctx=executor_ctx) 110 | y_val = ndarray.array(y_val, ctx=executor_ctx) 111 | 112 | lr = 1e-3 113 | for i in range(num_epochs): 114 | print("epoch %d" % i) 115 | for minibatch_index in range(n_train_batches): 116 | minibatch_start = minibatch_index * batch_size 117 | minibatch_end = (minibatch_index + 1) * batch_size 118 | X_val[:] = train_set_x[minibatch_start:minibatch_end] 119 | y_val[:] = convert_to_one_hot( 120 | train_set_y[minibatch_start:minibatch_end]) 121 | loss_val, grad_W1_val, grad_b1_val, _ = executor.run( 122 | feed_dict = {X: X_val, y_: y_val, W1: W1_val, b1: b1_val}) 123 | # SGD update 124 | if (executor_ctx is None): 125 | W1_val = W1_val - lr * grad_W1_val 126 | b1_val = b1_val - lr * grad_b1_val 127 | else: 128 | sgd_update_gpu(W1_val, grad_W1_val, lr) 129 | sgd_update_gpu(b1_val, grad_b1_val, lr) 130 | if print_loss_val_each_epoch: 131 | if isinstance(loss_val, ndarray.NDArray): 132 | print(loss_val.asnumpy()) 133 | else: 134 | print(loss_val) 135 | 136 | correct_predictions = [] 137 | for minibatch_index in range(n_valid_batches): 138 | minibatch_start = minibatch_index * batch_size 139 | minibatch_end = (minibatch_index + 1) * batch_size 140 | valid_X_val[:] = valid_set_x[minibatch_start:minibatch_end] 141 | valid_y_val[:] = 
convert_to_one_hot( 142 | valid_set_y[minibatch_start:minibatch_end]) 143 | _, _, _, valid_y_predicted = executor.run( 144 | feed_dict={ 145 | X: valid_X_val, 146 | y_: valid_y_val, 147 | W1: W1_val, 148 | b1: b1_val}, 149 | convert_to_numpy_ret_vals=True) 150 | correct_prediction = np.equal( 151 | np.argmax(valid_y_val, 1), 152 | np.argmax(valid_y_predicted, 1)).astype(np.float) 153 | correct_predictions.extend(correct_prediction) 154 | accuracy = np.mean(correct_predictions) 155 | # validation set accuracy=0.928200 156 | print("validation set accuracy=%f" % accuracy) 157 | 158 | 159 | def mnist_mlp(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False): 160 | print("Build 3-layer MLP model...") 161 | 162 | W1 = ad.Variable(name="W1") 163 | W2 = ad.Variable(name="W2") 164 | W3 = ad.Variable(name="W3") 165 | b1 = ad.Variable(name="b1") 166 | b2 = ad.Variable(name="b2") 167 | b3 = ad.Variable(name="b3") 168 | X = ad.Variable(name="X") 169 | y_ = ad.Variable(name="y_") 170 | 171 | # relu(X W1+b1) 172 | z1 = ad.matmul_op(X, W1) 173 | z2 = z1 + ad.broadcastto_op(b1, z1) 174 | z3 = ad.relu_op(z2) 175 | 176 | # relu(z3 W2+b2) 177 | z4 = ad.matmul_op(z3, W2) 178 | z5 = z4 + ad.broadcastto_op(b2, z4) 179 | z6 = ad.relu_op(z5) 180 | 181 | # softmax(z5 W2+b2) 182 | z7 = ad.matmul_op(z6, W3) 183 | y = z7 + ad.broadcastto_op(b3, z7) 184 | 185 | loss = ad.softmaxcrossentropy_op(y, y_) 186 | 187 | grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3 = ad.gradients( 188 | loss, [W1, W2, W3, b1, b2, b3]) 189 | executor = ad.Executor( 190 | [loss, grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3, y], 191 | ctx=executor_ctx) 192 | 193 | # Read input data 194 | datasets = load_mnist_data("mnist.pkl.gz") 195 | train_set_x, train_set_y = datasets[0] 196 | valid_set_x, valid_set_y = datasets[1] 197 | test_set_x, test_set_y = datasets[2] 198 | # Set up minibatch 199 | batch_size = 1000 200 | n_train_batches = train_set_x.shape[0] // batch_size 201 | n_valid_batches = valid_set_x.shape[0] // batch_size 202 | 203 | print("Start training loop...") 204 | 205 | # Initialize parameters 206 | rand = np.random.RandomState(seed=123) 207 | W1_val = rand.normal(scale=0.1, size=(784, 256)) 208 | W2_val = rand.normal(scale=0.1, size=(256, 100)) 209 | W3_val = rand.normal(scale=0.1, size=(100, 10)) 210 | b1_val = rand.normal(scale=0.1, size=(256)) 211 | b2_val = rand.normal(scale=0.1, size=(100)) 212 | b3_val = rand.normal(scale=0.1, size=(10)) 213 | X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 214 | y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 215 | valid_X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 216 | valid_y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 217 | if ndarray.is_gpu_ctx(executor_ctx): 218 | W1_val = ndarray.array(W1_val, ctx=executor_ctx) 219 | W2_val = ndarray.array(W2_val, ctx=executor_ctx) 220 | W3_val = ndarray.array(W3_val, ctx=executor_ctx) 221 | b1_val = ndarray.array(b1_val, ctx=executor_ctx) 222 | b2_val = ndarray.array(b2_val, ctx=executor_ctx) 223 | b3_val = ndarray.array(b3_val, ctx=executor_ctx) 224 | X_val = ndarray.array(X_val, ctx=executor_ctx) 225 | y_val = ndarray.array(y_val, ctx=executor_ctx) 226 | 227 | lr = 1.0e-3 228 | for i in range(num_epochs): 229 | print("epoch %d" % i) 230 | for minibatch_index in range(n_train_batches): 231 | minibatch_start = minibatch_index * batch_size 232 | minibatch_end = (minibatch_index + 1) * batch_size 233 | X_val[:] = train_set_x[minibatch_start:minibatch_end] 234 | y_val[:] = 
convert_to_one_hot( 235 | train_set_y[minibatch_start:minibatch_end]) 236 | loss_val, grad_W1_val, grad_W2_val, grad_W3_val, \ 237 | grad_b1_val, grad_b2_val, grad_b3_val, _ = executor.run( 238 | feed_dict={ 239 | X: X_val, 240 | y_: y_val, 241 | W1: W1_val, 242 | W2: W2_val, 243 | W3: W3_val, 244 | b1: b1_val, 245 | b2: b2_val, 246 | b3: b3_val}) 247 | # SGD update 248 | if (executor_ctx is None): 249 | W1_val = W1_val - lr * grad_W1_val 250 | W2_val = W2_val - lr * grad_W2_val 251 | W3_val = W3_val - lr * grad_W3_val 252 | b1_val = b1_val - lr * grad_b1_val 253 | b2_val = b2_val - lr * grad_b2_val 254 | b3_val = b3_val - lr * grad_b3_val 255 | else: 256 | sgd_update_gpu(W1_val, grad_W1_val, lr) 257 | sgd_update_gpu(W2_val, grad_W2_val, lr) 258 | sgd_update_gpu(W3_val, grad_W3_val, lr) 259 | sgd_update_gpu(b1_val, grad_b1_val, lr) 260 | sgd_update_gpu(b2_val, grad_b2_val, lr) 261 | sgd_update_gpu(b3_val, grad_b3_val, lr) 262 | if print_loss_val_each_epoch: 263 | if isinstance(loss_val, ndarray.NDArray): 264 | print(loss_val.asnumpy()) 265 | else: 266 | print(loss_val) 267 | 268 | correct_predictions = [] 269 | for minibatch_index in range(n_valid_batches): 270 | minibatch_start = minibatch_index * batch_size 271 | minibatch_end = (minibatch_index + 1) * batch_size 272 | valid_X_val[:] = valid_set_x[minibatch_start:minibatch_end] 273 | valid_y_val[:] = convert_to_one_hot( 274 | valid_set_y[minibatch_start:minibatch_end]) 275 | _, _, _, _, _, _, _, valid_y_predicted = executor.run( 276 | feed_dict={ 277 | X: valid_X_val, 278 | y_: valid_y_val, 279 | W1: W1_val, 280 | W2: W2_val, 281 | W3: W3_val, 282 | b1: b1_val, 283 | b2: b2_val, 284 | b3: b3_val}, 285 | convert_to_numpy_ret_vals=True) 286 | correct_prediction = np.equal( 287 | np.argmax(valid_y_val, 1), 288 | np.argmax(valid_y_predicted, 1)).astype(np.float) 289 | correct_predictions.extend(correct_prediction) 290 | accuracy = np.mean(correct_predictions) 291 | # validation set accuracy=0.970800 292 | print("validation set accuracy=%f" % accuracy) 293 | 294 | 295 | if __name__ == "__main__": 296 | parser = argparse.ArgumentParser() 297 | parser.add_argument( 298 | "-m", "--model", 299 | help="Choose model: all, logreg, mlp", default="all") 300 | parser.add_argument( 301 | "-c", "--executor_context", 302 | help="Choose executor context: numpy, gpu", default="numpy") 303 | parser.add_argument( 304 | "-e", "--num_epoch", 305 | help="Provide number of epochs to train.", type=int, default=20) 306 | parser.add_argument( 307 | "-l", "--print_loss_val_each_epoch", 308 | help="Print loss value at the end of each epoch", action="store_true") 309 | args = parser.parse_args() 310 | 311 | models = [] 312 | executor_ctx = None 313 | print_loss_val_each_epoch = False 314 | if args.model == "logreg": 315 | models = [mnist_logreg] 316 | elif args.model == "mlp": 317 | models = [mnist_mlp] 318 | elif args.model == "all": 319 | models = [mnist_logreg, mnist_mlp] 320 | 321 | if args.executor_context == "numpy": 322 | executor_ctx = None 323 | elif args.executor_context == "gpu": 324 | # Assume only use gpu 0. 
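        # ndarray.gpu(0) is expected to produce a DLContext with device_id=0 and
        # device_type=kGPU (see src/dlarray.h); running with "-c gpu" therefore
        # assumes build/lib/libc_runtime_api.so has been compiled and a CUDA
        # device is visible.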
325 | executor_ctx = ndarray.gpu(0) 326 | 327 | if args.print_loss_val_each_epoch: 328 | print_loss_val_each_epoch = True 329 | 330 | num_epochs = args.num_epoch 331 | for m in models: 332 | import time 333 | tic = time.time() 334 | m(executor_ctx, num_epochs, print_loss_val_each_epoch) 335 | toc = time.time() 336 | print("mode use time: " + str(toc - tic)) 337 | -------------------------------------------------------------------------------- /tests/test_gpu_op.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tinyflow import ndarray, gpu_op, autodiff 3 | 4 | 5 | def test_array_set(): 6 | ctx = ndarray.gpu(0) 7 | shape = (500, 200) 8 | # oneslike 9 | arr_x = ndarray.empty(shape, ctx=ctx) 10 | gpu_op.array_set(arr_x, 1.) 11 | x = arr_x.asnumpy() 12 | np.testing.assert_allclose(np.ones(shape), x) 13 | # zeroslike 14 | gpu_op.array_set(arr_x, 0.) 15 | x = arr_x.asnumpy() 16 | np.testing.assert_allclose(np.zeros(shape), x) 17 | 18 | 19 | def test_broadcast_to(): 20 | ctx = ndarray.gpu(0) 21 | shape = (200, 300) 22 | to_shape = (130, 200, 300) 23 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 24 | arr_x = ndarray.array(x, ctx=ctx) 25 | arr_y = ndarray.empty(to_shape, ctx=ctx) 26 | gpu_op.broadcast_to(arr_x, arr_y) 27 | y = arr_y.asnumpy() 28 | np.testing.assert_allclose(np.broadcast_to(x, to_shape), y) 29 | 30 | 31 | def test_reduce_sum_axis_zero(): 32 | ctx = ndarray.gpu(0) 33 | shape = (500, 200, 100) 34 | to_shape = (200, 100) 35 | x = np.random.uniform(0, 20, shape).astype(np.float32) 36 | arr_x = ndarray.array(x, ctx=ctx) 37 | arr_y = ndarray.empty(to_shape, ctx=ctx) 38 | gpu_op.reduce_sum_axis_zero(arr_x, arr_y) 39 | y = arr_y.asnumpy() 40 | y_ = np.sum(x, axis=0) 41 | for index, _ in np.ndenumerate(y): 42 | v = y[index] 43 | v_ = y_[index] 44 | if abs((v - v_) / v_) > 1e-4: 45 | print(index, v, v_) 46 | np.testing.assert_allclose(np.sum(x, axis=0), y, rtol=1e-5) 47 | 48 | 49 | def test_matrix_elementwise_add(): 50 | ctx = ndarray.gpu(0) 51 | shape = (500, 200) 52 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 53 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 54 | arr_x = ndarray.array(x, ctx=ctx) 55 | arr_y = ndarray.array(y, ctx=ctx) 56 | arr_z = ndarray.empty(shape, ctx=ctx) 57 | gpu_op.matrix_elementwise_add(arr_x, arr_y, arr_z) 58 | z = arr_z.asnumpy() 59 | np.testing.assert_allclose(x + y, z, rtol=1e-5) 60 | 61 | 62 | def test_matrix_elementwise_add_by_const(): 63 | shape = (2000, 3000) 64 | ctx = ndarray.gpu(0) 65 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 66 | val = np.random.uniform(-5, 5) 67 | arr_x = ndarray.array(x, ctx=ctx) 68 | arr_y = ndarray.empty(shape, ctx=ctx) 69 | gpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_y) 70 | y = arr_y.asnumpy() 71 | np.testing.assert_allclose(x + val, y, rtol=1e-5) 72 | 73 | 74 | def test_matrix_elementwise_multiply(): 75 | ctx = ndarray.gpu(0) 76 | shape = (500, 200) 77 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 78 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 79 | arr_x = ndarray.array(x, ctx=ctx) 80 | arr_y = ndarray.array(y, ctx=ctx) 81 | arr_z = ndarray.empty(shape, ctx=ctx) 82 | gpu_op.matrix_elementwise_multiply(arr_x, arr_y, arr_z) 83 | z = arr_z.asnumpy() 84 | np.testing.assert_allclose(x * y, z, rtol=1e-5) 85 | 86 | 87 | def test_matrix_elementwise_multiply_by_const(): 88 | shape = (2000, 3000) 89 | ctx = ndarray.gpu(0) 90 | x = np.random.uniform(0, 10, 
size=shape).astype(np.float32) 91 | val = np.random.uniform(-5, 5) 92 | arr_x = ndarray.array(x, ctx=ctx) 93 | arr_y = ndarray.empty(shape, ctx=ctx) 94 | gpu_op.matrix_elementwise_multiply_by_const(arr_x, val, arr_y) 95 | y = arr_y.asnumpy() 96 | np.testing.assert_allclose(x * val, y, rtol=1e-5) 97 | 98 | 99 | def test_matrix_multiply(): 100 | ctx = ndarray.gpu(0) 101 | x = np.random.uniform(0, 10, size=(500, 700)).astype(np.float32) 102 | y = np.random.uniform(0, 10, size=(700, 1000)).astype(np.float32) 103 | arr_x = ndarray.array(x, ctx=ctx) 104 | arr_y = ndarray.array(y, ctx=ctx) 105 | arr_z = ndarray.empty((500, 1000), ctx=ctx) 106 | gpu_op.matrix_multiply(arr_x, False, arr_y, False, arr_z) 107 | z = arr_z.asnumpy() 108 | np.testing.assert_allclose(np.dot(x, y), z, rtol=1e-5) 109 | 110 | x = np.random.uniform(0, 10, size=(1000, 500)).astype(np.float32) 111 | y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32) 112 | arr_x = ndarray.array(x, ctx=ctx) 113 | arr_y = ndarray.array(y, ctx=ctx) 114 | arr_z = ndarray.empty((1000, 2000), ctx=ctx) 115 | gpu_op.matrix_multiply(arr_x, False, arr_y, True, arr_z) 116 | z = arr_z.asnumpy() 117 | np.testing.assert_allclose(np.dot(x, np.transpose(y)), z, rtol=1e-5) 118 | 119 | x = np.random.uniform(0, 10, size=(500, 1000)).astype(np.float32) 120 | y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32) 121 | arr_x = ndarray.array(x, ctx=ctx) 122 | arr_y = ndarray.array(y, ctx=ctx) 123 | arr_z = ndarray.empty((1000, 2000), ctx=ctx) 124 | gpu_op.matrix_multiply(arr_x, True, arr_y, True, arr_z) 125 | z = arr_z.asnumpy() 126 | np.testing.assert_allclose(np.dot(np.transpose(x), np.transpose(y)), z, 127 | rtol=1e-5) 128 | 129 | 130 | def test_relu(): 131 | shape = (2000, 2500) 132 | ctx = ndarray.gpu(0) 133 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 134 | arr_x = ndarray.array(x, ctx=ctx) 135 | arr_y = ndarray.empty(shape, ctx=ctx) 136 | gpu_op.relu(arr_x, arr_y) 137 | y = arr_y.asnumpy() 138 | np.testing.assert_allclose(np.maximum(x, 0).astype(np.float32), y) 139 | 140 | 141 | def test_relu_gradient(): 142 | shape = (2000, 2500) 143 | ctx = ndarray.gpu(0) 144 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 145 | grad_x = np.random.uniform(-5, 5, shape).astype(np.float32) 146 | arr_x = ndarray.array(x, ctx=ctx) 147 | arr_grad_x = ndarray.array(grad_x, ctx=ctx) 148 | arr_y = ndarray.empty(shape, ctx=ctx) 149 | gpu_op.relu_gradient(arr_x, arr_grad_x, arr_y) 150 | y = arr_y.asnumpy() 151 | np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y) 152 | 153 | 154 | def test_softmax(): 155 | ctx = ndarray.gpu(0) 156 | shape = (400, 1000) 157 | x = np.random.uniform(-5, 5, shape).astype(np.float32) 158 | arr_x = ndarray.array(x, ctx=ctx) 159 | arr_y = ndarray.empty(shape, ctx=ctx) 160 | gpu_op.softmax(arr_x, arr_y) 161 | y = arr_y.asnumpy() 162 | np.testing.assert_allclose(autodiff.softmax_func(x), y, rtol=1e-5) 163 | 164 | 165 | def test_softmax_cross_entropy(): 166 | ctx = ndarray.gpu(0) 167 | shape = (400, 1000) 168 | y = np.random.uniform(-5, 5, shape).astype(np.float32) 169 | y_ = np.random.uniform(-5, 5, shape).astype(np.float32) 170 | arr_y = ndarray.array(y, ctx=ctx) 171 | arr_y_ = ndarray.array(y_, ctx=ctx) 172 | arr_out = ndarray.empty((1,), ctx=ctx) 173 | gpu_op.softmax_cross_entropy(arr_y, arr_y_, arr_out) 174 | out = arr_out.asnumpy() 175 | # numpy calculation 176 | cross_entropy = np.mean( 177 | -np.sum(y_ * np.log(autodiff.softmax_func(y)), axis=1), keepdims=True) 178 | 
np.testing.assert_allclose(cross_entropy, out, rtol=1e-5) 179 | --------------------------------------------------------------------------------
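One possible way to sanity-check a build is to run the test files above with pytest. This assumes pytest is installed, that the `python/` directory is on `PYTHONPATH`, and, for the GPU tests, that `build/lib/libc_runtime_api.so` has been compiled and a CUDA device is visible:

```shell
export PYTHONPATH="$(pwd)/python:${PYTHONPATH}"
python -m pytest tests/autodiff_test.py   # CPU-only autodiff checks
python -m pytest tests/test_gpu_op.py     # exercises the GPU kernels directly
```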