├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── Makefile ├── README.md ├── python └── tinyflow │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── _base.cpython-36.pyc │ ├── autodiff.cpython-36.pyc │ ├── gpu_op.cpython-36.pyc │ └── ndarray.cpython-36.pyc │ ├── _base.py │ ├── autodiff.py │ ├── gpu_op.py │ └── ndarray.py ├── src ├── c_runtime_api.cc ├── c_runtime_api.h ├── cpu_device_api.cc ├── cpu_device_api.h ├── cuda_device_api.cc ├── cuda_device_api.h ├── device_api.h ├── dlarray.h ├── gpu_op.cu └── runtime_base.h └── tests ├── autodiff_test.py ├── mnist_dlsys.py └── test_gpu_op.py /.gitignore: -------------------------------------------------------------------------------- 1 | # build directories 2 | build 3 | cmake-build-debug 4 | .pytest_cache 5 | .idea 6 | 7 | *.gz -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 3.13) 2 | PROJECT(tinyflow LANGUAGES C CXX) 3 | FIND_PACKAGE(CUDA REQUIRED) 4 | SET(CUDA_DIR /usr/local/cuda) 5 | 6 | FILE(GLOB CC_SRCS "src/*.cc") 7 | FILE(GLOB CUDA_SRCS "src/*.cu") 8 | FILE(GLOB HEAD_FILES_DIR "src") 9 | 10 | INCLUDE_DIRECTORIES(${CUDA_DIR}/include) 11 | 12 | SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wall -Wfatal-errors -Wno-unused -Wno-unused-result") 13 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") 14 | SET(ARCH "-gencode arch=compute_30,code=sm_30 15 | -gencode arch=compute_35,code=sm_35 16 | -gencode arch=compute_50,code=[sm_50,compute_50] 17 | -gencode arch=compute_52,code=[sm_52,compute_52]") 18 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11 --compiler-options '-fPIC' ${ARCH}") 19 | 20 | LINK_DIRECTORIES(${CUDA_DIR}/lib64) 21 | CUDA_ADD_LIBRARY(c_runtime_api SHARED ${CC_SRCS} ${CUDA_SRCS}) 22 | TARGET_LINK_LIBRARIES(c_runtime_api -lcuda -lcudart -lcublas) 23 | 24 | INSTALL(TARGETS c_runtime_api LIBRARY DESTINATION ${PROJECT_SOURCE_DIR}/build/lib) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Yu Liebing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
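A minimal out-of-source build with the CMakeLists.txt above might look like the following sketch (run from the repository root; it assumes CUDA is installed under `/usr/local/cuda` and that `python3` is on the PATH; the `INSTALL` rule copies the shared library into `build/lib` inside the source tree):

```shell
mkdir build && cd build
cmake ..
make
make install        # copies libc_runtime_api.so into <repo>/build/lib
cd ..
# sanity check: this is the library that python/tinyflow/_base.py loads via ctypes
python3 -c "import ctypes; ctypes.CDLL('build/lib/libc_runtime_api.so', ctypes.RTLD_GLOBAL)"
```

The plain Makefile in the next file is an equivalent alternative that produces the same `build/lib/libc_runtime_api.so` without CMake.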
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CUDA_DIR = /usr/local/cuda 2 | 3 | CC_SRCS := $(wildcard src/*.cc) 4 | CC_OBJS := ${CC_SRCS:src/%.cc=build/obj/%.o} 5 | CUDA_SRCS := $(wildcard src/*.cu) 6 | CUDA_OBJS := ${CUDA_SRCS:src/%.cu=build/obj/%.o} 7 | OBJS := $(CC_OBJS) $(CUDA_OBJS) 8 | 9 | CC = g++ 10 | WARNINGS = -Wall -Wfatal-errors -Wno-unused -Wno-unused-result 11 | CC_FLAGS = -std=c++11 -fPIC $(WARNINGS) -I$(CUDA_DIR)/include 12 | LD_FLAGS = -L$(CUDA_DIR)/lib64 -lcuda -lcudart -lcublas 13 | 14 | NVCC = nvcc 15 | NVCC_FLAGS = -std=c++11 --compiler-options '-fPIC' 16 | ARCH = -gencode arch=compute_30,code=sm_30 \ 17 | -gencode arch=compute_35,code=sm_35 \ 18 | -gencode arch=compute_50,code=[sm_50,compute_50] \ 19 | -gencode arch=compute_52,code=[sm_52,compute_52] 20 | 21 | all: build/lib/libc_runtime_api.so 22 | 23 | build/lib/libc_runtime_api.so: $(OBJS) 24 | @mkdir -p build/lib 25 | $(CC) -shared $^ -o $@ $(LD_FLAGS) 26 | 27 | build/obj/%.o: src/%.cc 28 | @mkdir -p build/obj 29 | $(CC) $(CC_FLAGS) -c $< -o $@ 30 | 31 | build/obj/%.o: src/%.cu 32 | @mkdir -p build/obj 33 | $(NVCC) $(ARCH) $(NVCC_FLAGS) -c $< -o $@ 34 | 35 | clean: 36 | rm -rf build 37 | 38 | .PHONY: clean 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Tinyflow is a simple deep learning framework for learning purposes. It supports automatic 2 | differentiation and GPU acceleration. Tinyflow currently provides all the operators needed 3 | to build multilayer perceptron (MLP) models. 4 | 5 | If you want to learn more about the principles behind Tinyflow, the following two blog posts may provide helpful intuition. 6 | + [Automatic Differentiation Based on Computation Graph](https://liebing.org.cn/automatic-differentiation.html) 7 | + [Tinyflow - A Simple Neural Network Framework](https://liebing.org.cn/tinyflow.html) 8 | 9 | # Install 10 | Tinyflow currently only supports running in a 64-bit Linux environment. Requirements: 11 | + gcc >= 4.8; 12 | + cmake >= 3.13 (if you choose to use cmake); 13 | + CUDA 9.0; 14 | + Python 3. 15 | 16 | Download the source code. 17 | ```shell 18 | git clone https://github.com/LB-Yu/tinyflow.git 19 | ``` 20 | 21 | CUDA is usually installed in `/usr/local/cuda`. 22 | If your installation path is different, modify the `CUDA_DIR` variable on the first 23 | line of the Makefile, or the `CUDA_DIR` variable on the 24 | fourth line of CMakeLists.txt, to point to your installation path. 25 | 26 | To compile with the Makefile: 27 | ```shell 28 | cd tinyflow 29 | make 30 | ``` 31 | 32 | To compile with CMake: 33 | ```shell 34 | cd tinyflow 35 | mkdir build && cd build 36 | cmake .. 37 | make 38 | make install 39 | ``` 40 | 41 | # Run the MNIST Example 42 | After compiling the GPU library, we can train an MLP on the MNIST dataset.
43 | ```shell 44 | export PYTHONPATH="/path/to/tinyflow/python:${PYTHONPATH}" 45 | 46 | # see cmd options with 47 | # python tests/mnist_dlsys.py -h 48 | 49 | # run logistic regression on numpy 50 | python tests/mnist_dlsys.py -l -m logreg -c numpy 51 | # run logistic regression on gpu 52 | python tests/mnist_dlsys.py -l -m logreg -c gpu 53 | # run MLP on numpy 54 | python tests/mnist_dlsys.py -l -m mlp -c numpy 55 | # run MLP on gpu 56 | python tests/mnist_dlsys.py -l -m mlp -c gpu 57 | ``` 58 | 59 | # Overview of Module 60 | - python/dlsys/autodiff.py: Implements computation graph, autodiff, GPU/Numpy Executor. 61 | - python/dlsys/gpu_op.py: Exposes Python function to call GPU kernels via ctypes. 62 | - python/dlsys/ndarray.py: Exposes Python GPU array API. 63 | 64 | - src/dlarray.h: header for GPU array. 65 | - src/c_runtime_api.h: C API header for GPU array and GPU kernels. 66 | - src/gpu_op.cu: cuda implementation of kernels 67 | -------------------------------------------------------------------------------- /python/tinyflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__init__.py -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/_base.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/_base.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/autodiff.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/autodiff.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/gpu_op.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/gpu_op.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/__pycache__/ndarray.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LB-Yu/tinyflow/772669a983f18d78b48fd9108d0fcbc942487247/python/tinyflow/__pycache__/ndarray.cpython-36.pyc -------------------------------------------------------------------------------- /python/tinyflow/_base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # pylint: disable=invalid-name 3 | """ ctypes library of dlsys and helper functions """ 4 | from __future__ import absolute_import 5 | 6 | import os 7 | import ctypes 8 | 9 | 10 | def _load_lib(): 11 | """Load libary in build/lib.""" 12 | curr_path = 
os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) 13 | lib_path = os.path.join(curr_path, '../../build/lib/') 14 | path_to_so_file = os.path.join(lib_path, "libc_runtime_api.so") 15 | lib = ctypes.CDLL(path_to_so_file, ctypes.RTLD_GLOBAL) 16 | return lib 17 | 18 | 19 | # global library instance 20 | _LIB = _load_lib() 21 | 22 | 23 | ################## 24 | # Helper Methods # 25 | ################## 26 | 27 | def check_call(ret): 28 | """Check the return value of C API call 29 | 30 | This function will crash when error occurs. 31 | Wrap every API call with this function 32 | 33 | Parameters 34 | ---------- 35 | ret : int 36 | return value from API calls 37 | """ 38 | assert(ret == 0) 39 | 40 | 41 | def c_array(ctype, values): 42 | """Create ctypes array from a python array 43 | 44 | Parameters 45 | ---------- 46 | ctype : ctypes data type 47 | data type of the array we want to convert to 48 | 49 | values : tuple or list 50 | data content 51 | 52 | Returns 53 | ------- 54 | out : ctypes array 55 | Created ctypes array 56 | """ 57 | return (ctype * len(values))(*values) 58 | -------------------------------------------------------------------------------- /python/tinyflow/autodiff.py: -------------------------------------------------------------------------------- 1 | """ library to take autodiff and execute a computation graph """ 2 | from __future__ import absolute_import 3 | 4 | import numpy as np 5 | from . import ndarray, gpu_op 6 | 7 | 8 | class Node(object): 9 | """Node in a computation graph.""" 10 | def __init__(self): 11 | """Constructor, new node is indirectly created by Op object call method. 12 | 13 | Instance variables 14 | ------------------ 15 | self.inputs: the list of input nodes. 16 | self.op: the associated op object, 17 | e.g. add_op if this node is created by adding two other nodes. 18 | self.const_attr: the add or multiply constant. 19 | e.g. self.const_attr=5 if this node is created by x+5. 20 | self.name: node name for debugging. 21 | """ 22 | self.inputs = [] 23 | self.op = None 24 | self.const_attr = None 25 | self.name = "" 26 | 27 | def __add__(self, other): 28 | """Adding two nodes return a new node.""" 29 | if isinstance(other, Node): 30 | new_node = add_op(self, other) 31 | else: 32 | # Add by a constant stores the constant in new node's const_attr 33 | # 'other' argument is a constant 34 | new_node = add_byconst_op(self, other) 35 | return new_node 36 | 37 | def __mul__(self, other): 38 | """Multiplying two nodes return a new node.""" 39 | if isinstance(other, Node): 40 | new_node = mul_op(self, other) 41 | else: 42 | # Mul by a constant stores the constant in new node's const_attr 43 | # 'other' argument is a constant 44 | new_node = mul_byconst_op(self, other) 45 | return new_node 46 | 47 | # Allow left-hand-side add and multiply. 48 | __radd__ = __add__ 49 | __rmul__ = __mul__ 50 | 51 | def __str__(self): 52 | """Allow print to display node name.""" 53 | return self.name 54 | 55 | 56 | def Variable(name): 57 | """User defined variables in an expression. 58 | e.g. x = Variable(name = "x") 59 | """ 60 | placeholder_node = placeholder_op() 61 | placeholder_node.name = name 62 | return placeholder_node 63 | 64 | 65 | class Op(object): 66 | """Op represents operations performed on nodes.""" 67 | def __call__(self): 68 | """Create a new node and associate the op object with the node. 69 | 70 | Returns 71 | ------- 72 | The new node object. 
73 | """ 74 | new_node = Node() 75 | new_node.op = self 76 | return new_node 77 | 78 | def compute(self, node, input_vals, output_val, use_numpy=True): 79 | """Given values of input nodes, compute the output value. 80 | 81 | Parameters 82 | ---------- 83 | node: node that performs the compute. 84 | input_vals: values of input nodes. 85 | output_val: output value of the node, modified in-place. 86 | use_numpy: bool flag whether to use numpy for compute 87 | """ 88 | raise NotImplementedError 89 | 90 | def gradient(self, node, output_grad): 91 | """Given output gradient, compute partial gradient to each input node. 92 | 93 | Parameters 94 | ---------- 95 | node: node that performs the gradient. 96 | output_grad: output gradient summed from children nodes' contributions 97 | 98 | Returns 99 | ------- 100 | A list of gradient contributions to each input node respectively. 101 | """ 102 | raise NotImplementedError 103 | 104 | def infer_shape(self, node, input_shapes): 105 | """Given shapes of input nodes, compute shape of output node. 106 | 107 | Implementation note: 108 | It's simpler to treat shape of constants as (1,), so that constants can 109 | be stored as a numpy array too and you would need fewer special case 110 | handling. 111 | 112 | Parameters 113 | ---------- 114 | node: node whose shape is being inferred. 115 | input_shapes: shapes of input nodes. 116 | 117 | Returns 118 | ------- 119 | A tuple representing the shape of output node. 120 | """ 121 | raise NotImplementedError 122 | 123 | 124 | class AddOp(Op): 125 | def __call__(self, node_A, node_B): 126 | new_node = Op.__call__(self) 127 | new_node.inputs = [node_A, node_B] 128 | new_node.name = "(%s+%s)" % (node_A.name, node_B.name) 129 | return new_node 130 | 131 | def compute(self, node, input_vals, output_val, use_numpy=True): 132 | assert len(input_vals) == 2 133 | if use_numpy: 134 | # output_val[:] allows modify in-place 135 | output_val[:] = input_vals[0] + input_vals[1] 136 | else: 137 | if input_vals[0].shape == input_vals[1].shape: 138 | gpu_op.matrix_elementwise_add( 139 | input_vals[0], input_vals[1], output_val) 140 | else: 141 | if input_vals[1].shape == (1,): 142 | const_val = input_vals[1].asnumpy()[0] 143 | gpu_op.matrix_elementwise_add_by_const( 144 | input_vals[0], const_val, output_val) 145 | elif input_vals[0].shape == (1,): 146 | const_val = input_vals[0].asnumpy()[0] 147 | gpu_op.matrix_elementwise_add_by_const( 148 | input_vals[1], const_val, output_val) 149 | 150 | def gradient(self, node, output_grad): 151 | return [output_grad, output_grad] 152 | 153 | def infer_shape(self, node, input_shapes): 154 | """Need to handle input_vals[0].shape != input_vals[1].shape""" 155 | if input_shapes[0] == input_shapes[1]: 156 | return input_shapes[0] 157 | elif input_shapes[0] == (1,): 158 | return input_shapes[1] 159 | elif input_shapes[1] == (1,): 160 | return input_shapes[0] 161 | 162 | 163 | class AddByConstOp(Op): 164 | def __call__(self, node_A, const_val): 165 | new_node = Op.__call__(self) 166 | new_node.const_attr = const_val 167 | new_node.inputs = [node_A] 168 | new_node.name = "(%s+%s)" % (node_A.name, str(const_val)) 169 | return new_node 170 | 171 | def compute(self, node, input_vals, output_val, use_numpy=True): 172 | assert len(input_vals) == 1 173 | if use_numpy: 174 | output_val[:] = input_vals[0] + node.const_attr 175 | else: 176 | gpu_op.matrix_elementwise_add_by_const( 177 | input_vals[0], node.const_attr, output_val) 178 | 179 | def gradient(self, node, output_grad): 180 | return [output_grad] 
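    # Illustrative sketch (added; not from the original source): an expression
    # such as
    #     x = Variable(name="x")
    #     y = x + 5
    # goes through Node.__add__ above and yields an AddByConstOp node with
    # const_attr == 5; its gradient simply forwards output_grad to x, since
    # d(x + c)/dx == 1.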
181 | 182 | def infer_shape(self, node, input_shapes): 183 | return input_shapes[0] 184 | 185 | 186 | class MulOp(Op): 187 | def __call__(self, node_A, node_B): 188 | new_node = Op.__call__(self) 189 | new_node.inputs = [node_A, node_B] 190 | new_node.name = "(%s*%s)" % (node_A.name, node_B.name) 191 | return new_node 192 | 193 | def compute(self, node, input_vals, output_val, use_numpy=True): 194 | assert len(input_vals) == 2 195 | if use_numpy: 196 | output_val[:] = input_vals[0] * input_vals[1] 197 | else: 198 | if input_vals[0].shape == input_vals[1].shape: 199 | gpu_op.matrix_elementwise_multiply( 200 | input_vals[0], input_vals[1], output_val) 201 | else: 202 | if input_vals[1].shape == (1,): 203 | const_val = input_vals[1].asnumpy()[0] 204 | gpu_op.matrix_elementwise_multiply_by_const( 205 | input_vals[0], const_val, output_val) 206 | elif input_vals[0].shape == (1,): 207 | const_val = input_vals[0].asnumpy()[0] 208 | gpu_op.matrix_elementwise_multiply_by_const( 209 | input_vals[1], const_val, output_val) 210 | 211 | def gradient(self, node, output_grad): 212 | return [node.inputs[1] * output_grad, node.inputs[0] * output_grad] 213 | 214 | def infer_shape(self, node, input_shapes): 215 | """Need to handle input_vals[0].shape != input_vals[1].shape""" 216 | if input_shapes[0] == input_shapes[1]: 217 | return input_shapes[0] 218 | elif input_shapes[0] == (1,): 219 | return input_shapes[1] 220 | elif input_shapes[1] == (1,): 221 | return input_shapes[0] 222 | 223 | 224 | class MulByConstOp(Op): 225 | def __call__(self, node_A, const_val): 226 | new_node = Op.__call__(self) 227 | new_node.const_attr = const_val 228 | new_node.inputs = [node_A] 229 | new_node.name = "(%s*%s)" % (node_A.name, str(const_val)) 230 | return new_node 231 | 232 | def compute(self, node, input_vals, output_val, use_numpy=True): 233 | assert len(input_vals) == 1 234 | if use_numpy: 235 | output_val[:] = input_vals[0] * node.const_attr 236 | else: 237 | gpu_op.matrix_elementwise_multiply_by_const( 238 | input_vals[0], node.const_attr, output_val) 239 | 240 | def gradient(self, node, output_grad): 241 | return [node.const_attr * output_grad] 242 | 243 | def infer_shape(self, node, input_shapes): 244 | return input_shapes[0] 245 | 246 | 247 | class MatMulOp(Op): 248 | def __call__(self, node_A, node_B, trans_A=False, trans_B=False): 249 | new_node = Op.__call__(self) 250 | new_node.matmul_attr_trans_A = trans_A 251 | new_node.matmul_attr_trans_B = trans_B 252 | new_node.inputs = [node_A, node_B] 253 | new_node.name = "MatMul(%s,%s,%s,%s)" % ( 254 | node_A.name, node_B.name, str(trans_A), str(trans_B)) 255 | return new_node 256 | 257 | def compute(self, node, input_vals, output_val, use_numpy=True): 258 | if use_numpy: 259 | if ((node.matmul_attr_trans_A is False) and 260 | (node.matmul_attr_trans_B is False)): 261 | output_val[:] = np.matmul(input_vals[0], input_vals[1]) 262 | elif ((node.matmul_attr_trans_A is True) and 263 | (node.matmul_attr_trans_B is False)): 264 | output_val[:] = np.matmul( 265 | np.transpose(input_vals[0]), input_vals[1]) 266 | elif ((node.matmul_attr_trans_A is False) and 267 | (node.matmul_attr_trans_B is True)): 268 | output_val[:] = np.matmul( 269 | input_vals[0], np.transpose(input_vals[1])) 270 | elif ((node.matmul_attr_trans_A is True) and 271 | (node.matmul_attr_trans_B is True)): 272 | output_val[:] = np.matmul( 273 | np.transpose(input_vals[0]), np.transpose(input_vals[1])) 274 | else: 275 | gpu_op.matrix_multiply( 276 | input_vals[0], node.matmul_attr_trans_A, 277 | input_vals[1], 
node.matmul_attr_trans_B, 278 | output_val) 279 | 280 | def gradient(self, node, output_grad): 281 | if ((node.matmul_attr_trans_A is False) and 282 | (node.matmul_attr_trans_B is False)): 283 | # if Y=AB, then dA=dY B^T, dB=A^T dY 284 | lhs_grad = matmul_op( 285 | output_grad, node.inputs[1], trans_A=False, trans_B=True) 286 | rhs_grad = matmul_op( 287 | node.inputs[0], output_grad, trans_A=True, trans_B=False) 288 | elif ((node.matmul_attr_trans_A is True) and 289 | (node.matmul_attr_trans_B is False)): 290 | # if Y=A^T B, then dA=(dY B^T)^T=B dY^T, dB=A^T dY 291 | lhs_grad = matmul_op( 292 | node.inputs[1], output_grad, trans_A=False, trans_B=True) 293 | rhs_grad = matmul_op( 294 | node.inputs[0], output_grad, trans_A=True, trans_B=False) 295 | elif ((node.matmul_attr_trans_A is False) and 296 | (node.matmul_attr_trans_B is True)): 297 | # if Y=A B^T, then dA=dY B^T, dB=(A^T dY)^T=dY^T A 298 | lhs_grad = matmul_op( 299 | output_grad, node.inputs[1], trans_A=False, trans_B=True) 300 | rhs_grad = matmul_op( 301 | output_grad, node.inputs[0], trans_A=True, trans_B=False) 302 | elif ((node.matmul_attr_trans_A is True) and 303 | (node.matmul_attr_trans_B is True)): 304 | # if Y=A^T B^T, then dA=(dY B^T)^T=B dY^T, dB=(A^T dY)^T=dY^T A 305 | lhs_grad = matmul_op( 306 | node.inputs[1], output_grad, trans_A=False, trans_B=True) 307 | rhs_grad = matmul_op( 308 | output_grad, node.inputs[0], trans_A=True, trans_B=False) 309 | return [lhs_grad, rhs_grad] 310 | 311 | def infer_shape(self, node, input_shapes): 312 | if node.matmul_attr_trans_A is False and node.matmul_attr_trans_B is False: 313 | return input_shapes[0][0], input_shapes[1][1] 314 | elif node.matmul_attr_trans_A is False and node.matmul_attr_trans_B is True: 315 | return input_shapes[0][0], input_shapes[1][0] 316 | elif node.matmul_attr_trans_A is True and node.matmul_attr_trans_B is False: 317 | return input_shapes[0][1], input_shapes[1][1] 318 | else: 319 | return input_shapes[0][1], input_shapes[1][0] 320 | 321 | 322 | class PlaceholderOp(Op): 323 | def __call__(self): 324 | """Creates a variable node.""" 325 | new_node = Op.__call__(self) 326 | return new_node 327 | 328 | def compute(self, node, input_vals, output_val, use_numpy=True): 329 | assert False, "placeholder %s values provided by feed_dict" % node.name 330 | 331 | def gradient(self, node, output_grad): 332 | return None 333 | 334 | def infer_shape(self, node, input_shapes): 335 | assert False, "placeholder %s shape provided by feed_shape" % node.name 336 | 337 | 338 | class ZerosLikeOp(Op): 339 | def __call__(self, node_A): 340 | """Creates a node that represents np.zeros(node_A.shape).""" 341 | new_node = Op.__call__(self) 342 | new_node.inputs = [node_A] 343 | new_node.name = "Zeroslike(%s)" % node_A.name 344 | return new_node 345 | 346 | def compute(self, node, input_vals, output_val, use_numpy=True): 347 | assert len(input_vals) == 1 348 | if use_numpy: 349 | output_val[:] = np.zeros(input_vals[0].shape) 350 | else: 351 | gpu_op.array_set(output_val, 0) 352 | 353 | def gradient(self, node, output_grad): 354 | return [zeroslike_op(node.inputs[0])] 355 | 356 | def infer_shape(self, node, input_shapes): 357 | """If input_shape is a vector, simpler to return (1,)""" 358 | return input_shapes[0] 359 | 360 | 361 | class OnesLikeOp(Op): 362 | def __call__(self, node_A): 363 | """Creates a node that represents np.ones(node_A.shape).""" 364 | new_node = Op.__call__(self) 365 | new_node.inputs = [node_A] 366 | new_node.name = "Oneslike(%s)" % node_A.name 367 | return new_node 
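    # Illustrative note (added; not from the original source): gradients()
    # later in this file seeds reverse-mode autodiff with
    # oneslike_op(output_node), i.e. the adjoint of the output with respect to
    # itself is an all-ones array of the same shape as the output.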
368 | 369 | def compute(self, node, input_vals, output_val, use_numpy=True): 370 | assert len(input_vals) == 1 371 | if use_numpy: 372 | output_val[:] = np.ones(input_vals[0].shape) 373 | else: 374 | gpu_op.array_set(output_val, 1) 375 | 376 | def gradient(self, node, output_grad): 377 | return [zeroslike_op(node.inputs[0])] 378 | 379 | def infer_shape(self, node, input_shapes): 380 | """If input_shape is a vector, simpler to return (1,)""" 381 | return input_shapes[0] 382 | 383 | 384 | class ReduceSumAxisZeroOp(Op): 385 | def __call__(self, node_A): 386 | """Creates a node that represents np.sum(node_A, axis=0). 387 | Only support common-case axis=0 reduction for simplicity of gradient. 388 | """ 389 | new_node = Op.__call__(self) 390 | new_node.inputs = [node_A] 391 | new_node.name = "ReduceSumAxisZero(%s)" % (node_A.name) 392 | return new_node 393 | 394 | def compute(self, node, input_vals, output_val, use_numpy=True): 395 | assert len(input_vals) == 1 396 | if use_numpy: 397 | assert(isinstance(input_vals[0], np.ndarray)) 398 | output_val[:] = np.sum(input_vals[0], axis=0) 399 | else: 400 | gpu_op.reduce_sum_axis_zero(input_vals[0], output_val) 401 | 402 | def gradient(self, node, output_grad): 403 | return [broadcastto_op(output_grad, node.inputs[0])] 404 | 405 | def infer_shape(self, node, input_shapes): 406 | """summation reduction axis = 0 407 | e.g. (3,4,5)->(4,5) 408 | for vector, simpler to do (3,)->(1,) 409 | """ 410 | assert len(input_shapes) == 1 411 | if len(input_shapes[0]) == 1: 412 | return (1,) 413 | return input_shapes[0][1:] 414 | 415 | 416 | class BroadcastToOp(Op): 417 | def __call__(self, node_A, node_B): 418 | """Creates a node that represents np.broadcast_to(node_A, node_B.shape). 419 | Only support axis=0. e.g. (3,4)->(2,3,4) to make gradient simple. 
420 | """ 421 | new_node = Op.__call__(self) 422 | new_node.inputs = [node_A, node_B] 423 | new_node.name = "BroadcastTo(%s,%s.shape)" % (node_A.name, node_B.name) 424 | return new_node 425 | 426 | def compute(self, node, input_vals, output_val, use_numpy=True): 427 | assert(len(input_vals) == 2) 428 | if use_numpy: 429 | output_val[:] = np.broadcast_to(input_vals[0], input_vals[1].shape) 430 | else: 431 | gpu_op.broadcast_to(input_vals[0], output_val) 432 | 433 | def gradient(self, node, output_grad): 434 | grad_A = reducesumaxiszero_op(output_grad) 435 | grad_B = zeroslike_op(node.inputs[1]) 436 | return [grad_A, grad_B] 437 | 438 | def infer_shape(self, node, input_shapes): 439 | return input_shapes[1] 440 | 441 | 442 | def softmax_func(y): 443 | """Numerically stable softmax.""" 444 | b = y - np.max(y, axis=1, keepdims=True) 445 | expb = np.exp(b) 446 | softmax = expb / np.sum(expb, axis=1, keepdims=True) 447 | return softmax 448 | 449 | 450 | class SoftmaxCrossEntropyOp(Op): 451 | def __call__(self, node_A, node_B): 452 | new_node = Op.__call__(self) 453 | new_node.inputs = [node_A, node_B] 454 | new_node.name = "SoftmaxXEntropy(%s,%s)" % (node_A.name, node_B.name) 455 | return new_node 456 | 457 | def compute(self, node, input_vals, output_val, use_numpy=True): 458 | assert len(input_vals) == 2 459 | y = input_vals[0] 460 | y_ = input_vals[1] 461 | if use_numpy: 462 | softmax = softmax_func(y) 463 | cross_entropy = np.mean( 464 | -np.sum(y_ * np.log(softmax), axis=1), keepdims=True) 465 | output_val[:] = cross_entropy 466 | else: 467 | gpu_op.softmax_cross_entropy(y, y_, output_val) 468 | 469 | def gradient(self, node, output_grad): 470 | grad_A = (softmax_op(node.inputs[0]) + -1 * node.inputs[1])*output_grad 471 | grad_B = zeroslike_op(node.inputs[1]) 472 | return [grad_A, grad_B] 473 | 474 | def infer_shape(self, node, input_shapes): 475 | return (1,) 476 | 477 | 478 | class SoftmaxOp(Op): 479 | def __call__(self, node_A): 480 | new_node = Op.__call__(self) 481 | new_node.inputs = [node_A] 482 | new_node.name = "Softmax(%s)" % (node_A.name) 483 | return new_node 484 | 485 | def compute(self, node, input_vals, output_val, use_numpy=True): 486 | assert len(input_vals) == 1 487 | if use_numpy: 488 | output_val[:] = softmax_func(input_vals[0]) 489 | else: 490 | gpu_op.softmax(input_vals[0], output_val) 491 | 492 | def gradient(self, node, output_grad): 493 | # Do not directly use SoftmaxOp, use SoftmaxCrossEntropyOp instead. 494 | # Not allowing taking 2nd derivative of SoftmaxCrossEntropyOp. 
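        # Note: SoftmaxCrossEntropyOp.gradient above already uses the closed
        # form softmax(y) - y_, so a standalone softmax gradient is not needed
        # for training.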
495 | raise NotImplementedError 496 | 497 | def infer_shape(self, node, input_shapes): 498 | return input_shapes[0] 499 | 500 | 501 | class ReluOp(Op): 502 | def __call__(self, node_A): 503 | new_node = Op.__call__(self) 504 | new_node.inputs = [node_A] 505 | new_node.name = "Relu(%s)" % (node_A.name) 506 | return new_node 507 | 508 | def compute(self, node, input_vals, output_val, use_numpy=True): 509 | assert len(input_vals) == 1 510 | if use_numpy: 511 | output_val[:] = np.maximum(input_vals[0], 0) 512 | else: 513 | gpu_op.relu(input_vals[0], output_val) 514 | 515 | def gradient(self, node, output_grad): 516 | return [relu_gradient_op(node.inputs[0], output_grad)] 517 | 518 | def infer_shape(self, node, input_shapes): 519 | return input_shapes[0] 520 | 521 | 522 | class ReluGradientOp(Op): 523 | def __call__(self, node_A, node_B): 524 | """node_B is output_grad""" 525 | new_node = Op.__call__(self) 526 | new_node.inputs = [node_A, node_B] 527 | new_node.name = "ReluGradient(%s)" % (node_A.name) 528 | return new_node 529 | 530 | def compute(self, node, input_vals, output_val, use_numpy=True): 531 | assert len(input_vals) == 2 532 | if use_numpy: 533 | # heaviside function, 0.5 at x=0 534 | output_val[:] = (np.sign(input_vals[0]) + 1) * 0.5 * input_vals[1] 535 | else: 536 | gpu_op.relu_gradient(input_vals[0], input_vals[1], output_val) 537 | 538 | def gradient(self, node, output_grad): 539 | raise NotImplementedError 540 | 541 | def infer_shape(self, node, input_shapes): 542 | return input_shapes[0] 543 | 544 | 545 | # Create global singletons of operators. 546 | add_op = AddOp() 547 | mul_op = MulOp() 548 | add_byconst_op = AddByConstOp() 549 | mul_byconst_op = MulByConstOp() 550 | matmul_op = MatMulOp() 551 | placeholder_op = PlaceholderOp() 552 | oneslike_op = OnesLikeOp() 553 | zeroslike_op = ZerosLikeOp() 554 | reducesumaxiszero_op = ReduceSumAxisZeroOp() 555 | broadcastto_op = BroadcastToOp() 556 | softmaxcrossentropy_op = SoftmaxCrossEntropyOp() 557 | softmax_op = SoftmaxOp() 558 | relu_op = ReluOp() 559 | relu_gradient_op = ReluGradientOp() 560 | 561 | 562 | class Executor(object): 563 | """Executor computes values for given set of nodes in computation graph.""" 564 | def __init__(self, eval_node_list, ctx=None): 565 | """ 566 | Parameters 567 | ---------- 568 | eval_node_list: list of nodes whose values need to be computed. 569 | ctx: runtime DLContext, default is None which means np.ndarray on cpu 570 | topo_order: list of nodes in topological order 571 | node_to_shape_map: dict from node to shape of the node 572 | node_to_arr_map: dict from node to ndarray.NDArray allocated for node 573 | feed_shapes: shapes of feed_dict from last run(...) 574 | """ 575 | self.eval_node_list = eval_node_list 576 | self.ctx = ctx 577 | self.topo_order = find_topo_sort(self.eval_node_list) 578 | self.node_to_shape_map = None 579 | self.node_to_arr_map = None 580 | self.feed_shapes = None 581 | 582 | def infer_shape(self, feed_shapes): 583 | """Given shapes of feed_dict nodes, infer shape for all nodes in graph. 584 | 585 | Implementation note: 586 | Iteratively calls node.op.infer_shape to infer shapes. 587 | Node shapes stored in self.node_to_shape_map. 588 | 589 | Parameters 590 | ---------- 591 | feed_shapes: node->shapes mapping for feed_dict nodes. 
592 | """ 593 | self.node_to_shape_map = dict(feed_shapes) 594 | for node in self.topo_order: 595 | if node in self.node_to_shape_map: 596 | continue 597 | input_shapes = [self.node_to_shape_map[i] for i in node.inputs] 598 | self.node_to_shape_map[node] = node.op.infer_shape(node, input_shapes) 599 | 600 | def memory_plan(self, feed_shapes): 601 | """Allocates ndarray.NDArray for every node except feed_dict nodes. 602 | 603 | Implementation note: 604 | Option 1: Alloc a ndarray.NDArray per node that persists across run() 605 | Option 2: Implement a memory pool to reuse memory for nodes of same 606 | shapes. More details see Lecture 7. 607 | 608 | For both options, self.node_to_arr_map stores node->NDArray mapping to 609 | allow mapping to persist across multiple executor.run(). 610 | 611 | Hint: use ndarray.empty(shape, ctx=self.ctx) to allocate NDArray. 612 | 613 | Parameters 614 | ---------- 615 | feed_shapes: node->shapes mapping for feed_dict nodes. 616 | """ 617 | self.node_to_arr_map = {} 618 | for node in self.topo_order: 619 | self.node_to_arr_map[node] = ndarray.empty(self.node_to_shape_map[node], ctx=self.ctx) 620 | 621 | def run(self, feed_dict, convert_to_numpy_ret_vals=False): 622 | """ 623 | Parameters 624 | ---------- 625 | feed_dict: a dictionary of node->np.ndarray supplied by user. 626 | convert_to_numpy_ret_vals: whether to convert ret vals to np.array 627 | 628 | Returns 629 | ------- 630 | A list of values for nodes in eval_node_list. NDArray or np.ndarray. 631 | """ 632 | def are_feed_shapes_equal(sa, sb): 633 | if (not isinstance(sa, dict)) or (not isinstance(sb, dict)): 634 | return False 635 | unmatched_item = set(sa.items()) ^ set(sb.items()) 636 | return len(unmatched_item) == 0 637 | 638 | # Assume self.ctx is None implies numpy array and numpy ops. 639 | use_numpy = self.ctx is None 640 | node_to_val_map = {} 641 | for node, value in feed_dict.items(): 642 | if use_numpy: 643 | # all values passed in feed_dict must be np.ndarray 644 | assert isinstance(value, np.ndarray) 645 | node_to_val_map[node] = value 646 | else: 647 | # convert values to ndarray.NDArray if necessary 648 | if isinstance(value, np.ndarray): 649 | node_to_val_map[node] = ndarray.array(value, ctx=self.ctx) 650 | elif isinstance(value, ndarray.NDArray): 651 | node_to_val_map[node] = value 652 | else: 653 | assert False, "feed_dict value type not supported" 654 | 655 | # collect shapes for all placeholders 656 | feed_shapes = {} 657 | for node in node_to_val_map: 658 | feed_shapes[node] = node_to_val_map[node].shape 659 | 660 | # infer shape if feed_shapes changed since last run 661 | # e.g. call run() on test data after trainng 662 | if (not are_feed_shapes_equal(feed_shapes, self.feed_shapes)): 663 | self.infer_shape(feed_shapes) 664 | self.feed_shapes = feed_shapes 665 | # plan memory if using GPU 666 | if (not use_numpy): 667 | self.memory_plan(feed_shapes) 668 | 669 | # Traverse graph in topo order and compute values for all nodes. 670 | for node in self.topo_order: 671 | if node in node_to_val_map: 672 | # Skip placeholder nodes. Values already provided by feed_dict. 673 | continue 674 | input_vals = [node_to_val_map[n] for n in node.inputs] 675 | if use_numpy: 676 | node_val = np.empty(shape=self.node_to_shape_map[node]) 677 | else: 678 | node_val = self.node_to_arr_map[node] 679 | # node_val is modified in-place whether np.ndarray or NDArray 680 | node.op.compute(node, input_vals, node_val, use_numpy) 681 | node_to_val_map[node] = node_val 682 | 683 | # Collect node values. 
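        # Note: with a GPU context the results below stay on the device as
        # NDArrays unless convert_to_numpy_ret_vals is set, in which case they
        # are copied back to host memory via asnumpy().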
684 | if not use_numpy and convert_to_numpy_ret_vals: 685 | return [node_to_val_map[n].asnumpy() for n in self.eval_node_list] 686 | return [node_to_val_map[n] for n in self.eval_node_list] 687 | 688 | 689 | def gradients(output_node, node_list): 690 | """Take gradient of output node with respect to each node in node_list. 691 | 692 | Parameters 693 | ---------- 694 | output_node: output node that we are taking derivative of. 695 | node_list: list of nodes that we are taking derivative wrt. 696 | 697 | Returns 698 | ------- 699 | A list of gradient values, one for each node in node_list respectively. 700 | 701 | """ 702 | node_to_output_grads_list = {} 703 | node_to_output_grads_list[output_node] = [oneslike_op(output_node)] 704 | node_to_output_grad = {} 705 | # Traverse forward graph in reverse topological order 706 | reverse_topo_order = reversed(find_topo_sort([output_node])) 707 | for node in reverse_topo_order: 708 | output_grad = sum_node_list(node_to_output_grads_list[node]) 709 | node_to_output_grad[node] = output_grad 710 | input_grads_list = node.op.gradient(node, output_grad) 711 | for i in range(len(node.inputs)): 712 | if node.inputs[i] not in node_to_output_grads_list: 713 | node_to_output_grads_list[node.inputs[i]] = [] 714 | # Calculate partial adjoint for input nodes. 715 | node_to_output_grads_list[node.inputs[i]].append( 716 | input_grads_list[i]) 717 | 718 | grad_node_list = [node_to_output_grad[node] for node in node_list] 719 | return grad_node_list 720 | 721 | ################## 722 | # Helper Methods # 723 | ################## 724 | 725 | 726 | def find_topo_sort(node_list): 727 | """Given a list of nodes, return a topo ordering of nodes ending in them. 728 | 729 | A simple algorithm is to do a post-order DFS traversal on the given nodes, 730 | going backwards based on input edges. Since a node is added to the ordering 731 | after all its predecessors are traversed due to post-order DFS, we get a 732 | topological sort. 733 | 734 | """ 735 | visited = set() 736 | topo_order = [] 737 | for node in node_list: 738 | topo_sort_dfs(node, visited, topo_order) 739 | return topo_order 740 | 741 | 742 | def topo_sort_dfs(node, visited, topo_order): 743 | """Post-order DFS""" 744 | if node in visited: 745 | return 746 | visited.add(node) 747 | for n in node.inputs: 748 | topo_sort_dfs(n, visited, topo_order) 749 | topo_order.append(node) 750 | 751 | 752 | def sum_node_list(node_list): 753 | """Custom sum func to avoid creating redundant nodes in Python sum func.""" 754 | from operator import add 755 | from functools import reduce 756 | return reduce(add, node_list) 757 | 758 | 759 | def broadcast_rule(shape_a, shape_b): 760 | """Return output shape of broadcast shape_a, shape_b. 761 | e.g. 
broadcast_rule((3,2), (4,3,2)) 762 | returns output_shape = (4,3,2) 763 | 764 | Check out explanations and more examples at 765 | https://docs.scipy.org/doc/numpy-1.10.0/user/basics.broadcasting.html 766 | http://eli.thegreenplace.net/2015/broadcasting-arrays-in-numpy/ 767 | """ 768 | assert(isinstance(shape_a, tuple)) 769 | assert(isinstance(shape_b, tuple)) 770 | if len(shape_a) > len(shape_b): 771 | longer_shape, shorter_shape = shape_a, shape_b 772 | else: 773 | longer_shape, shorter_shape = shape_b, shape_a 774 | len_diff = len(longer_shape) - len(shorter_shape) 775 | for i in range(len_diff): 776 | # pad with leading 1s 777 | shorter_shape = (1,) + shorter_shape 778 | assert len(shorter_shape) == len(longer_shape) 779 | output_shape = list(longer_shape) 780 | for i in range(len(output_shape)): 781 | assert (shorter_shape[i] == longer_shape[i]) \ 782 | or (shorter_shape[i] == 1) \ 783 | or (longer_shape[i] == 1) 784 | output_shape[i] = max(shorter_shape[i], longer_shape[i]) 785 | return tuple(output_shape) 786 | -------------------------------------------------------------------------------- /python/tinyflow/gpu_op.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import ctypes 4 | from ._base import _LIB 5 | from . import ndarray as _nd 6 | 7 | 8 | def array_set(arr, value): 9 | assert isinstance(arr, _nd.NDArray) 10 | _LIB.DLGpuArraySet(arr.handle, ctypes.c_float(value)) 11 | 12 | 13 | def broadcast_to(in_arr, out_arr): 14 | assert isinstance(in_arr, _nd.NDArray) 15 | assert isinstance(out_arr, _nd.NDArray) 16 | _LIB.DLGpuBroadcastTo(in_arr.handle, out_arr.handle) 17 | 18 | 19 | def reduce_sum_axis_zero(in_arr, out_arr): 20 | assert isinstance(in_arr, _nd.NDArray) 21 | assert isinstance(out_arr, _nd.NDArray) 22 | _LIB.DLGpuReduceSumAxisZero(in_arr.handle, out_arr.handle) 23 | 24 | 25 | def matrix_elementwise_add(matA, matB, matC): 26 | assert isinstance(matA, _nd.NDArray) 27 | assert isinstance(matB, _nd.NDArray) 28 | assert isinstance(matC, _nd.NDArray) 29 | _LIB.DLGpuMatrixElementwiseAdd(matA.handle, matB.handle, matC.handle) 30 | 31 | 32 | def matrix_elementwise_add_by_const(in_mat, val, out_mat): 33 | assert isinstance(in_mat, _nd.NDArray) 34 | assert isinstance(out_mat, _nd.NDArray) 35 | _LIB.DLGpuMatrixElementwiseAddByConst( 36 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 37 | 38 | 39 | def matrix_elementwise_multiply(matA, matB, matC): 40 | assert isinstance(matA, _nd.NDArray) 41 | assert isinstance(matB, _nd.NDArray) 42 | assert isinstance(matC, _nd.NDArray) 43 | _LIB.DLGpuMatrixElementwiseMultiply( 44 | matA.handle, matB.handle, matC.handle) 45 | 46 | 47 | def matrix_elementwise_multiply_by_const(in_mat, val, out_mat): 48 | assert isinstance(in_mat, _nd.NDArray) 49 | assert isinstance(out_mat, _nd.NDArray) 50 | _LIB.DLGpuMatrixMultiplyByConst( 51 | in_mat.handle, ctypes.c_float(val), out_mat.handle) 52 | 53 | 54 | def matrix_multiply(matA, transA, matB, transB, matC): 55 | assert isinstance(matA, _nd.NDArray) 56 | assert isinstance(matB, _nd.NDArray) 57 | assert isinstance(matC, _nd.NDArray) 58 | _LIB.DLGpuMatrixMultiply( 59 | matA.handle, transA, matB.handle, transB, matC.handle) 60 | 61 | 62 | def relu(in_arr, out_arr): 63 | assert isinstance(in_arr, _nd.NDArray) 64 | assert isinstance(out_arr, _nd.NDArray) 65 | _LIB.DLGpuRelu(in_arr.handle, out_arr.handle) 66 | 67 | 68 | def relu_gradient(in_arr, in_grad_arr, out_arr): 69 | assert isinstance(in_arr, _nd.NDArray) 70 | assert 
isinstance(in_grad_arr, _nd.NDArray) 71 | assert isinstance(out_arr, _nd.NDArray) 72 | _LIB.DLGpuReluGradient(in_arr.handle, in_grad_arr.handle, out_arr.handle) 73 | 74 | 75 | def softmax(in_arr, out_arr): 76 | assert isinstance(in_arr, _nd.NDArray) 77 | assert isinstance(out_arr, _nd.NDArray) 78 | _LIB.DLGpuSoftmax(in_arr.handle, out_arr.handle) 79 | 80 | 81 | def softmax_cross_entropy(in_arr_a, in_arr_b, out_arr): 82 | assert isinstance(in_arr_a, _nd.NDArray) 83 | assert isinstance(in_arr_b, _nd.NDArray) 84 | assert isinstance(out_arr, _nd.NDArray) 85 | _LIB.DLGpuSoftmaxCrossEntropy( 86 | in_arr_a.handle, in_arr_b.handle, out_arr.handle) 87 | -------------------------------------------------------------------------------- /python/tinyflow/ndarray.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | from ._base import _LIB, check_call, c_array 4 | import ctypes 5 | import numpy as np 6 | 7 | 8 | class DLContext(ctypes.Structure): 9 | """DL context structure.""" 10 | _fields_ = [("device_id", ctypes.c_int), 11 | ("device_type", ctypes.c_int)] 12 | 13 | MASK2STR = { 14 | 1: 'cpu', 15 | 2: 'gpu', 16 | } 17 | 18 | def __init__(self, device_id, device_type): 19 | super(DLContext, self).__init__() 20 | self.device_id = device_id 21 | self.device_type = device_type 22 | 23 | def __repr__(self): 24 | return "%s(%d)" % ( 25 | DLContext.MASK2STR[self.device_type], self.device_id) 26 | 27 | 28 | class DLArray(ctypes.Structure): 29 | """DLArray in C API""" 30 | _fields_ = [("data", ctypes.c_void_p), 31 | ("ctx", DLContext), 32 | ("ndim", ctypes.c_int), 33 | ("shape", ctypes.POINTER(ctypes.c_int64))] 34 | 35 | 36 | DLArrayHandle = ctypes.POINTER(DLArray) 37 | 38 | 39 | def cpu(dev_id=0): 40 | """Construct a CPU device 41 | Parameters 42 | ---------- 43 | dev_id : int, optional 44 | The integer device id 45 | """ 46 | return DLContext(dev_id, 1) 47 | 48 | 49 | def gpu(dev_id=0): 50 | """Construct a GPU device 51 | Parameters 52 | ---------- 53 | dev_id : int, optional 54 | The integer device id 55 | """ 56 | return DLContext(dev_id, 2) 57 | 58 | 59 | def is_gpu_ctx(ctx): 60 | """Return whether the context is a GPU context. 61 | Parameters 62 | ---------- 63 | ctx : DLContext 64 | The query context 65 | """ 66 | return ctx and ctx.device_type == 2 67 | 68 | 69 | class NDArray(object): 70 | """Lightweight NDArray class of DL runtime. 71 | Strictly speaking, this is only an array container (a buffer object). 72 | No arithmetic operations are defined.
73 | """ 74 | __slots__ = ["handle"] 75 | 76 | # pylint: disable=no-member 77 | def __init__(self, handle): 78 | """Initialize the function with handle 79 | Parameters 80 | ---------- 81 | handle : DLArrayHandle 82 | the handle to the underlying C++ DLArray 83 | """ 84 | self.handle = handle 85 | 86 | def __del__(self): 87 | check_call(_LIB.DLArrayFree(self.handle)) 88 | 89 | @property 90 | def shape(self): 91 | """Shape of this array""" 92 | return tuple(self.handle.contents.shape[i] 93 | for i in range(self.handle.contents.ndim)) 94 | 95 | @property 96 | def ctx(self): 97 | """context of this array""" 98 | return self.handle.contents.ctx 99 | 100 | def __setitem__(self, in_slice, value): 101 | """Set ndarray value""" 102 | if (not isinstance(in_slice, slice) or 103 | in_slice.start is not None 104 | or in_slice.stop is not None): 105 | raise ValueError('Array only support set from numpy array') 106 | if isinstance(value, NDArray): 107 | if value.handle is not self.handle: 108 | value.copyto(self) 109 | elif isinstance(value, (np.ndarray, np.generic)): 110 | self._sync_copyfrom(value) 111 | else: 112 | raise TypeError('type %s not supported' % str(type(value))) 113 | 114 | def _sync_copyfrom(self, source_array): 115 | """Peform an synchronize copy from the array. 116 | Parameters 117 | ---------- 118 | source_array : array_like 119 | The data source we should like to copy from. 120 | """ 121 | if not isinstance(source_array, np.ndarray): 122 | try: 123 | source_array = np.array(source_array, dtype=np.float32) 124 | except: 125 | raise TypeError('array must be an array_like data,' + 126 | 'type %s is not supported' 127 | % str(type(source_array))) 128 | source_array = np.ascontiguousarray(source_array, dtype=np.float32) 129 | if source_array.shape != self.shape: 130 | raise ValueError('array shape do not match the shape of NDArray') 131 | source_arr, shape = NDArray._numpyasarray(source_array) 132 | check_call(_LIB.DLArrayCopyFromTo( 133 | ctypes.byref(source_arr), self.handle, None)) 134 | # de-allocate shape until now 135 | _ = shape 136 | 137 | @staticmethod 138 | def _numpyasarray(np_data): 139 | """Return a DLArray representation of a numpy array.""" 140 | data = np_data 141 | assert data.flags['C_CONTIGUOUS'] 142 | arr = DLArray() 143 | shape = c_array(ctypes.c_int64, data.shape) 144 | arr.data = data.ctypes.data_as(ctypes.c_void_p) 145 | arr.shape = shape 146 | arr.ndim = data.ndim 147 | # CPU device 148 | arr.ctx = cpu(0) 149 | return arr, shape 150 | 151 | def asnumpy(self): 152 | """Convert this array to numpy array 153 | Returns 154 | ------- 155 | np_arr : numpy.ndarray 156 | The corresponding numpy array. 157 | """ 158 | np_arr = np.empty(self.shape, dtype=np.float32) 159 | arr, shape = NDArray._numpyasarray(np_arr) 160 | check_call(_LIB.DLArrayCopyFromTo( 161 | self.handle, ctypes.byref(arr), None)) 162 | _ = shape 163 | return np_arr 164 | 165 | def copyto(self, target): 166 | """Copy array to target 167 | Parameters 168 | ---------- 169 | target : NDArray 170 | The target array to be copied, must have same shape as this array. 171 | """ 172 | if isinstance(target, DLContext): 173 | target = empty(self.shape, target) 174 | if isinstance(target, NDArray): 175 | check_call(_LIB.DLArrayCopyFromTo( 176 | self.handle, target.handle, None)) 177 | else: 178 | raise ValueError("Unsupported target type %s" % str(type(target))) 179 | return target 180 | 181 | 182 | def array(arr, ctx=cpu(0)): 183 | """Create an array from source arr. 
184 | Parameters 185 | ---------- 186 | arr : numpy.ndarray 187 | The array to be copied from 188 | ctx : DLContext, optional 189 | The device context to create the array 190 | Returns 191 | ------- 192 | ret : NDArray 193 | The created array 194 | """ 195 | if not isinstance(arr, np.ndarray): 196 | arr = np.array(arr) 197 | ret = empty(arr.shape, ctx) 198 | ret._sync_copyfrom(arr) 199 | return ret 200 | 201 | 202 | def empty(shape, ctx=cpu(0)): 203 | """Create an empty array given shape and device 204 | Parameters 205 | ---------- 206 | shape : tuple of int 207 | The shape of the array 208 | ctx : DLContext 209 | The context of the array 210 | Returns 211 | ------- 212 | arr : ndarray 213 | The array dlsys supported. 214 | """ 215 | shape = c_array(ctypes.c_int64, shape) 216 | ndim = ctypes.c_int(len(shape)) 217 | handle = DLArrayHandle() 218 | check_call(_LIB.DLArrayAlloc( 219 | shape, ndim, ctx, ctypes.byref(handle))) 220 | return NDArray(handle) 221 | -------------------------------------------------------------------------------- /src/c_runtime_api.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file c_runtime_api.cc 3 | * \brief Device specific implementations 4 | */ 5 | #include "./c_runtime_api.h" 6 | #include "./cpu_device_api.h" 7 | #include "./cuda_device_api.h" 8 | #include "./runtime_base.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace tinyflow { 19 | namespace runtime { 20 | 21 | class DeviceAPIManager { 22 | public: 23 | static const int kMaxDeviceAPI = 8; 24 | // Get API 25 | static DeviceAPI *Get(DLContext ctx) { 26 | return Global()->GetAPI(ctx.device_type); 27 | } 28 | 29 | private: 30 | std::array api_; 31 | DeviceAPIManager() { 32 | std::fill(api_.begin(), api_.end(), nullptr); 33 | static CPUDeviceAPI cpu_device_api_inst; 34 | static CUDADeviceAPI gpu_device_api_inst; 35 | api_[kCPU] = static_cast(&cpu_device_api_inst); 36 | api_[kGPU] = static_cast(&gpu_device_api_inst); 37 | } 38 | // Get global static variable. 39 | static DeviceAPIManager *Global() { 40 | static DeviceAPIManager inst; 41 | return &inst; 42 | } 43 | // Get API. 
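  // Note: returns the DeviceAPI instance registered for the requested device
  // type in the constructor above; aborts if no backend was registered.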
44 | DeviceAPI *GetAPI(DLDeviceType type) { 45 | if (api_[type] == nullptr) { 46 | std::cerr << "Device API not supported" << std::endl; 47 | exit(EXIT_FAILURE); 48 | } 49 | return api_[type]; 50 | } 51 | }; 52 | 53 | inline DLArray *DLArrayCreate_() { 54 | DLArray *arr = new DLArray(); 55 | arr->shape = nullptr; 56 | arr->ndim = 0; 57 | arr->data = nullptr; 58 | return arr; 59 | } 60 | 61 | inline void DLArrayFree_(DLArray *arr) { 62 | if (arr != nullptr) { 63 | // ok to delete nullptr 64 | delete[] arr->shape; 65 | if (arr->data != nullptr) { 66 | DeviceAPIManager::Get(arr->ctx)->FreeDataSpace(arr->ctx, arr->data); 67 | } 68 | } 69 | delete arr; 70 | } 71 | 72 | inline size_t GetDataSize(DLArray *arr) { 73 | size_t size = 1; 74 | for (index_t i = 0; i < arr->ndim; ++i) { 75 | size *= arr->shape[i]; 76 | } 77 | // assume 32-bit float 78 | size *= 4; 79 | return size; 80 | } 81 | 82 | inline size_t GetDataAlignment(DLArray *arr) { 83 | // assume 32-bit float 84 | return 8; 85 | } 86 | 87 | } // namespace runtime 88 | } // namespace tinyflow 89 | 90 | using namespace tinyflow::runtime; 91 | 92 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx, 93 | DLArrayHandle *out) { 94 | DLArray *arr = nullptr; 95 | API_BEGIN(); 96 | // shape 97 | arr = DLArrayCreate_(); 98 | // ndim 99 | arr->ndim = ndim; 100 | index_t *shape_copy = new index_t[ndim]; 101 | std::copy(shape, shape + ndim, shape_copy); 102 | arr->shape = shape_copy; 103 | // ctx 104 | arr->ctx = ctx; 105 | size_t size = GetDataSize(arr); 106 | size_t alignment = GetDataAlignment(arr); 107 | arr->data = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, size, alignment); 108 | *out = arr; 109 | API_END_HANDLE_ERROR(DLArrayFree_(arr)); 110 | } 111 | 112 | int DLArrayFree(DLArrayHandle handle) { 113 | API_BEGIN(); 114 | DLArray *arr = handle; 115 | DLArrayFree_(arr); 116 | API_END(); 117 | } 118 | 119 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to, 120 | DLStreamHandle stream) { 121 | API_BEGIN(); 122 | size_t from_size = GetDataSize(from); 123 | size_t to_size = GetDataSize(to); 124 | // The size must exactly match 125 | assert(from_size == to_size); 126 | DLContext ctx = from->ctx; 127 | if (ctx.device_type == kCPU) { 128 | ctx = to->ctx; 129 | } else { 130 | // Can not copy across different ctx types directly 131 | assert((to->ctx.device_type == kCPU) || 132 | (to->ctx.device_type == from->ctx.device_type)); 133 | } 134 | DeviceAPIManager::Get(ctx)->CopyDataFromTo(from->data, to->data, from_size, 135 | from->ctx, to->ctx, stream); 136 | API_END(); 137 | } 138 | -------------------------------------------------------------------------------- /src/c_runtime_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file c_runtime_api.h 3 | * \brief DL runtime library. 4 | * 5 | */ 6 | 7 | #ifndef TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 8 | #define TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 9 | 10 | #ifdef __cplusplus 11 | #define TINYFLOW_EXTERN_C extern "C" 12 | #else 13 | #define TINYFLOW_EXTERN_C 14 | #endif 15 | 16 | #include "dlarray.h" 17 | #include 18 | #include 19 | 20 | TINYFLOW_EXTERN_C { 21 | /*! \brief type of array index. */ 22 | typedef int64_t index_t; 23 | 24 | /*! \brief the array handle */ 25 | typedef DLArray *DLArrayHandle; 26 | /*! 27 | * \brief The stream that is specific to device 28 | * can be NULL, which indicates the default one. 29 | */ 30 | typedef void *DLStreamHandle; 31 | 32 | // Array related apis for quick proptying 33 | /*! 
34 | * \brief Allocate a nd-array's memory, 35 | * including space of shape, of given spec. 36 | * 37 | * \param shape The shape of the array, the data content will be copied to out 38 | * \param ndim The number of dimension of the array. 39 | * \param ctx The ctx this array sits on. 40 | * \param out The output handle. 41 | * \return 0 when success, -1 when failure happens 42 | */ 43 | int DLArrayAlloc(const index_t *shape, index_t ndim, DLContext ctx, 44 | DLArrayHandle *out); 45 | 46 | /*! 47 | * \brief Free the DL Array. 48 | * \param handle The array handle to be freed. 49 | * \return 0 when success, -1 when failure happens 50 | */ 51 | int DLArrayFree(DLArrayHandle handle); 52 | 53 | /*! 54 | * \brief Copy the array, both from and to must be valid during the copy. 55 | * \param from The array to be copied from. 56 | * \param to The target space. 57 | * \param stream The stream where the copy happens, can be NULL. 58 | * \return 0 when success, -1 when failure happens 59 | */ 60 | int DLArrayCopyFromTo(DLArrayHandle from, DLArrayHandle to, 61 | DLStreamHandle stream); 62 | 63 | /*! 64 | * \brief Set all array elements to given value. 65 | * \param arr The array to be Set. 66 | * \param value The target value. 67 | * \return 0 when success, -1 when failure happens 68 | */ 69 | int DLGpuArraySet(DLArrayHandle arr, float value); 70 | 71 | /*! 72 | * \brief Broadcast input array to output array. 73 | * \param input The input array. 74 | * \param output The output array. 75 | * \return 0 when success, -1 when failure happens 76 | */ 77 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output); 78 | 79 | /*! 80 | * \brief Reduce sum input array by axis=0 and store to output. 81 | * \param input The input array. 82 | * \param output The output array. 83 | * \return 0 when success, -1 when failure happens 84 | */ 85 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output); 86 | 87 | /*! 88 | * \brief Elementwise add two matrices and store to output. 89 | * \param matA The left input array. 90 | * \param matB The right input array. 91 | * \param output The output array. 92 | * \return 0 when success, -1 when failure happens 93 | */ 94 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA, 95 | const DLArrayHandle matB, DLArrayHandle output); 96 | 97 | /*! 98 | * \brief Add matrix by const and store to output. 99 | * \param input The input array. 100 | * \param val The constant. 101 | * \param output The output array. 102 | * \return 0 when success, -1 when failure happens 103 | */ 104 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val, 105 | DLArrayHandle output); 106 | 107 | /*! 108 | * \brief Elementwise multiply two matrices and store to output. 109 | * \param matA The left input array. 110 | * \param matB The right input array. 111 | * \param output The output array. 112 | * \return 0 when success, -1 when failure happens 113 | */ 114 | int DLGpuMatrixElementwiseMultiply( 115 | const DLArrayHandle matA, const DLArrayHandle matB, DLArrayHandle output); 116 | 117 | /*! 118 | * \brief Multiply matrix by const and store to output. 119 | * \param input The input array. 120 | * \param val The constant. 121 | * \param output The output array. 122 | * \return 0 when success, -1 when failure happens 123 | */ 124 | int DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val, 125 | DLArrayHandle output); 126 | 127 | /*! 128 | * \brief Matrix multiply two matrices and store to output. 129 | * \param matA The left input array. 
130 | * \param transposeA Whether matA needs to be transposed 131 | * \param matB The right input array. 132 | * \param transposeB Whether matB needs to be transposed 133 | * \param output The output array. 134 | * \return 0 when success, -1 when failure happens 135 | */ 136 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA, 137 | const DLArrayHandle matB, bool transposeB, 138 | DLArrayHandle matC); 139 | 140 | /*! 141 | * \brief Compute relu on all array elements, and store to output. 142 | * \param input The input array. 143 | * \param output The output value. 144 | * \return 0 when success, -1 when failure happens 145 | */ 146 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output); 147 | 148 | /*! 149 | * \brief Compute relu gradient, and store to output. 150 | * \param input The input array. 151 | * \param in_grad The input gradients value. 152 | * \param output The output array. 153 | * \return 0 when success, -1 when failure happens 154 | */ 155 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad, 156 | DLArrayHandle output); 157 | 158 | /*! 159 | * \brief Compute softmax on matrix, and store to output. 160 | * \param input The input array. 161 | * \param output The output value. 162 | * \return 0 when success, -1 when failure happens 163 | */ 164 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output); 165 | 166 | /*! 167 | * \brief Compute softmax_cross_entropy. 168 | * np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 169 | * \param input_a The y array. 170 | * \param input_b The y_ array. 171 | * \param output The output value. 172 | * \return 0 when success, -1 when failure happens 173 | */ 174 | int DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 175 | const DLArrayHandle input_b, 176 | DLArrayHandle output); 177 | } // TINYFLOW_EXTERN_C 178 | 179 | #endif // TINYFLOW_RUNTIME_C_RUNTIME_API_H_ 180 | -------------------------------------------------------------------------------- /src/cpu_device_api.cc: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file cpu_device_api.cc 3 | */ 4 | #include "./cpu_device_api.h" 5 | #include 6 | #include 7 | #include 8 | 9 | namespace tinyflow { 10 | namespace runtime { 11 | 12 | void *CPUDeviceAPI::AllocDataSpace(DLContext ctx, size_t size, 13 | size_t alignment) { 14 | // std::cout << "allocating cpu data" << std::endl; 15 | void *ptr; 16 | int ret = posix_memalign(&ptr, alignment, size); 17 | if (ret != 0) 18 | throw std::bad_alloc(); 19 | return ptr; 20 | } 21 | 22 | void CPUDeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) { free(ptr); } 23 | 24 | void CPUDeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size, 25 | DLContext ctx_from, DLContext ctx_to, 26 | DLStreamHandle stream) { 27 | // std::cout << "copying cpu data" << std::endl; 28 | memcpy(to, from, size); 29 | } 30 | 31 | void CPUDeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {} 32 | 33 | } // namespace runtime 34 | } // namespace tinyflow 35 | -------------------------------------------------------------------------------- /src/cpu_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 
2 |  * \file cpu_device_api.h
3 |  * \brief Device specific API
4 |  */
5 | #ifndef TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
6 | #define TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
7 | 
8 | #include "c_runtime_api.h"
9 | #include "device_api.h"
10 | #include
11 | #include
12 | 
13 | namespace tinyflow {
14 | namespace runtime {
15 | 
16 | class CPUDeviceAPI : public DeviceAPI {
17 | public:
18 |   void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final;
19 | 
20 |   void FreeDataSpace(DLContext ctx, void *ptr) final;
21 | 
22 |   void CopyDataFromTo(const void *from, void *to, size_t size,
23 |                       DLContext ctx_from, DLContext ctx_to,
24 |                       DLStreamHandle stream) final;
25 | 
26 |   void StreamSync(DLContext ctx, DLStreamHandle stream) final;
27 | };
28 | 
29 | } // namespace runtime
30 | } // namespace tinyflow
31 | #endif // TINYFLOW_RUNTIME_CPU_DEVICE_API_H_
32 | 
--------------------------------------------------------------------------------
/src/cuda_device_api.cc:
--------------------------------------------------------------------------------
1 | /*!
2 |  * \file cuda_device_api.cc
3 |  * \brief GPU specific API
4 |  */
5 | 
6 | #include "./cuda_device_api.h"
7 | #include <cassert>
8 | #include <cuda_runtime.h>
9 | #include <iostream>
10 | 
11 | #define CUDA_CALL(func) \
12 |   { \
13 |     cudaError_t e = (func); \
14 |     assert((e == cudaSuccess) || (e == cudaErrorCudartUnloading)); \
15 |   }
16 | 
17 | namespace tinyflow {
18 | namespace runtime {
19 | 
20 | static void GPUCopy(const void *from, void *to, size_t size,
21 |                     cudaMemcpyKind kind, cudaStream_t stream) {
22 |   if (stream != 0) {
23 |     CUDA_CALL(cudaMemcpyAsync(to, from, size, kind, stream));
24 |   } else {
25 |     CUDA_CALL(cudaMemcpy(to, from, size, kind));
26 |   }
27 | }
28 | 
29 | void *CUDADeviceAPI::AllocDataSpace(DLContext ctx, size_t size,
30 |                                     size_t alignment) {
31 |   // std::cout << "allocating cuda data" << std::endl;
32 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
33 |   assert((256 % alignment) == 0U); // << "CUDA space is aligned at 256 bytes";
34 |   void *ret;
35 |   CUDA_CALL(cudaMalloc(&ret, size));
36 |   return ret;
37 | }
38 | 
39 | void CUDADeviceAPI::FreeDataSpace(DLContext ctx, void *ptr) {
40 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
41 |   CUDA_CALL(cudaFree(ptr));
42 | }
43 | 
44 | void CUDADeviceAPI::CopyDataFromTo(const void *from, void *to, size_t size,
45 |                                    DLContext ctx_from, DLContext ctx_to,
46 |                                    DLStreamHandle stream) {
47 |   // std::cout << "copying cuda data" << std::endl;
48 |   cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
49 |   if (ctx_from.device_type == kGPU && ctx_to.device_type == kGPU) {
50 |     CUDA_CALL(cudaSetDevice(ctx_from.device_id));
51 |     if (ctx_from.device_id == ctx_to.device_id) {
52 |       GPUCopy(from, to, size, cudaMemcpyDeviceToDevice, cu_stream);
53 |     } else {
54 |       cudaMemcpyPeerAsync(to, ctx_to.device_id, from, ctx_from.device_id, size,
55 |                           cu_stream);
56 |     }
57 |   } else if (ctx_from.device_type == kGPU && ctx_to.device_type == kCPU) {
58 |     CUDA_CALL(cudaSetDevice(ctx_from.device_id));
59 |     GPUCopy(from, to, size, cudaMemcpyDeviceToHost, cu_stream);
60 |   } else if (ctx_from.device_type == kCPU && ctx_to.device_type == kGPU) {
61 |     CUDA_CALL(cudaSetDevice(ctx_to.device_id));
62 |     GPUCopy(from, to, size, cudaMemcpyHostToDevice, cu_stream);
63 |   } else {
64 |     std::cerr << "expect copy from/to GPU or between GPU" << std::endl;
65 |   }
66 | }
67 | 
68 | void CUDADeviceAPI::StreamSync(DLContext ctx, DLStreamHandle stream) {
69 |   CUDA_CALL(cudaSetDevice(ctx.device_id));
70 |   CUDA_CALL(cudaStreamSynchronize(static_cast<cudaStream_t>(stream)));
71 | }
72 | 
73 | } // namespace runtime
74 | }
// namespace tinyflow 75 | -------------------------------------------------------------------------------- /src/cuda_device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file device_api.h 3 | * \brief Device specific API 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 6 | #define TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include "device_api.h" 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | namespace tinyflow { 16 | namespace runtime { 17 | 18 | class CUDADeviceAPI : public DeviceAPI { 19 | public: 20 | void *AllocDataSpace(DLContext ctx, size_t size, size_t alignment) final; 21 | 22 | void FreeDataSpace(DLContext ctx, void *ptr) final; 23 | 24 | void CopyDataFromTo(const void *from, void *to, size_t size, 25 | DLContext ctx_from, DLContext ctx_to, 26 | DLStreamHandle stream) final; 27 | 28 | void StreamSync(DLContext ctx, DLStreamHandle stream) final; 29 | }; 30 | 31 | } // namespace runtime 32 | } // namespace tinyflow 33 | #endif // TINYFLOW_RUNTIME_CUDA_DEVICE_API_H_ 34 | -------------------------------------------------------------------------------- /src/device_api.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file device_api.h 3 | * \brief Device specific API 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_DEVICE_API_H_ 6 | #define TINYFLOW_RUNTIME_DEVICE_API_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include 10 | #include 11 | 12 | namespace tinyflow { 13 | namespace runtime { 14 | 15 | class DeviceAPI { 16 | public: 17 | /*! \brief virtual destructor */ 18 | virtual ~DeviceAPI() {} 19 | /*! 20 | * \brief Allocate a data space on device. 21 | * \param ctx The device context to perform operation. 22 | * \param size The size of the memory 23 | * \param alignment The alignment of the memory. 24 | * \return The allocated device pointer 25 | */ 26 | virtual void *AllocDataSpace(DLContext ctx, size_t size, 27 | size_t alignment) = 0; 28 | /*! 29 | * \brief Free a data space on device. 30 | * \param ctx The device context to perform operation. 31 | * \param ptr The data space. 32 | * \tparam xpu The device mask. 33 | */ 34 | virtual void FreeDataSpace(DLContext ctx, void *ptr) = 0; 35 | /*! 36 | * \brief copy data from one place to another 37 | * \param dev The device to perform operation. 38 | * \param from The source array. 39 | * \param to The target array. 40 | * \param size The size of the memory 41 | * \param ctx_from The source context 42 | * \param ctx_to The target context 43 | */ 44 | virtual void CopyDataFromTo(const void *from, void *to, size_t size, 45 | DLContext ctx_from, DLContext ctx_to, 46 | DLStreamHandle stream) = 0; 47 | /*! 48 | * \brief Synchronize the stream 49 | * \param ctx The context to perform operation. 50 | * \param stream The stream to be sync. 51 | */ 52 | virtual void StreamSync(DLContext ctx, DLStreamHandle stream) = 0; 53 | }; 54 | 55 | } // namespace runtime 56 | } // namespace tinyflow 57 | #endif // TINYFLOW_RUNTIME_DEVICE_API_H_ 58 | -------------------------------------------------------------------------------- /src/dlarray.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file dlarray.h 3 | * \brief Header that defines array struct. 
4 | */ 5 | #ifndef TINYFLOW_H_ 6 | #define TINYFLOW_H_ 7 | 8 | #ifdef __cplusplus 9 | #define TINYFLOW_EXTERN_C extern "C" 10 | #else 11 | #define TINYFLOW_EXTERN_C 12 | #endif 13 | 14 | #include 15 | #include 16 | 17 | TINYFLOW_EXTERN_C { 18 | /*! 19 | * \brief The device type in DLContext. 20 | */ 21 | typedef enum { 22 | kCPU = 1, 23 | kGPU = 2, 24 | } DLDeviceType; 25 | 26 | /*! 27 | * \brief A Device context for array. 28 | */ 29 | typedef struct { 30 | /*! \brief The device index */ 31 | int device_id; 32 | /*! \brief The device type used in the device. */ 33 | DLDeviceType device_type; 34 | } DLContext; 35 | 36 | /*! 37 | * \brief Plain C Array object, does not manage memory. 38 | */ 39 | typedef struct { 40 | /*! 41 | * \brief The opaque data pointer points to the allocated data. 42 | * This will be CUDA device pointer or cl_mem handle in OpenCL. 43 | * This pointer is always aligns to 256 bytes as in CUDA. 44 | */ 45 | void *data; 46 | /*! \brief The device context of the tensor */ 47 | DLContext ctx; 48 | /*! \brief Number of dimensions */ 49 | int ndim; 50 | /*! \brief The shape of the tensor */ 51 | int64_t *shape; 52 | } DLArray; 53 | 54 | } // TINYFLOW_EXTERN_C 55 | #endif // TINYFLOW_H_ 56 | -------------------------------------------------------------------------------- /src/gpu_op.cu: -------------------------------------------------------------------------------- 1 | #include "./c_runtime_api.h" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #define MAX_THREADS_NUM 512 10 | #define MAX_BLOCKS_NUM 4096 11 | #define BLOCK_NUM(count) min(((count + MAX_THREADS_NUM - 1) / MAX_THREADS_NUM), MAX_BLOCKS_NUM) 12 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 13 | for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ 14 | i += blockDim.x * gridDim.x) 15 | 16 | __global__ void matrix_array_set_kernel(int count, 17 | float *arr, 18 | float value) { 19 | CUDA_1D_KERNEL_LOOP(index, count) { 20 | arr[index] = value; 21 | } 22 | } 23 | 24 | __global__ void matrix_broadcast_to_kernel(int inputCount, float* inputArr, 25 | int outputCount, float* outputArr) { 26 | CUDA_1D_KERNEL_LOOP(index, outputCount) { 27 | outputArr[index] = inputArr[index % inputCount]; 28 | } 29 | } 30 | 31 | __global__ void matrix_reduce_sum_axis_zero_kernel(float* inputArr, 32 | int outputCount, float* outputArr, 33 | int zeroDim) { 34 | CUDA_1D_KERNEL_LOOP(index, outputCount) { 35 | float sum = 0; 36 | for (int i = 0; i < zeroDim; ++i) { 37 | sum += inputArr[index + i * outputCount]; 38 | } 39 | outputArr[index] = sum; 40 | } 41 | } 42 | 43 | __global__ void matrix_elementwise_add_kernel(float* matAData, float* matBData, 44 | float* outputData, int count) { 45 | CUDA_1D_KERNEL_LOOP(index, count) { 46 | outputData[index] = matAData[index] + matBData[index]; 47 | } 48 | } 49 | 50 | __global__ void matrix_elementwise_add_by_const_kernel(float* inputArr, float val, 51 | float* outputArr, int count) { 52 | CUDA_1D_KERNEL_LOOP(index, count) { 53 | outputArr[index] = inputArr[index] + val; 54 | } 55 | } 56 | 57 | __global__ void matrix_elementwise_multiply_kernel(float* matAData, float* matBData, 58 | float* outputData, int count) { 59 | CUDA_1D_KERNEL_LOOP(index, count) { 60 | outputData[index] = matAData[index] * matBData[index]; 61 | } 62 | } 63 | 64 | __global__ void matrix_elementwise_multipy_by_const_kernel(float* inputArr, float val, 65 | float* outputArr, int count) { 66 | CUDA_1D_KERNEL_LOOP(index, count) { 67 | outputArr[index] = inputArr[index] * val; 68 | } 69 | } 70 | 
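// The elementwise kernels above (and below) all share the same launch
// convention: at most MAX_BLOCKS_NUM blocks of MAX_THREADS_NUM threads, with
// CUDA_1D_KERNEL_LOOP acting as a grid-stride loop so arrays larger than
// blockDim.x * gridDim.x are still fully covered. A minimal host-side launch
// sketch (illustrative only; `count` and `dev_ptr` are hypothetical names,
// not part of this file):
//
//   int count = 1 << 20;                  // number of float elements
//   matrix_array_set_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
//       count, dev_ptr, 0.f);             // dev_ptr: a float* from cudaMalloc
//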
71 | __global__ void matrix_relu_kernel(float* inputArr, float* outputArr, int count) { 72 | CUDA_1D_KERNEL_LOOP(index, count) { 73 | outputArr[index] = inputArr[index]; 74 | if (inputArr[index] < 0) { 75 | outputArr[index] = 0.f; 76 | } 77 | } 78 | } 79 | 80 | __global__ void matrix_relu_gradient_kernel(const float* inputArr, const float* gradArr, 81 | float* outputArr, int count) { 82 | CUDA_1D_KERNEL_LOOP(index, count) { 83 | outputArr[index] = inputArr[index] > 0 ? gradArr[index] : 0; 84 | } 85 | } 86 | 87 | __global__ void matrix_softmax_kernel(int nRow, int nCol, float* inputArr, float* outputArr) { 88 | int y = blockIdx.x * blockDim.x + threadIdx.x; 89 | if (y >= nRow) return; 90 | 91 | float* input = inputArr + y * nCol; 92 | float* output = outputArr + y * nCol; 93 | 94 | float maxval = *input; 95 | for (int i = 1; i < nCol; ++i) { 96 | maxval = max(input[i], maxval); 97 | } 98 | float sum = 0; 99 | for (int i = 0; i < nCol; ++i) { 100 | sum += expf(input[i] - maxval); 101 | } 102 | for (int i = 0; i < nCol; ++i) { 103 | output[i] = expf(input[i] - maxval) / sum; 104 | } 105 | } 106 | 107 | /* all your GPU kernel code, e.g. matrix_softmax_cross_entropy_kernel */ 108 | 109 | // y = inputs[0], y_ = inputs[1] 110 | // np.mean(-np.sum(y_ * np.log(softmax(y)), axis=1), keepdims=True) 111 | __global__ void matrix_softmax_cross_entropy_kernel(int nrow, int ncol, 112 | const float *input_a, 113 | const float *input_b, 114 | float *output) { 115 | // Dynamic shared memory, size provided at kernel launch. 116 | extern __shared__ float loss_per_row[]; 117 | // Two dimensional thread blocks. 118 | int y = blockIdx.x * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + 119 | threadIdx.x; 120 | if (y >= nrow) { 121 | return; 122 | } 123 | input_a += y * ncol; 124 | input_b += y * ncol; 125 | float maxval = *input_a; 126 | // Find max for a row. 127 | for (int x = 1; x < ncol; ++x) { 128 | maxval = max(maxval, input_a[x]); 129 | } 130 | // Deduct by max for a row, and raise to exp. 131 | float sum = 0; 132 | for (int x = 0; x < ncol; ++x) { 133 | sum += exp(input_a[x] - maxval); 134 | } 135 | // Compute per-row loss. 136 | float loss = 0; 137 | for (int x = 0; x < ncol; ++x) { 138 | loss -= input_b[x] * log(exp(input_a[x] - maxval) / sum); 139 | } 140 | loss_per_row[y] = loss; 141 | __syncthreads(); 142 | // Compute reduce_mean across rows. 143 | float mean_loss = 0; 144 | // Use a single thread to reduce mean across rows. 
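  // This single-thread reduction is only correct because the host wrapper
  // below (DLGpuSoftmaxCrossEntropy) launches the kernel with exactly one
  // thread block and nrow * sizeof(float) bytes of dynamic shared memory,
  // so the __syncthreads() above has already made every row's loss visible
  // to thread (0, 0) before it averages them into output[0].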
145 |   if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
146 |     for (int i = 0; i < nrow; ++i) {
147 |       mean_loss += loss_per_row[i];
148 |     }
149 |     mean_loss /= nrow;
150 |     output[0] = mean_loss;
151 |   }
152 | }
153 | 
154 | int DLGpuArraySet(DLArrayHandle arr, float value) {
155 |   int count = 1;
156 |   for (int i = 0; i < arr->ndim; ++i) {
157 |     count *= arr->shape[i];
158 |   }
159 |   float *arr_data = (float *)arr->data;
160 |   matrix_array_set_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
161 |       count, arr_data, value);
162 |   return 0;
163 | }
164 | 
165 | int DLGpuBroadcastTo(const DLArrayHandle input, DLArrayHandle output) {
166 |   assert(input->ndim + 1 == output->ndim);
167 |   int inputCount = 1, outputCount = output->shape[0];
168 |   for (int i = 0; i < input->ndim; ++i) {
169 |     assert(input->shape[i] == output->shape[i + 1]);
170 |     inputCount *= input->shape[i];
171 |     outputCount *= output->shape[i + 1];
172 |   }
173 |   float* inputArr = (float*) input->data;
174 |   float* outputArr = (float*) output->data;
175 |   matrix_broadcast_to_kernel<<<BLOCK_NUM(outputCount), MAX_THREADS_NUM>>>(
176 |       inputCount, inputArr, outputCount, outputArr);
177 |   return 0;
178 | }
179 | 
180 | int DLGpuReduceSumAxisZero(const DLArrayHandle input, DLArrayHandle output) {
181 |   assert(input->ndim == output->ndim + 1);
182 |   int zeroDim = input->shape[0], outputCount = 1;
183 |   for (int i = 0; i < output->ndim; ++i) {
184 |     assert(input->shape[i+1] == output->shape[i]);
185 |     outputCount *= output->shape[i];
186 |   }
187 |   float* inputArr = (float*) input->data;
188 |   float* outputArr = (float*) output->data;
189 |   matrix_reduce_sum_axis_zero_kernel<<<BLOCK_NUM(outputCount), MAX_THREADS_NUM>>>(
190 |       inputArr, outputCount, outputArr, zeroDim);
191 |   return 0;
192 | }
193 | 
194 | int DLGpuMatrixElementwiseAdd(const DLArrayHandle matA,
195 |                               const DLArrayHandle matB, DLArrayHandle output) {
196 |   assert(matA->ndim == output->ndim);
197 |   assert(matB->ndim == output->ndim);
198 |   int count = 1;
199 |   for (int i = 0; i < matA->ndim; ++i) {
200 |     assert(matA->shape[i] == output->shape[i]);
201 |     assert(matB->shape[i] == output->shape[i]);
202 |     count *= matA->shape[i];
203 |   }
204 |   float* matAData = (float*) matA->data;
205 |   float* matBData = (float*) matB->data;
206 |   float* outputData = (float*) output->data;
207 |   matrix_elementwise_add_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
208 |       matAData, matBData, outputData, count);
209 |   return 0;
210 | }
211 | 
212 | int DLGpuMatrixElementwiseAddByConst(const DLArrayHandle input, float val,
213 |                                      DLArrayHandle output) {
214 |   assert(input->ndim == output->ndim);
215 |   int count = 1;
216 |   for (int i = 0; i < input->ndim; ++i) {
217 |     assert(input->shape[i] == output->shape[i]);
218 |     count *= input->shape[i];
219 |   }
220 |   float* inputArr = (float*) input->data;
221 |   float* outputArr = (float*) output->data;
222 |   matrix_elementwise_add_by_const_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
223 |       inputArr, val, outputArr, count);
224 |   return 0;
225 | }
226 | 
227 | int DLGpuMatrixElementwiseMultiply(const DLArrayHandle matA,
228 |                                    const DLArrayHandle matB,
229 |                                    DLArrayHandle output) {
230 |   assert(matA->ndim == output->ndim);
231 |   assert(matB->ndim == output->ndim);
232 |   int count = 1;
233 |   for (int i = 0; i < matA->ndim; ++i) {
234 |     assert(matA->shape[i] == output->shape[i]);
235 |     assert(matB->shape[i] == output->shape[i]);
236 |     count *= matA->shape[i];
237 |   }
238 |   float* matAData = (float*) matA->data;
239 |   float* matBData = (float*) matB->data;
240 |   float* outputData = (float*) output->data;
241 |   matrix_elementwise_multiply_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
242 |       matAData, matBData, outputData, count);
243 |   return 0;
244 | }
245 | 
246 | int
DLGpuMatrixMultiplyByConst(const DLArrayHandle input, float val,
247 |                            DLArrayHandle output) {
248 |   assert(input->ndim == output->ndim);
249 |   int count = 1;
250 |   for (int i = 0; i < input->ndim; ++i) {
251 |     assert(input->shape[i] == output->shape[i]);
252 |     count *= input->shape[i];
253 |   }
254 |   float* inputArr = (float*) input->data;
255 |   float* outputArr = (float*) output->data;
256 |   matrix_elementwise_multipy_by_const_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
257 |       inputArr, val, outputArr, count);
258 |   return 0;
259 | }
260 | 
261 | int DLGpuMatrixMultiply(const DLArrayHandle matA, bool transposeA,
262 |                         const DLArrayHandle matB, bool transposeB,
263 |                         DLArrayHandle matC) {
264 |   // Hint: use cublas
265 |   // cublas assume matrix is column major
266 |   assert(matA->ndim == 2);
267 |   assert(matB->ndim == 2);
268 |   assert(matC->ndim == 2);
269 |   assert(matA->shape[transposeA ? 0 : 1] == matB->shape[transposeB ? 1 : 0]);
270 |   assert(matA->shape[transposeA ? 1 : 0] == matC->shape[0]);
271 |   assert(matB->shape[transposeB ? 0 : 1] == matC->shape[1]);
272 | 
273 |   cublasHandle_t handle;
274 |   cublasCreate(&handle);
275 |   const float* matAData = (const float*) matA->data;
276 |   const float* matBData = (const float*) matB->data;
277 |   float* matCData = (float*) matC->data;
278 |   float alpha = 1, beta = 0;
279 | 
280 |   cublasSgemm(handle,
281 |               (transposeB ? CUBLAS_OP_T : CUBLAS_OP_N),
282 |               (transposeA ? CUBLAS_OP_T : CUBLAS_OP_N),
283 |               (transposeB ? matB->shape[0] : matB->shape[1]),
284 |               (transposeA ? matA->shape[1] : matA->shape[0]),
285 |               (transposeB ? matB->shape[1] : matB->shape[0]),
286 |               &alpha,
287 |               matBData, matB->shape[1],
288 |               matAData, matA->shape[1],
289 |               &beta,
290 |               matCData, (transposeB ? matB->shape[0] : matB->shape[1]));
291 | 
292 |   return 0;
293 | }
294 | 
295 | int DLGpuRelu(const DLArrayHandle input, DLArrayHandle output) {
296 |   assert(input->ndim == output->ndim);
297 |   int count = 1;
298 |   for (int i = 0; i < input->ndim; ++i) {
299 |     assert(input->shape[i] == output->shape[i]);
300 |     count *= input->shape[i];
301 |   }
302 |   float* inputArr = (float*) input->data;
303 |   float* outputArr = (float*) output->data;
304 |   matrix_relu_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
305 |       inputArr, outputArr, count);
306 |   return 0;
307 | }
308 | 
309 | int DLGpuReluGradient(const DLArrayHandle input, const DLArrayHandle in_grad,
310 |                       DLArrayHandle output) {
311 |   assert(input->ndim == in_grad->ndim);
312 |   assert(input->ndim == output->ndim);
313 |   int count = 1;
314 |   for (int i = 0; i < input->ndim; ++i) {
315 |     assert(input->shape[i] == in_grad->shape[i]);
316 |     assert(input->shape[i] == output->shape[i]);
317 |     count *= input->shape[i];
318 |   }
319 |   const float* inputArr = (const float*) input->data;
320 |   const float* gradArr = (const float*) in_grad->data;
321 |   float* outputArr = (float*) output->data;
322 |   matrix_relu_gradient_kernel<<<BLOCK_NUM(count), MAX_THREADS_NUM>>>(
323 |       inputArr, gradArr, outputArr, count);
324 |   return 0;
325 | }
326 | 
327 | int DLGpuSoftmax(const DLArrayHandle input, DLArrayHandle output) {
328 |   assert(input->ndim == 2);
329 |   assert(output->ndim == 2);
330 |   assert(input->shape[0] == output->shape[0]);
331 |   assert(input->shape[1] == output->shape[1]);
332 | 
333 |   int nRow = input->shape[0];
334 |   int nCol = input->shape[1];
335 | 
336 |   dim3 block(MAX_THREADS_NUM);
337 |   dim3 grid((nRow + block.x - 1) / block.x);
338 | 
339 |   float* inputArr = (float*) input->data;
340 |   float* outputArr = (float*) output->data;
341 | 
342 |   matrix_softmax_kernel<<<grid, block>>>(nRow, nCol, inputArr, outputArr);
343 | 
344 |   return 0;
345 | }
346 | 
347 | int
DLGpuSoftmaxCrossEntropy(const DLArrayHandle input_a, 348 | const DLArrayHandle input_b, 349 | DLArrayHandle output) { 350 | assert(input_a->ndim == 2); 351 | assert(input_b->ndim == 2); 352 | assert(output->ndim == 1); 353 | assert(input_a->shape[0] == input_b->shape[0] && 354 | input_a->shape[1] == input_b->shape[1]); 355 | int nrow = input_a->shape[0]; 356 | // Maximum x- or y-dimension of a block = 1024 357 | // But we need 'nrow' shared memory, and max shared memory is 48KB. 358 | // Conservatively allow max 16KB shared memory. 359 | assert(nrow <= 1024 * 4); 360 | int ncol = input_a->shape[1]; 361 | const float *input_data_a = (const float *)input_a->data; 362 | const float *input_data_b = (const float *)input_b->data; 363 | float *output_data = (float *)output->data; 364 | dim3 threads; 365 | if (nrow <= 1024) { 366 | threads.x = nrow; 367 | } else { 368 | threads.x = 1024; 369 | threads.y = (nrow + 1023) / 1024; 370 | } 371 | // 1 block, each block with 'threads' number of threads with 'nrow' shared 372 | // memory size 373 | matrix_softmax_cross_entropy_kernel<<<1, threads, nrow * sizeof(float)>>>( 374 | nrow, ncol, input_data_a, input_data_b, output_data); 375 | return 0; 376 | } 377 | -------------------------------------------------------------------------------- /src/runtime_base.h: -------------------------------------------------------------------------------- 1 | /*! 2 | * \file runtime_base.h 3 | * \brief Base of all C APIs 4 | */ 5 | #ifndef TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 6 | #define TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 7 | 8 | #include "c_runtime_api.h" 9 | #include 10 | 11 | /*! \brief macro to guard beginning and end section of all functions */ 12 | #define API_BEGIN() try { 13 | /*! 14 | * \brief every function starts with API_BEGIN(), and finishes with API_END() 15 | * or API_END_HANDLE_ERROR 16 | */ 17 | #define API_END() \ 18 | } \ 19 | catch (std::runtime_error & _except_) { \ 20 | return TINYFLOWAPIHandleException(_except_); \ 21 | } \ 22 | return 0; 23 | 24 | /*! 25 | * \brief every function starts with API_BEGIN() and finishes with API_END() or 26 | * API_END_HANDLE_ERROR. The finally clause contains procedure to cleanup states 27 | * when an error happens. 28 | */ 29 | #define API_END_HANDLE_ERROR(Finalize) \ 30 | } \ 31 | catch (std::runtime_error & _except_) { \ 32 | Finalize; \ 33 | return TINYFLOWAPIHandleException(_except_); \ 34 | } \ 35 | return 0; 36 | 37 | /*! 
38 | * \brief handle exception throwed out 39 | * \param e the exception 40 | * \return the return value of API after exception is handled 41 | */ 42 | inline int TINYFLOWAPIHandleException(const std::runtime_error &e) { 43 | // TODO 44 | // TVMAPISetLastError(e.what()); 45 | return -1; 46 | } 47 | 48 | #endif // TINYFLOW_RUNTIME_RUNTIME_BASE_H_ 49 | -------------------------------------------------------------------------------- /tests/autodiff_test.py: -------------------------------------------------------------------------------- 1 | from tinyflow import autodiff as ad 2 | import numpy as np 3 | 4 | 5 | def test_identity(): 6 | x2 = ad.Variable(name="x2") 7 | y = x2 8 | 9 | grad_x2, = ad.gradients(y, [x2]) 10 | 11 | executor = ad.Executor([y, grad_x2]) 12 | x2_val = 2 * np.ones(3) 13 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 14 | 15 | assert isinstance(y, ad.Node) 16 | assert np.array_equal(y_val, x2_val) 17 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 18 | 19 | 20 | def test_add_by_const(): 21 | x2 = ad.Variable(name="x2") 22 | y = 5 + x2 23 | 24 | grad_x2, = ad.gradients(y, [x2]) 25 | 26 | executor = ad.Executor([y, grad_x2]) 27 | x2_val = 2 * np.ones(3) 28 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 29 | 30 | assert isinstance(y, ad.Node) 31 | assert np.array_equal(y_val, x2_val + 5) 32 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 33 | 34 | 35 | def test_mul_by_const(): 36 | x2 = ad.Variable(name="x2") 37 | y = 5 * x2 38 | 39 | grad_x2, = ad.gradients(y, [x2]) 40 | 41 | executor = ad.Executor([y, grad_x2]) 42 | x2_val = 2 * np.ones(3) 43 | y_val, grad_x2_val = executor.run(feed_dict={x2: x2_val}) 44 | 45 | assert isinstance(y, ad.Node) 46 | assert np.array_equal(y_val, x2_val * 5) 47 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val) * 5) 48 | 49 | 50 | def test_add_two_vars(): 51 | x2 = ad.Variable(name="x2") 52 | x3 = ad.Variable(name="x3") 53 | y = x2 + x3 54 | 55 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 56 | 57 | executor = ad.Executor([y, grad_x2, grad_x3]) 58 | x2_val = 2 * np.ones(3) 59 | x3_val = 3 * np.ones(3) 60 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 61 | 62 | assert isinstance(y, ad.Node) 63 | assert np.array_equal(y_val, x2_val + x3_val) 64 | assert np.array_equal(grad_x2_val, np.ones_like(x2_val)) 65 | assert np.array_equal(grad_x3_val, np.ones_like(x3_val)) 66 | 67 | 68 | def test_mul_two_vars(): 69 | x2 = ad.Variable(name="x2") 70 | x3 = ad.Variable(name="x3") 71 | y = x2 * x3 72 | 73 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 74 | 75 | executor = ad.Executor([y, grad_x2, grad_x3]) 76 | x2_val = 2 * np.ones(3) 77 | x3_val = 3 * np.ones(3) 78 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 79 | 80 | assert isinstance(y, ad.Node) 81 | assert np.array_equal(y_val, x2_val * x3_val) 82 | assert np.array_equal(grad_x2_val, x3_val) 83 | assert np.array_equal(grad_x3_val, x2_val) 84 | 85 | 86 | def test_add_mul_mix_1(): 87 | x1 = ad.Variable(name="x1") 88 | x2 = ad.Variable(name="x2") 89 | x3 = ad.Variable(name="x3") 90 | y = x1 + x2 * x3 * x1 91 | 92 | grad_x1, grad_x2, grad_x3 = ad.gradients(y, [x1, x2, x3]) 93 | 94 | executor = ad.Executor([y, grad_x1, grad_x2, grad_x3]) 95 | x1_val = 1 * np.ones(3) 96 | x2_val = 2 * np.ones(3) 97 | x3_val = 3 * np.ones(3) 98 | y_val, grad_x1_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x1: x1_val, x2: x2_val, x3: x3_val}) 99 | 100 | assert isinstance(y, ad.Node) 101 | 
assert np.array_equal(y_val, x1_val + x2_val * x3_val) 102 | assert np.array_equal(grad_x1_val, np.ones_like(x1_val) + x2_val * x3_val) 103 | assert np.array_equal(grad_x2_val, x3_val * x1_val) 104 | assert np.array_equal(grad_x3_val, x2_val * x1_val) 105 | 106 | 107 | def test_add_mul_mix_2(): 108 | x1 = ad.Variable(name="x1") 109 | x2 = ad.Variable(name="x2") 110 | x3 = ad.Variable(name="x3") 111 | x4 = ad.Variable(name="x4") 112 | y = x1 + x2 * x3 * x4 113 | 114 | grad_x1, grad_x2, grad_x3, grad_x4 = ad.gradients(y, [x1, x2, x3, x4]) 115 | 116 | executor = ad.Executor([y, grad_x1, grad_x2, grad_x3, grad_x4]) 117 | x1_val = 1 * np.ones(3) 118 | x2_val = 2 * np.ones(3) 119 | x3_val = 3 * np.ones(3) 120 | x4_val = 4 * np.ones(3) 121 | y_val, grad_x1_val, grad_x2_val, grad_x3_val, grad_x4_val = executor.run( 122 | feed_dict={x1: x1_val, x2: x2_val, x3: x3_val, x4: x4_val} 123 | ) 124 | 125 | assert isinstance(y, ad.Node) 126 | assert np.array_equal(y_val, x1_val + x2_val * x3_val * x4_val) 127 | assert np.array_equal(grad_x1_val, np.ones_like(x1_val)) 128 | assert np.array_equal(grad_x2_val, x3_val * x4_val) 129 | assert np.array_equal(grad_x3_val, x2_val * x4_val) 130 | assert np.array_equal(grad_x4_val, x2_val * x3_val) 131 | 132 | 133 | def test_add_mul_mix_3(): 134 | x2 = ad.Variable(name="x2") 135 | x3 = ad.Variable(name="x3") 136 | z = x2 * x2 + x2 + x3 + 3 137 | y = z * z + x3 138 | 139 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 140 | 141 | executor = ad.Executor([y, grad_x2, grad_x3]) 142 | x2_val = 2 * np.ones(3) 143 | x3_val = 3 * np.ones(3) 144 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 145 | 146 | z_val = x2_val * x2_val + x2_val + x3_val + 3 147 | expected_yval = z_val * z_val + x3_val 148 | expected_grad_x2_val = 2 * (x2_val * x2_val + x2_val + x3_val + 3) * (2 * x2_val + 1) 149 | expected_grad_x3_val = 2 * (x2_val * x2_val + x2_val + x3_val + 3) + 1 150 | assert isinstance(y, ad.Node) 151 | assert np.array_equal(y_val, expected_yval) 152 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 153 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 154 | 155 | 156 | def test_grad_of_grad(): 157 | x2 = ad.Variable(name="x2") 158 | x3 = ad.Variable(name="x3") 159 | y = x2 * x2 + x2 * x3 160 | 161 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 162 | grad_x2_x2, grad_x2_x3 = ad.gradients(grad_x2, [x2, x3]) 163 | 164 | executor = ad.Executor([y, grad_x2, grad_x3, grad_x2_x2, grad_x2_x3]) 165 | x2_val = 2 * np.ones(3) 166 | x3_val = 3 * np.ones(3) 167 | y_val, grad_x2_val, grad_x3_val, grad_x2_x2_val, grad_x2_x3_val = executor.run( 168 | feed_dict={x2: x2_val, x3: x3_val} 169 | ) 170 | 171 | expected_yval = x2_val * x2_val + x2_val * x3_val 172 | expected_grad_x2_val = 2 * x2_val + x3_val 173 | expected_grad_x3_val = x2_val 174 | expected_grad_x2_x2_val = 2 * np.ones_like(x2_val) 175 | expected_grad_x2_x3_val = 1 * np.ones_like(x2_val) 176 | 177 | assert isinstance(y, ad.Node) 178 | assert np.array_equal(y_val, expected_yval) 179 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 180 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 181 | assert np.array_equal(grad_x2_x2_val, expected_grad_x2_x2_val) 182 | assert np.array_equal(grad_x2_x3_val, expected_grad_x2_x3_val) 183 | 184 | 185 | def test_matmul_two_vars(): 186 | x2 = ad.Variable(name="x2") 187 | x3 = ad.Variable(name="x3") 188 | y = ad.matmul_op(x2, x3) 189 | 190 | grad_x2, grad_x3 = ad.gradients(y, [x2, x3]) 191 | 192 | executor = ad.Executor([y, 
grad_x2, grad_x3]) 193 | x2_val = np.array([[1, 2], [3, 4], [5, 6]]) # 3x2 194 | x3_val = np.array([[7, 8, 9], [10, 11, 12]]) # 2x3 195 | 196 | y_val, grad_x2_val, grad_x3_val = executor.run(feed_dict={x2: x2_val, x3: x3_val}) 197 | 198 | expected_yval = np.matmul(x2_val, x3_val) 199 | expected_grad_x2_val = np.matmul(np.ones_like(expected_yval), np.transpose(x3_val)) 200 | expected_grad_x3_val = np.matmul(np.transpose(x2_val), np.ones_like(expected_yval)) 201 | 202 | assert isinstance(y, ad.Node) 203 | assert np.array_equal(y_val, expected_yval) 204 | assert np.array_equal(grad_x2_val, expected_grad_x2_val) 205 | assert np.array_equal(grad_x3_val, expected_grad_x3_val) 206 | 207 | 208 | def test_exp(): 209 | x1 = ad.Variable("x1") 210 | x2 = ad.exp_op(x1) 211 | x3 = x2 + 1 212 | x4 = x2 * x3 213 | 214 | x1_grad, = ad.gradients(x4, [x1]) 215 | 216 | executor = ad.Executor([x4]) 217 | x1_val = 1 218 | x4_val, x1_grad = executor.run(feed_dict={x1: x1_val}) 219 | print(x4_val) 220 | print(x1_grad) 221 | 222 | 223 | def test_exp_grad(): 224 | x = ad.Variable("x") 225 | y = ad.exp_op(x) 226 | 227 | x_grad, = ad.gradients(y, [x]) 228 | 229 | executor = ad.Executor([y, x_grad]) 230 | x_val = 1 231 | y_val, x_grad_val = executor.run(feed_dict={x: x_val}) 232 | print(y_val) 233 | print(x_grad_val) 234 | 235 | 236 | def test_lr(): 237 | W = ad.Variable(name="W") 238 | b = ad.Variable(name="b") 239 | X = ad.Variable(name="X") 240 | y_ = ad.Variable(name="y_") 241 | 242 | z = ad.matmul_op(X, W) + b 243 | loss = ad.sigmoidcrossentropy_op(z, y_) 244 | 245 | grad_W, grad_b = ad.gradients(loss, [W, b]) -------------------------------------------------------------------------------- /tests/mnist_dlsys.py: -------------------------------------------------------------------------------- 1 | from tinyflow import autodiff as ad 2 | from tinyflow import ndarray, gpu_op 3 | import numpy as np 4 | 5 | import argparse 6 | import six.moves.cPickle as pickle 7 | import gzip 8 | import os 9 | 10 | 11 | def load_mnist_data(dataset): 12 | """ Load the dataset 13 | Code adapted from http://deeplearning.net/tutorial/code/logistic_sgd.py 14 | 15 | :type dataset: string 16 | :param dataset: the path to the dataset (here MNIST) 17 | """ 18 | # Download the MNIST dataset if it is not present 19 | data_dir, data_file = os.path.split(dataset) 20 | if data_dir == "" and not os.path.isfile(dataset): 21 | # Check if dataset is in the data directory. 22 | new_path = os.path.join( 23 | os.path.split(__file__)[0], 24 | dataset 25 | ) 26 | if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz': 27 | dataset = new_path 28 | 29 | if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz': 30 | from six.moves import urllib 31 | origin = ( 32 | 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz' 33 | ) 34 | print('Downloading data from %s' % origin) 35 | urllib.request.urlretrieve(origin, dataset) 36 | 37 | print('Loading data...') 38 | 39 | # Load the dataset 40 | with gzip.open(dataset, 'rb') as f: 41 | try: 42 | train_set, valid_set, test_set = pickle.load(f, encoding='latin1') 43 | except: 44 | train_set, valid_set, test_set = pickle.load(f) 45 | # train_set, valid_set, test_set format: tuple(input, target) 46 | # input is a numpy.ndarray of 2 dimensions (a matrix), np.float32 47 | # where each row corresponds to an example. target is a 48 | # numpy.ndarray of 1 dimension (vector), np.int64 that has the same length 49 | # as the number of rows in the input. 
It should give the target 50 | # to the example with the same index in the input. 51 | return train_set, valid_set, test_set 52 | 53 | 54 | def convert_to_one_hot(vals): 55 | """Helper method to convert label array to one-hot array.""" 56 | one_hot_vals = np.zeros((vals.size, vals.max()+1)) 57 | one_hot_vals[np.arange(vals.size), vals] = 1 58 | return one_hot_vals 59 | 60 | 61 | def sgd_update_gpu(param, grad_param, learning_rate): 62 | """Helper GPU SGD update method. Avoids copying NDArray to cpu.""" 63 | assert isinstance(param, ndarray.NDArray) 64 | assert isinstance(grad_param, ndarray.NDArray) 65 | gpu_op.matrix_elementwise_multiply_by_const( 66 | grad_param, -learning_rate, grad_param) 67 | gpu_op.matrix_elementwise_add(param, grad_param, param) 68 | 69 | 70 | def mnist_logreg(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False): 71 | print("Build logistic regression model...") 72 | 73 | W1 = ad.Variable(name="W1") 74 | b1 = ad.Variable(name="b1") 75 | X = ad.Variable(name="X") 76 | y_ = ad.Variable(name="y_") 77 | 78 | z1 = ad.matmul_op(X, W1) 79 | y = z1 + ad.broadcastto_op(b1, z1) 80 | 81 | loss = ad.softmaxcrossentropy_op(y, y_) 82 | 83 | grad_W1, grad_b1 = ad.gradients(loss, [W1, b1]) 84 | executor = ad.Executor([loss, grad_W1, grad_b1, y], ctx=executor_ctx) 85 | 86 | # Read input data 87 | datasets = load_mnist_data("mnist.pkl.gz") 88 | train_set_x, train_set_y = datasets[0] 89 | valid_set_x, valid_set_y = datasets[1] 90 | test_set_x, test_set_y = datasets[2] 91 | 92 | # Set up minibatch 93 | batch_size = 1000 94 | n_train_batches = train_set_x.shape[0] // batch_size 95 | n_valid_batches = valid_set_x.shape[0] // batch_size 96 | 97 | print("Start training loop...") 98 | 99 | # Initialize parameters 100 | W1_val = np.zeros((784, 10)) 101 | b1_val = np.zeros((10)) 102 | X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 103 | y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 104 | valid_X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 105 | valid_y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 106 | if ndarray.is_gpu_ctx(executor_ctx): 107 | W1_val = ndarray.array(W1_val, ctx=executor_ctx) 108 | b1_val = ndarray.array(b1_val, ctx=executor_ctx) 109 | X_val = ndarray.array(X_val, ctx=executor_ctx) 110 | y_val = ndarray.array(y_val, ctx=executor_ctx) 111 | 112 | lr = 1e-3 113 | for i in range(num_epochs): 114 | print("epoch %d" % i) 115 | for minibatch_index in range(n_train_batches): 116 | minibatch_start = minibatch_index * batch_size 117 | minibatch_end = (minibatch_index + 1) * batch_size 118 | X_val[:] = train_set_x[minibatch_start:minibatch_end] 119 | y_val[:] = convert_to_one_hot( 120 | train_set_y[minibatch_start:minibatch_end]) 121 | loss_val, grad_W1_val, grad_b1_val, _ = executor.run( 122 | feed_dict = {X: X_val, y_: y_val, W1: W1_val, b1: b1_val}) 123 | # SGD update 124 | if (executor_ctx is None): 125 | W1_val = W1_val - lr * grad_W1_val 126 | b1_val = b1_val - lr * grad_b1_val 127 | else: 128 | sgd_update_gpu(W1_val, grad_W1_val, lr) 129 | sgd_update_gpu(b1_val, grad_b1_val, lr) 130 | if print_loss_val_each_epoch: 131 | if isinstance(loss_val, ndarray.NDArray): 132 | print(loss_val.asnumpy()) 133 | else: 134 | print(loss_val) 135 | 136 | correct_predictions = [] 137 | for minibatch_index in range(n_valid_batches): 138 | minibatch_start = minibatch_index * batch_size 139 | minibatch_end = (minibatch_index + 1) * batch_size 140 | valid_X_val[:] = valid_set_x[minibatch_start:minibatch_end] 141 | valid_y_val[:] = 
convert_to_one_hot( 142 | valid_set_y[minibatch_start:minibatch_end]) 143 | _, _, _, valid_y_predicted = executor.run( 144 | feed_dict={ 145 | X: valid_X_val, 146 | y_: valid_y_val, 147 | W1: W1_val, 148 | b1: b1_val}, 149 | convert_to_numpy_ret_vals=True) 150 | correct_prediction = np.equal( 151 | np.argmax(valid_y_val, 1), 152 | np.argmax(valid_y_predicted, 1)).astype(np.float) 153 | correct_predictions.extend(correct_prediction) 154 | accuracy = np.mean(correct_predictions) 155 | # validation set accuracy=0.928200 156 | print("validation set accuracy=%f" % accuracy) 157 | 158 | 159 | def mnist_mlp(executor_ctx=None, num_epochs=10, print_loss_val_each_epoch=False): 160 | print("Build 3-layer MLP model...") 161 | 162 | W1 = ad.Variable(name="W1") 163 | W2 = ad.Variable(name="W2") 164 | W3 = ad.Variable(name="W3") 165 | b1 = ad.Variable(name="b1") 166 | b2 = ad.Variable(name="b2") 167 | b3 = ad.Variable(name="b3") 168 | X = ad.Variable(name="X") 169 | y_ = ad.Variable(name="y_") 170 | 171 | # relu(X W1+b1) 172 | z1 = ad.matmul_op(X, W1) 173 | z2 = z1 + ad.broadcastto_op(b1, z1) 174 | z3 = ad.relu_op(z2) 175 | 176 | # relu(z3 W2+b2) 177 | z4 = ad.matmul_op(z3, W2) 178 | z5 = z4 + ad.broadcastto_op(b2, z4) 179 | z6 = ad.relu_op(z5) 180 | 181 | # softmax(z5 W2+b2) 182 | z7 = ad.matmul_op(z6, W3) 183 | y = z7 + ad.broadcastto_op(b3, z7) 184 | 185 | loss = ad.softmaxcrossentropy_op(y, y_) 186 | 187 | grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3 = ad.gradients( 188 | loss, [W1, W2, W3, b1, b2, b3]) 189 | executor = ad.Executor( 190 | [loss, grad_W1, grad_W2, grad_W3, grad_b1, grad_b2, grad_b3, y], 191 | ctx=executor_ctx) 192 | 193 | # Read input data 194 | datasets = load_mnist_data("mnist.pkl.gz") 195 | train_set_x, train_set_y = datasets[0] 196 | valid_set_x, valid_set_y = datasets[1] 197 | test_set_x, test_set_y = datasets[2] 198 | # Set up minibatch 199 | batch_size = 1000 200 | n_train_batches = train_set_x.shape[0] // batch_size 201 | n_valid_batches = valid_set_x.shape[0] // batch_size 202 | 203 | print("Start training loop...") 204 | 205 | # Initialize parameters 206 | rand = np.random.RandomState(seed=123) 207 | W1_val = rand.normal(scale=0.1, size=(784, 256)) 208 | W2_val = rand.normal(scale=0.1, size=(256, 100)) 209 | W3_val = rand.normal(scale=0.1, size=(100, 10)) 210 | b1_val = rand.normal(scale=0.1, size=(256)) 211 | b2_val = rand.normal(scale=0.1, size=(100)) 212 | b3_val = rand.normal(scale=0.1, size=(10)) 213 | X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 214 | y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 215 | valid_X_val = np.empty(shape=(batch_size, 784), dtype=np.float32) 216 | valid_y_val = np.empty(shape=(batch_size, 10), dtype=np.float32) 217 | if ndarray.is_gpu_ctx(executor_ctx): 218 | W1_val = ndarray.array(W1_val, ctx=executor_ctx) 219 | W2_val = ndarray.array(W2_val, ctx=executor_ctx) 220 | W3_val = ndarray.array(W3_val, ctx=executor_ctx) 221 | b1_val = ndarray.array(b1_val, ctx=executor_ctx) 222 | b2_val = ndarray.array(b2_val, ctx=executor_ctx) 223 | b3_val = ndarray.array(b3_val, ctx=executor_ctx) 224 | X_val = ndarray.array(X_val, ctx=executor_ctx) 225 | y_val = ndarray.array(y_val, ctx=executor_ctx) 226 | 227 | lr = 1.0e-3 228 | for i in range(num_epochs): 229 | print("epoch %d" % i) 230 | for minibatch_index in range(n_train_batches): 231 | minibatch_start = minibatch_index * batch_size 232 | minibatch_end = (minibatch_index + 1) * batch_size 233 | X_val[:] = train_set_x[minibatch_start:minibatch_end] 234 | y_val[:] = 
convert_to_one_hot( 235 | train_set_y[minibatch_start:minibatch_end]) 236 | loss_val, grad_W1_val, grad_W2_val, grad_W3_val, \ 237 | grad_b1_val, grad_b2_val, grad_b3_val, _ = executor.run( 238 | feed_dict={ 239 | X: X_val, 240 | y_: y_val, 241 | W1: W1_val, 242 | W2: W2_val, 243 | W3: W3_val, 244 | b1: b1_val, 245 | b2: b2_val, 246 | b3: b3_val}) 247 | # SGD update 248 | if (executor_ctx is None): 249 | W1_val = W1_val - lr * grad_W1_val 250 | W2_val = W2_val - lr * grad_W2_val 251 | W3_val = W3_val - lr * grad_W3_val 252 | b1_val = b1_val - lr * grad_b1_val 253 | b2_val = b2_val - lr * grad_b2_val 254 | b3_val = b3_val - lr * grad_b3_val 255 | else: 256 | sgd_update_gpu(W1_val, grad_W1_val, lr) 257 | sgd_update_gpu(W2_val, grad_W2_val, lr) 258 | sgd_update_gpu(W3_val, grad_W3_val, lr) 259 | sgd_update_gpu(b1_val, grad_b1_val, lr) 260 | sgd_update_gpu(b2_val, grad_b2_val, lr) 261 | sgd_update_gpu(b3_val, grad_b3_val, lr) 262 | if print_loss_val_each_epoch: 263 | if isinstance(loss_val, ndarray.NDArray): 264 | print(loss_val.asnumpy()) 265 | else: 266 | print(loss_val) 267 | 268 | correct_predictions = [] 269 | for minibatch_index in range(n_valid_batches): 270 | minibatch_start = minibatch_index * batch_size 271 | minibatch_end = (minibatch_index + 1) * batch_size 272 | valid_X_val[:] = valid_set_x[minibatch_start:minibatch_end] 273 | valid_y_val[:] = convert_to_one_hot( 274 | valid_set_y[minibatch_start:minibatch_end]) 275 | _, _, _, _, _, _, _, valid_y_predicted = executor.run( 276 | feed_dict={ 277 | X: valid_X_val, 278 | y_: valid_y_val, 279 | W1: W1_val, 280 | W2: W2_val, 281 | W3: W3_val, 282 | b1: b1_val, 283 | b2: b2_val, 284 | b3: b3_val}, 285 | convert_to_numpy_ret_vals=True) 286 | correct_prediction = np.equal( 287 | np.argmax(valid_y_val, 1), 288 | np.argmax(valid_y_predicted, 1)).astype(np.float) 289 | correct_predictions.extend(correct_prediction) 290 | accuracy = np.mean(correct_predictions) 291 | # validation set accuracy=0.970800 292 | print("validation set accuracy=%f" % accuracy) 293 | 294 | 295 | if __name__ == "__main__": 296 | parser = argparse.ArgumentParser() 297 | parser.add_argument( 298 | "-m", "--model", 299 | help="Choose model: all, logreg, mlp", default="all") 300 | parser.add_argument( 301 | "-c", "--executor_context", 302 | help="Choose executor context: numpy, gpu", default="numpy") 303 | parser.add_argument( 304 | "-e", "--num_epoch", 305 | help="Provide number of epochs to train.", type=int, default=20) 306 | parser.add_argument( 307 | "-l", "--print_loss_val_each_epoch", 308 | help="Print loss value at the end of each epoch", action="store_true") 309 | args = parser.parse_args() 310 | 311 | models = [] 312 | executor_ctx = None 313 | print_loss_val_each_epoch = False 314 | if args.model == "logreg": 315 | models = [mnist_logreg] 316 | elif args.model == "mlp": 317 | models = [mnist_mlp] 318 | elif args.model == "all": 319 | models = [mnist_logreg, mnist_mlp] 320 | 321 | if args.executor_context == "numpy": 322 | executor_ctx = None 323 | elif args.executor_context == "gpu": 324 | # Assume only use gpu 0. 
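        # ndarray.gpu(0) is expected to produce a DLContext with device_id=0 and
        # device_type=kGPU (see src/dlarray.h); running with "-c gpu" therefore
        # assumes build/lib/libc_runtime_api.so has been compiled and a CUDA
        # device is visible.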
325 | executor_ctx = ndarray.gpu(0) 326 | 327 | if args.print_loss_val_each_epoch: 328 | print_loss_val_each_epoch = True 329 | 330 | num_epochs = args.num_epoch 331 | for m in models: 332 | import time 333 | tic = time.time() 334 | m(executor_ctx, num_epochs, print_loss_val_each_epoch) 335 | toc = time.time() 336 | print("mode use time: " + str(toc - tic)) 337 | -------------------------------------------------------------------------------- /tests/test_gpu_op.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tinyflow import ndarray, gpu_op, autodiff 3 | 4 | 5 | def test_array_set(): 6 | ctx = ndarray.gpu(0) 7 | shape = (500, 200) 8 | # oneslike 9 | arr_x = ndarray.empty(shape, ctx=ctx) 10 | gpu_op.array_set(arr_x, 1.) 11 | x = arr_x.asnumpy() 12 | np.testing.assert_allclose(np.ones(shape), x) 13 | # zeroslike 14 | gpu_op.array_set(arr_x, 0.) 15 | x = arr_x.asnumpy() 16 | np.testing.assert_allclose(np.zeros(shape), x) 17 | 18 | 19 | def test_broadcast_to(): 20 | ctx = ndarray.gpu(0) 21 | shape = (200, 300) 22 | to_shape = (130, 200, 300) 23 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 24 | arr_x = ndarray.array(x, ctx=ctx) 25 | arr_y = ndarray.empty(to_shape, ctx=ctx) 26 | gpu_op.broadcast_to(arr_x, arr_y) 27 | y = arr_y.asnumpy() 28 | np.testing.assert_allclose(np.broadcast_to(x, to_shape), y) 29 | 30 | 31 | def test_reduce_sum_axis_zero(): 32 | ctx = ndarray.gpu(0) 33 | shape = (500, 200, 100) 34 | to_shape = (200, 100) 35 | x = np.random.uniform(0, 20, shape).astype(np.float32) 36 | arr_x = ndarray.array(x, ctx=ctx) 37 | arr_y = ndarray.empty(to_shape, ctx=ctx) 38 | gpu_op.reduce_sum_axis_zero(arr_x, arr_y) 39 | y = arr_y.asnumpy() 40 | y_ = np.sum(x, axis=0) 41 | for index, _ in np.ndenumerate(y): 42 | v = y[index] 43 | v_ = y_[index] 44 | if abs((v - v_) / v_) > 1e-4: 45 | print(index, v, v_) 46 | np.testing.assert_allclose(np.sum(x, axis=0), y, rtol=1e-5) 47 | 48 | 49 | def test_matrix_elementwise_add(): 50 | ctx = ndarray.gpu(0) 51 | shape = (500, 200) 52 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 53 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 54 | arr_x = ndarray.array(x, ctx=ctx) 55 | arr_y = ndarray.array(y, ctx=ctx) 56 | arr_z = ndarray.empty(shape, ctx=ctx) 57 | gpu_op.matrix_elementwise_add(arr_x, arr_y, arr_z) 58 | z = arr_z.asnumpy() 59 | np.testing.assert_allclose(x + y, z, rtol=1e-5) 60 | 61 | 62 | def test_matrix_elementwise_add_by_const(): 63 | shape = (2000, 3000) 64 | ctx = ndarray.gpu(0) 65 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 66 | val = np.random.uniform(-5, 5) 67 | arr_x = ndarray.array(x, ctx=ctx) 68 | arr_y = ndarray.empty(shape, ctx=ctx) 69 | gpu_op.matrix_elementwise_add_by_const(arr_x, val, arr_y) 70 | y = arr_y.asnumpy() 71 | np.testing.assert_allclose(x + val, y, rtol=1e-5) 72 | 73 | 74 | def test_matrix_elementwise_multiply(): 75 | ctx = ndarray.gpu(0) 76 | shape = (500, 200) 77 | x = np.random.uniform(0, 10, size=shape).astype(np.float32) 78 | y = np.random.uniform(0, 10, size=shape).astype(np.float32) 79 | arr_x = ndarray.array(x, ctx=ctx) 80 | arr_y = ndarray.array(y, ctx=ctx) 81 | arr_z = ndarray.empty(shape, ctx=ctx) 82 | gpu_op.matrix_elementwise_multiply(arr_x, arr_y, arr_z) 83 | z = arr_z.asnumpy() 84 | np.testing.assert_allclose(x * y, z, rtol=1e-5) 85 | 86 | 87 | def test_matrix_elementwise_multiply_by_const(): 88 | shape = (2000, 3000) 89 | ctx = ndarray.gpu(0) 90 | x = np.random.uniform(0, 10, 
size=shape).astype(np.float32) 91 | val = np.random.uniform(-5, 5) 92 | arr_x = ndarray.array(x, ctx=ctx) 93 | arr_y = ndarray.empty(shape, ctx=ctx) 94 | gpu_op.matrix_elementwise_multiply_by_const(arr_x, val, arr_y) 95 | y = arr_y.asnumpy() 96 | np.testing.assert_allclose(x * val, y, rtol=1e-5) 97 | 98 | 99 | def test_matrix_multiply(): 100 | ctx = ndarray.gpu(0) 101 | x = np.random.uniform(0, 10, size=(500, 700)).astype(np.float32) 102 | y = np.random.uniform(0, 10, size=(700, 1000)).astype(np.float32) 103 | arr_x = ndarray.array(x, ctx=ctx) 104 | arr_y = ndarray.array(y, ctx=ctx) 105 | arr_z = ndarray.empty((500, 1000), ctx=ctx) 106 | gpu_op.matrix_multiply(arr_x, False, arr_y, False, arr_z) 107 | z = arr_z.asnumpy() 108 | np.testing.assert_allclose(np.dot(x, y), z, rtol=1e-5) 109 | 110 | x = np.random.uniform(0, 10, size=(1000, 500)).astype(np.float32) 111 | y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32) 112 | arr_x = ndarray.array(x, ctx=ctx) 113 | arr_y = ndarray.array(y, ctx=ctx) 114 | arr_z = ndarray.empty((1000, 2000), ctx=ctx) 115 | gpu_op.matrix_multiply(arr_x, False, arr_y, True, arr_z) 116 | z = arr_z.asnumpy() 117 | np.testing.assert_allclose(np.dot(x, np.transpose(y)), z, rtol=1e-5) 118 | 119 | x = np.random.uniform(0, 10, size=(500, 1000)).astype(np.float32) 120 | y = np.random.uniform(0, 10, size=(2000, 500)).astype(np.float32) 121 | arr_x = ndarray.array(x, ctx=ctx) 122 | arr_y = ndarray.array(y, ctx=ctx) 123 | arr_z = ndarray.empty((1000, 2000), ctx=ctx) 124 | gpu_op.matrix_multiply(arr_x, True, arr_y, True, arr_z) 125 | z = arr_z.asnumpy() 126 | np.testing.assert_allclose(np.dot(np.transpose(x), np.transpose(y)), z, 127 | rtol=1e-5) 128 | 129 | 130 | def test_relu(): 131 | shape = (2000, 2500) 132 | ctx = ndarray.gpu(0) 133 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 134 | arr_x = ndarray.array(x, ctx=ctx) 135 | arr_y = ndarray.empty(shape, ctx=ctx) 136 | gpu_op.relu(arr_x, arr_y) 137 | y = arr_y.asnumpy() 138 | np.testing.assert_allclose(np.maximum(x, 0).astype(np.float32), y) 139 | 140 | 141 | def test_relu_gradient(): 142 | shape = (2000, 2500) 143 | ctx = ndarray.gpu(0) 144 | x = np.random.uniform(-1, 1, shape).astype(np.float32) 145 | grad_x = np.random.uniform(-5, 5, shape).astype(np.float32) 146 | arr_x = ndarray.array(x, ctx=ctx) 147 | arr_grad_x = ndarray.array(grad_x, ctx=ctx) 148 | arr_y = ndarray.empty(shape, ctx=ctx) 149 | gpu_op.relu_gradient(arr_x, arr_grad_x, arr_y) 150 | y = arr_y.asnumpy() 151 | np.testing.assert_allclose(((x > 0) * grad_x).astype(np.float32), y) 152 | 153 | 154 | def test_softmax(): 155 | ctx = ndarray.gpu(0) 156 | shape = (400, 1000) 157 | x = np.random.uniform(-5, 5, shape).astype(np.float32) 158 | arr_x = ndarray.array(x, ctx=ctx) 159 | arr_y = ndarray.empty(shape, ctx=ctx) 160 | gpu_op.softmax(arr_x, arr_y) 161 | y = arr_y.asnumpy() 162 | np.testing.assert_allclose(autodiff.softmax_func(x), y, rtol=1e-5) 163 | 164 | 165 | def test_softmax_cross_entropy(): 166 | ctx = ndarray.gpu(0) 167 | shape = (400, 1000) 168 | y = np.random.uniform(-5, 5, shape).astype(np.float32) 169 | y_ = np.random.uniform(-5, 5, shape).astype(np.float32) 170 | arr_y = ndarray.array(y, ctx=ctx) 171 | arr_y_ = ndarray.array(y_, ctx=ctx) 172 | arr_out = ndarray.empty((1,), ctx=ctx) 173 | gpu_op.softmax_cross_entropy(arr_y, arr_y_, arr_out) 174 | out = arr_out.asnumpy() 175 | # numpy calculation 176 | cross_entropy = np.mean( 177 | -np.sum(y_ * np.log(autodiff.softmax_func(y)), axis=1), keepdims=True) 178 | 
np.testing.assert_allclose(cross_entropy, out, rtol=1e-5) 179 | --------------------------------------------------------------------------------
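One possible way to sanity-check a build is to run the test files above with pytest. This assumes pytest is installed, that the `python/` directory is on `PYTHONPATH`, and, for the GPU tests, that `build/lib/libc_runtime_api.so` has been compiled and a CUDA device is visible:

```shell
export PYTHONPATH="$(pwd)/python:${PYTHONPATH}"
python -m pytest tests/autodiff_test.py   # CPU-only autodiff checks
python -m pytest tests/test_gpu_op.py     # exercises the GPU kernels directly
```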