├── .gitignore ├── LICENSE ├── README.md ├── README.zh-cn.md ├── examples ├── 00-download_mnist.py ├── 01-simple_forward.py ├── 02-simple_backward.py ├── 03-simple_network.py ├── 04-introduce_layer.py ├── 05-simple_network_with_layer.py ├── 06-introduce_optimizer.py ├── 07-introduce_optimizer_2.py ├── 08-introduce_weight_decay.py ├── 09-introduce_batch_normalization.py ├── 10-introduce_dropout.py ├── 11-hyperparam_search.py └── 12-CNN_and_digits_recognition.py ├── setup.py └── tinynn ├── __init__.py ├── conv_network.py ├── full_connect_network.py ├── functions.py ├── gradient.py ├── layers.py ├── mnist.py ├── optimizer.py ├── trainer.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_Store 2 | .vscode 3 | .idea 4 | __pycache__ 5 | *.pyc 6 | *.egg-info 7 | htmlcov 8 | .coverage* 9 | build 10 | dist 11 | .cache 12 | 13 | dataset 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Peter Ye 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TinyNN 2 | 3 | English | [简体中文](README.zh-cn.md) 4 | 5 | Build a Complete Neural Network in Less Than 2,000 Lines of Code 6 | 7 | A minimized construction of neural network components. Supports both **Fully Connected Neural Network (FCNN)** and **Convolutional Neural Network (CNN)** for learning purposes. 
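A quick taste of the API — a minimal sketch assembled from the scripts in `examples/` (see those files for the full, runnable walkthroughs; the hyperparameters below are only illustrative):

```python
from tinynn.mnist import load_mnist
from tinynn.full_connect_network import MultiLayerNet
from tinynn.trainer import Trainer

# Load MNIST (downloaded to ./dataset on first use) and build a small FCNN
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)
network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100], output_size=10)

# Train with SGD for a few epochs and report the final test accuracy
trainer = Trainer(network, x_train, t_train, x_test, t_test,
                  epochs=5, mini_batch_size=100,
                  optimizer='sgd', optimizer_param={'lr': 0.01}, verbose=True)
trainer.train()
print(f"final test acc: {trainer.test_acc_list[-1]}")
```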
8 | 9 | ## Installation 10 | 11 | python version > 3.8 12 | 13 | In your working directory, run: `python setup.py install` 14 | 15 | ## Tutorial Examples 16 | 17 | In the `examples` directory, there are many sample codes: 18 | 19 | ```bash 20 | ├─examples 21 | │ 00-download_mnist.py # Download the mnist dataset 22 | │ 01-simple_forward.py # Understand the simplest forward computation 23 | │ 02-simple_backward.py # Understand backward computation based on numerical differentiation 24 | │ 03-simple_network.py # Implement a two-layer neural network with both forward and backward computations 25 | │ 04-introduce_layer.py # Introduce the concept of the layer 26 | │ 05-simple_network_with_layer.py # Implement a neural network based on the layer concept 27 | │ 06-introduce_optimizer.py # Introduce the concept of the optimizer and compare based on a specific function 28 | │ 07-introduce_optimizer_2.py # Apply the optimizer in a real network 29 | │ 08-introduce_weight_decay.py # Introduce the concept of weight decay 30 | │ 09-introduce_batch_normalization.py # Introduce the concept of batch normalization 31 | │ 10-introduce_dropout.py # Introduce the concept of dropout 32 | │ 11-hyperparam_search.py # Implement hyperparameter search 33 | │ 12-CNN_and_digits_recognition.py # Recognize handwritten digits using CNN 34 | ``` 35 | 36 | ## References 37 | 38 | - [pytorch](https://github.com/pytorch/pytorch) 39 | - [tinynn](https://github.com/borgwang/tinynn) 40 | - 《深度学习入门-基于Python的理论与实现》 41 | -------------------------------------------------------------------------------- /README.zh-cn.md: -------------------------------------------------------------------------------- 1 | # TinyNN 2 | 3 | [English](README.md) | 简体中文 4 | 5 | 使用不到两千行代码构建一个完整的神经网络 6 | 7 | 最小化构建的神经网络运行组件,支持**全连接神经网络(FCNN)**和**卷积神经网络(CNN)**,用于学习目的。 8 | 9 | ## 安装 10 | 11 | python version > 3.8 12 | 13 | 在工作目录下,`python setup.py install` 14 | 15 | ## 教程示例 16 | 17 | 在`examples`目录下有许多示例代码和对应注解 18 | 19 | ```bash 20 | ├─examples 21 | │ 00-download_mnist.py # 下载mnist数据集 22 | │ 01-simple_forward.py # 理解最简单的前向计算 23 | │ 02-simple_backward.py # 基于数值微分理解反向计算 24 | │ 03-simple_network.py # 实现包括前向和反向的两层神经网络 25 | │ 04-introduce_layer.py # 引入layer层的概念 26 | │ 05-simple_network_with_layer.py # 基于layer实现神经网络 27 | │ 06-introduce_optimizer.py # 引入优化器的概念,并基于一个特定函数比较 28 | │ 07-introduce_optimizer_2.py # 在实际网络中运用优化器 29 | │ 08-introduce_weight_decay.py # 引入权值衰减的概念 30 | │ 09-introduce_batch_normalization.py # 引入批标准化的概念 31 | │ 10-introduce_dropout.py # 引入dropout的概念 32 | │ 11-hyperparam_search.py # 实现超参数搜索 33 | │ 12-CNN_and_digits_recognition.py # 基于CNN完成手写数字识别过程 34 | ``` 35 | 36 | ## 参考 37 | 38 | - [pytorch](https://github.com/pytorch/pytorch) 39 | - [tinynn](https://github.com/borgwang/tinynn) 40 | - 《深度学习入门-基于Python的理论与实现》 41 | -------------------------------------------------------------------------------- /examples/00-download_mnist.py: -------------------------------------------------------------------------------- 1 | from tinynn.mnist import load_mnist 2 | 3 | if __name__ == '__main__': 4 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, 5 | normalize=False, 6 | one_hot_label=True) 7 | img = x_train[0] 8 | label = t_train[0] 9 | print(label) # 5 10 | print(img.shape) # (784,) 11 | img = img.reshape(28, 28) 12 | print(img.shape) # (28, 28) 13 | -------------------------------------------------------------------------------- /examples/01-simple_forward.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | 3 | 4 | def sigmoid(x: np.ndarray) -> np.ndarray: 5 | return 1/(1+np.exp(-x)) 6 | 7 | 8 | def identity_function(x: np.ndarray) -> np.ndarray: 9 | # 激活函数,我们这里使用等值激活函数便于理解 10 | return x 11 | 12 | 13 | def init_network() -> dict: 14 | network = {} 15 | network['W1'] = np.array([[0.1, 0.3, 0.5], [0.2, 0.4, 0.6]]) 16 | network['b1'] = np.array([0.1, 0.2, 0.3]) 17 | network['W2'] = np.array([[0.1, 0.4], [0.2, 0.5], [0.3, 0.6]]) 18 | network['b2'] = np.array([0.1, 0.2]) 19 | network['W3'] = np.array([[0.1, 0.3], [0.2, 0.4]]) 20 | network['b3'] = np.array([0.1, 0.2]) 21 | return network 22 | 23 | 24 | def forward(network, x): 25 | W1, W2, W3 = network['W1'], network['W2'], network['W3'] 26 | b1, b2, b3 = network['b1'], network['b2'], network['b3'] 27 | a1 = np.dot(x, W1) + b1 28 | z1 = sigmoid(a1) 29 | a2 = np.dot(z1, W2) + b2 30 | z2 = sigmoid(a2) 31 | a3 = np.dot(z2, W3) + b3 32 | y = identity_function(a3) 33 | return y 34 | 35 | if __name__ == '__main__': 36 | 37 | network = init_network() 38 | x = np.array([1.0, 0.5]) 39 | y = forward(network, x) 40 | print(y) # [ 0.31682708 0.69627909] 41 | -------------------------------------------------------------------------------- /examples/02-simple_backward.py: -------------------------------------------------------------------------------- 1 | from tinynn.gradient import numerical_gradient 2 | from tinynn.functions import softmax, cross_entropy_error 3 | import numpy as np 4 | 5 | 6 | class simpleNet: 7 | 8 | def __init__(self): 9 | self.W = np.random.randn(2, 3) # 用标准正态分布进行初始化,二行三列的数组。 10 | 11 | def predict(self, x): 12 | return np.dot(x, self.W) 13 | 14 | def loss(self, x, t): 15 | z = self.predict(x) 16 | y = softmax(z) 17 | loss = cross_entropy_error(y, t) 18 | return loss 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | net = simpleNet() 24 | print(f'初始化权重矩阵: {net.W}') 25 | x = np.array([0.6, 0.9]) 26 | print(f'初始化输入值: {x}') 27 | p = net.predict(x) 28 | print(f'forward结果: {p}') 29 | print(f'最大值索引: {np.argmax(p)}') 30 | t = np.array([0, 0, 1]) 31 | print(f'loss: {net.loss(x, t)}') 32 | 33 | func = lambda _: net.loss(x, t) # 定义一个无参的匿名函数,调用net.loss来求导 34 | 35 | dW = numerical_gradient(func, net.W) # 对权重基于损失函数求导 36 | print(f'权重矩阵梯度: {dW}') # 正值表示应该参数应向负方向更新,来减少损失函数。同理负值的对应参数应该向正方向更新 37 | -------------------------------------------------------------------------------- /examples/03-simple_network.py: -------------------------------------------------------------------------------- 1 | from tinynn.mnist import load_mnist 2 | from tinynn.gradient import numerical_gradient 3 | from tinynn.functions import (sigmoid, softmax, cross_entropy_error, 4 | sigmoid_grad) 5 | import numpy as np 6 | 7 | 8 | class SimpleTwoLayerNet: 9 | """ 最简单的两层神经网络,包括前向和反向 """ 10 | 11 | def __init__(self, 12 | input_size: int, 13 | hidden_size: int, 14 | output_size: int, 15 | weight_init_std: float = 0.01) -> None: 16 | # 初始化权重 17 | self.params = {} 18 | self.params['W1'] = weight_init_std * np.random.randn( 19 | input_size, hidden_size) 20 | self.params['b1'] = np.zeros(hidden_size) 21 | self.params['W2'] = weight_init_std * np.random.randn( 22 | hidden_size, output_size) 23 | self.params['b2'] = np.zeros(output_size) 24 | 25 | def predict(self, x: np.ndarray) -> np.ndarray: 26 | W1, W2 = self.params['W1'], self.params['W2'] 27 | b1, b2 = self.params['b1'], self.params['b2'] 28 | a1 = np.dot(x, W1) + b1 29 | z1 = sigmoid(a1) 30 | a2 = np.dot(z1, W2) + b2 31 | return softmax(a2) 32 | 33 | def loss(self, x: np.ndarray, t: np.ndarray) -> np.ndarray: 34 | y = 
self.predict(x) 35 | return cross_entropy_error(y, t) 36 | 37 | def accuracy(self, x: np.ndarray, t: np.ndarray): 38 | y = self.predict(x) 39 | y = np.argmax(y, axis=1) 40 | t = np.argmax(t, axis=1) 41 | return np.sum(y == t) / float(x.shape[0]) 42 | 43 | def numerical_gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 44 | """ 计算梯度方式1:普通方式计算梯度 """ 45 | 46 | func = lambda _: self.loss(x, t) 47 | 48 | grads = {} 49 | grads['W1'] = numerical_gradient(func, self.params['W1']) 50 | grads['b1'] = numerical_gradient(func, self.params['b1']) 51 | grads['W2'] = numerical_gradient(func, self.params['W2']) 52 | grads['b2'] = numerical_gradient(func, self.params['b2']) 53 | return grads 54 | 55 | def gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 56 | """ 计算梯度方式2:使用反向传播方式高速计算梯度 """ 57 | W1, W2 = self.params['W1'], self.params['W2'] 58 | b1, b2 = self.params['b1'], self.params['b2'] 59 | grads = {} 60 | 61 | batch_num = x.shape[0] 62 | 63 | # forward 64 | a1 = np.dot(x, W1) + b1 65 | z1 = sigmoid(a1) 66 | a2 = np.dot(z1, W2) + b2 67 | y = softmax(a2) 68 | 69 | # backward 70 | dy = (y - t) / batch_num 71 | grads['W2'] = np.dot(z1.T, dy) 72 | grads['b2'] = np.sum(dy, axis=0) 73 | 74 | da1 = np.dot(dy, W2.T) 75 | dz1 = sigmoid_grad(a1) * da1 76 | grads['W1'] = np.dot(x.T, dz1) 77 | grads['b1'] = np.sum(dz1, axis=0) 78 | 79 | return grads 80 | 81 | 82 | def test() -> None: 83 | net = SimpleTwoLayerNet(input_size=784, hidden_size=100, output_size=10) 84 | print(net.params['W1'].shape) # (784, 100) 85 | x = np.random.rand(100, 784) # 伪输入数据(100笔) 86 | t = np.random.rand(100, 10) # 伪正确解标签(100笔) 87 | y = net.predict(x) 88 | print(y.shape) 89 | # grads = net.numerical_gradient(x, t) 90 | grads = net.gradient(x, t) 91 | print(grads) 92 | 93 | 94 | def train() -> None: 95 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, 96 | one_hot_label=True) 97 | train_loss_list = [] 98 | train_acc_list = [] 99 | test_acc_list = [] 100 | 101 | # 超参数 102 | iters_num = 10000 103 | train_size = x_train.shape[0] 104 | batch_size = 100 105 | learning_rate = 0.1 106 | iter_per_epoch = max(train_size / batch_size, 1) 107 | 108 | network = SimpleTwoLayerNet(input_size=784, hidden_size=50, output_size=10) 109 | for i in range(iters_num): 110 | # 获取mini-batch 111 | batch_mask = np.random.choice(train_size, batch_size) 112 | x_batch = x_train[batch_mask] 113 | t_batch = t_train[batch_mask] 114 | # 计算梯度 115 | # grad = network.numerical_gradient(x_batch, t_batch) 116 | grad = network.gradient(x_batch, t_batch) # 高速版!(反向传播) 117 | # 更新参数 118 | for key in ('W1', 'b1', 'W2', 'b2'): 119 | network.params[key] -= learning_rate * grad[key] 120 | # 记录学习过程 121 | loss = network.loss(x_batch, t_batch) 122 | train_loss_list.append(loss) 123 | # 计算每个epoch的识别精度 124 | if i % iter_per_epoch == 0: 125 | train_acc = network.accuracy(x_train, t_train) 126 | test_acc = network.accuracy(x_test, t_test) 127 | train_acc_list.append(train_acc) 128 | test_acc_list.append(test_acc) 129 | print(f"train acc, test acc | {train_acc}, {test_acc}") 130 | 131 | 132 | if __name__ == '__main__': 133 | # train() 134 | test() 135 | -------------------------------------------------------------------------------- /examples/04-introduce_layer.py: -------------------------------------------------------------------------------- 1 | class MulLayer: 2 | """简单的乘法层,支持float的前向和反向""" 3 | 4 | def __init__(self): 5 | self.x = None 6 | self.y = None 7 | 8 | def forward(self, x: float, y: float) -> float: 9 | self.x = x 10 | self.y = y 11 | out = x * y 12 | 
return out 13 | 14 | def backward(self, dout: float) -> float: 15 | dx = dout * self.y 16 | dy = dout * self.x 17 | return dx, dy 18 | 19 | 20 | class AddLayer: 21 | """简单的加法层,支持float的前向和反向""" 22 | 23 | def forward(self, x: float, y: float) -> float: 24 | return x + y 25 | 26 | def backward(self, dout: float) -> float: 27 | # z = x + y 28 | dx = dout * 1 29 | dy = dout * 1 30 | return dx, dy 31 | 32 | 33 | if __name__ == '__main__': 34 | 35 | apple = 100 36 | apple_num = 2 37 | orange = 150 38 | orange_num = 3 39 | tax = 1.1 40 | # layer 41 | mul_apple_layer = MulLayer() 42 | mul_orange_layer = MulLayer() 43 | add_apple_orange_layer = AddLayer() 44 | mul_tax_layer = MulLayer() 45 | # forward 46 | apple_price = mul_apple_layer.forward(apple, apple_num) # (1) 47 | orange_price = mul_orange_layer.forward(orange, orange_num) # (2) 48 | all_price = add_apple_orange_layer.forward(apple_price, 49 | orange_price) # (3) 50 | price = mul_tax_layer.forward(all_price, tax) # (4) 51 | # backward 52 | dprice = 1 53 | dall_price, dtax = mul_tax_layer.backward(dprice) # (4) 54 | dapple_price, dorange_price = add_apple_orange_layer.backward( 55 | dall_price) # (3) 56 | dorange, dorange_num = mul_orange_layer.backward(dorange_price) # (2) 57 | dapple, dapple_num = mul_apple_layer.backward(dapple_price) # (1) 58 | print(price) # 715 59 | print(dapple_num, dapple, dorange, dorange_num, 60 | dtax) # 110 2.2 3.3 165 650 61 | -------------------------------------------------------------------------------- /examples/05-simple_network_with_layer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tinynn.gradient import numerical_gradient 3 | from tinynn.mnist import load_mnist 4 | from collections import OrderedDict 5 | 6 | from tinynn.layers import Affine, Layer, Relu, SoftmaxWithLoss 7 | 8 | 9 | class TwoLayerNet: 10 | 11 | def __init__(self, 12 | input_size: int, 13 | hidden_size: int, 14 | output_size: int, 15 | weight_init_std: float = 0.01) -> None: 16 | # 初始化权重 17 | self.params = {} 18 | self.params['W1'] = weight_init_std * np.random.randn( 19 | input_size, hidden_size) 20 | self.params['b1'] = np.zeros(hidden_size) 21 | self.params['W2'] = weight_init_std * np.random.randn( 22 | hidden_size, output_size) 23 | self.params['b2'] = np.zeros(output_size) 24 | # 生成层 25 | self.layers: OrderedDict[str, Layer] = OrderedDict() 26 | self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1']) 27 | self.layers['Relu1'] = Relu() 28 | self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2']) 29 | self.lastLayer = SoftmaxWithLoss() 30 | 31 | def predict(self, x: np.ndarray) -> np.ndarray: 32 | for layer in self.layers.values(): 33 | x = layer.forward(x) 34 | return x 35 | 36 | def loss(self, x: np.ndarray, t: np.ndarray) -> np.ndarray: 37 | y = self.predict(x) 38 | return self.lastLayer.forward(y, t) 39 | 40 | def accuracy(self, x: np.ndarray, t: np.ndarray) -> float: 41 | y = self.predict(x) 42 | y = np.argmax(y, axis=1) 43 | if (t.ndim != 1): 44 | t = np.argmax(t, axis=1) 45 | accuracy = np.sum(y == t) / float(x.shape[0]) 46 | return accuracy 47 | 48 | def numerical_gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 49 | loss_W = lambda _: self.loss(x, t) 50 | grads = {} 51 | grads['W1'] = numerical_gradient(loss_W, self.params['W1']) 52 | grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 53 | grads['W2'] = numerical_gradient(loss_W, self.params['W2']) 54 | grads['b2'] = numerical_gradient(loss_W, self.params['b2']) 55 
| return grads 56 | 57 | def gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 58 | # forward 59 | self.loss(x, t) 60 | # backward 61 | dout = self.lastLayer.backward(1) 62 | layers = list(self.layers.values()) # 将字典里保存的所有层化为列表 63 | layers.reverse() # 反序以反向传播 64 | for layer in layers: 65 | dout = layer.backward(dout) 66 | grads = {} 67 | grads['W1'] = self.layers['Affine1'].dW 68 | grads['b1'] = self.layers['Affine1'].db 69 | grads['W2'] = self.layers['Affine2'].dW 70 | grads['b2'] = self.layers['Affine2'].db 71 | return grads 72 | 73 | 74 | def test() -> None: 75 | # 检测误差反向传播法与数值微分是否相同。如果差距较大,则反向传播法存在问题 76 | # 读入数据 77 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, 78 | one_hot_label=True) 79 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 80 | x_batch = x_train[:3] 81 | t_batch = t_train[:3] 82 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 83 | grad_backprop = network.gradient(x_batch, t_batch) 84 | # 求各个权重的绝对误差的平均值 85 | for key in grad_numerical.keys(): 86 | diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key])) 87 | print(f"{key}: {diff}") 88 | 89 | 90 | def train() -> None: 91 | # 读入数据 92 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, 93 | one_hot_label=True) 94 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 95 | iters_num = 10000 96 | train_size = x_train.shape[0] #60000 97 | batch_size = 100 98 | learning_rate = 0.1 99 | train_loss_list = [] 100 | train_acc_list = [] 101 | test_acc_list = [] 102 | iter_per_epoch = max(train_size / batch_size, 1) #每一个epoch输出信息 103 | for i in range(iters_num): 104 | batch_mask = np.random.choice(train_size, batch_size) 105 | x_batch = x_train[batch_mask] 106 | t_batch = t_train[batch_mask] 107 | # 通过误差反向传播法求梯度 108 | grad = network.gradient(x_batch, t_batch) 109 | # 更新 110 | for key in ('W1', 'b1', 'W2', 'b2'): 111 | network.params[key] -= learning_rate * grad[key] 112 | loss = network.loss(x_batch, t_batch) 113 | train_loss_list.append(loss) 114 | if i % iter_per_epoch == 0: 115 | train_acc = network.accuracy(x_train, t_train) 116 | test_acc = network.accuracy(x_test, t_test) 117 | train_acc_list.append(train_acc) 118 | test_acc_list.append(test_acc) 119 | print(f"train acc, test acc | {train_acc}, {test_acc}") 120 | 121 | 122 | if __name__ == '__main__': 123 | train() 124 | # test() -------------------------------------------------------------------------------- /examples/06-introduce_optimizer.py: -------------------------------------------------------------------------------- 1 | """ 比较四种优化器基于一个特定函数的表现 """ 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from collections import OrderedDict 5 | 6 | from tinynn.optimizer import SGD, AdaGrad, Adam, Momentum, Optimizer 7 | 8 | 9 | def f(x: float, y: float) -> float: 10 | return x**2 / 20.0 + y**2 11 | 12 | 13 | def df(x: float, y: float) -> float: 14 | return x / 10.0, 2.0 * y 15 | 16 | 17 | if __name__ == '__main__': 18 | 19 | init_pos = (-7.0, 2.0) 20 | params = {} 21 | params['x'], params['y'] = init_pos[0], init_pos[1] 22 | grads = {} 23 | grads['x'], grads['y'] = 0, 0 24 | 25 | optimizers: OrderedDict[str, Optimizer] = OrderedDict() 26 | optimizers["SGD"] = SGD(lr=0.95) 27 | optimizers["Momentum"] = Momentum(lr=0.1) 28 | optimizers["AdaGrad"] = AdaGrad(lr=1.5) 29 | optimizers["Adam"] = Adam(lr=0.3) 30 | 31 | idx = 1 32 | 33 | for key in optimizers: 34 | optimizer = optimizers[key] 35 | x_history = [] 36 | y_history = [] 37 | params['x'], params['y'] = 
init_pos[0], init_pos[1] 38 | 39 | for i in range(30): 40 | x_history.append(params['x']) 41 | y_history.append(params['y']) 42 | 43 | grads['x'], grads['y'] = df(params['x'], params['y']) 44 | optimizer.update(params, grads) 45 | 46 | x = np.arange(-10, 10, 0.01) 47 | y = np.arange(-5, 5, 0.01) 48 | 49 | X, Y = np.meshgrid(x, y) 50 | Z = f(X, Y) 51 | 52 | # for simple contour line 53 | mask = Z > 7 54 | Z[mask] = 0 55 | 56 | # plot 57 | plt.subplot(2, 2, idx) 58 | idx += 1 59 | plt.plot(x_history, y_history, 'o-', color="red") 60 | plt.contour(X, Y, Z) 61 | plt.ylim(-10, 10) 62 | plt.xlim(-10, 10) 63 | plt.plot(0, 0, '+') 64 | plt.title(key) 65 | plt.xlabel("x") 66 | plt.ylabel("y") 67 | 68 | plt.show() -------------------------------------------------------------------------------- /examples/07-introduce_optimizer_2.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from tinynn.optimizer import SGD, AdaGrad, Adam, Momentum, Optimizer, RMSprop 3 | from tinynn.mnist import load_mnist 4 | from tinynn.util import smooth_curve 5 | from tinynn.full_connect_network import MultiLayerNet 6 | import numpy as np 7 | 8 | if __name__ == '__main__': 9 | # 0:读入MNIST数据========== 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 11 | 12 | train_size = x_train.shape[0] 13 | batch_size = 128 14 | max_iterations = 201 15 | 16 | # 1:进行实验的设置========== 17 | optimizers: dict[str, Optimizer] = {} #用字典存放优化器 18 | optimizers['SGD'] = SGD() 19 | optimizers['Momentum'] = Momentum() 20 | optimizers['AdaGrad'] = AdaGrad() 21 | optimizers['Adam'] = Adam() 22 | optimizers['RMSprop'] = RMSprop() 23 | 24 | networks: dict[str, MultiLayerNet] = {} 25 | train_loss = {} 26 | for key in optimizers.keys(): 27 | # 每个优化器建立一个神经网络 28 | networks[key] = MultiLayerNet(input_size=784, 29 | hidden_size_list=[100, 100, 100, 100], 30 | output_size=10) 31 | train_loss[key] = [] 32 | 33 | # 2:开始训练========== 34 | for i in range(max_iterations): 35 | batch_mask = np.random.choice(train_size, batch_size) 36 | x_batch = x_train[batch_mask] 37 | t_batch = t_train[batch_mask] 38 | 39 | for key in optimizers.keys(): 40 | grads = networks[key].gradient(x_batch, t_batch) 41 | optimizers[key].update(networks[key].params, grads) 42 | 43 | loss = networks[key].loss(x_batch, t_batch) 44 | train_loss[key].append(loss) 45 | 46 | if i % 100 == 0: 47 | print(f"=========== iteration: {i} ===========") 48 | for key in optimizers.keys(): 49 | loss = networks[key].loss(x_batch, t_batch) 50 | print(f"{key}: {loss}") 51 | 52 | # 3.绘制图形========== 53 | markers = { 54 | "SGD": "o", 55 | "Momentum": "x", 56 | "AdaGrad": "s", 57 | "Adam": "D", 58 | "RMSprop": "o" 59 | } 60 | x = np.arange(max_iterations) 61 | for key in optimizers.keys(): 62 | # 注:soomth_curve用于使损失函数图形变得圆滑 63 | # marker是标记,100轮标记一次 64 | # o是圆形,x是叉号,s是正方向,d是菱形 65 | plt.plot( 66 | x, 67 | smooth_curve(train_loss[key]), 68 | marker=markers[key], 69 | markevery=100, 70 | label=key) 71 | plt.xlabel("iterations") 72 | plt.ylabel("loss") 73 | plt.ylim(0, 1) #设定y轴为0~1 74 | plt.legend() #给图加上图例 75 | plt.show() 76 | -------------------------------------------------------------------------------- /examples/08-introduce_weight_decay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tinynn.mnist import load_mnist 4 | from tinynn.full_connect_network import MultiLayerNet 5 | from tinynn.optimizer import SGD 6 | 7 | if 
__name__ == '__main__': 8 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 9 | 10 | # 为了再现过拟合,减少学习数据 11 | x_train = x_train[:300] 12 | t_train = t_train[:300] 13 | 14 | # weight decay(权值衰减)的设定 ======================= 15 | # weight_decay_lambda = 0 # 不使用权值衰减的情况 16 | weight_decay_lambda = 0.1 17 | # ==================================================== 18 | 19 | network = MultiLayerNet( 20 | input_size=784, 21 | hidden_size_list=[100, 100, 100, 100, 100, 100], 22 | output_size=10, 23 | weight_decay_lambda=weight_decay_lambda) 24 | optimizer = SGD(lr=0.01) 25 | 26 | max_epochs = 201 27 | train_size = x_train.shape[0] 28 | batch_size = 100 29 | 30 | train_loss_list = [] 31 | train_acc_list = [] 32 | test_acc_list = [] 33 | 34 | iter_per_epoch = max(train_size / batch_size, 1) 35 | epoch_cnt = 0 36 | 37 | for i in range(max_epochs): 38 | batch_mask = np.random.choice(train_size, batch_size) 39 | x_batch = x_train[batch_mask] 40 | t_batch = t_train[batch_mask] 41 | 42 | grads = network.gradient(x_batch, t_batch) 43 | optimizer.update(network.params, grads) 44 | 45 | if i % iter_per_epoch == 0: 46 | train_acc = network.accuracy(x_train, t_train) 47 | test_acc = network.accuracy(x_test, t_test) 48 | train_acc_list.append(train_acc) 49 | test_acc_list.append(test_acc) 50 | 51 | print(f"epoch: {epoch_cnt}, train acc: {train_acc}, " 52 | f"test acc: {test_acc}") 53 | 54 | epoch_cnt += 1 55 | if epoch_cnt >= max_epochs: 56 | break 57 | 58 | # 3.绘制图形========== 59 | markers = {'train': 'o', 'test': 's'} 60 | x = np.arange(max_epochs) 61 | plt.plot(x, train_acc_list, marker='o', label='train', markevery=10) 62 | plt.plot(x, test_acc_list, marker='s', label='test', markevery=10) 63 | plt.xlabel("epochs") 64 | plt.ylabel("accuracy") 65 | plt.ylim(0, 1.0) 66 | plt.legend(loc='lower right') 67 | plt.show() -------------------------------------------------------------------------------- /examples/09-introduce_batch_normalization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tinynn.full_connect_network import MultiLayerNet 4 | from tinynn.mnist import load_mnist 5 | from tinynn.optimizer import SGD 6 | 7 | 8 | def train(weight_init_std: float) -> tuple[list, list]: 9 | bn_network = MultiLayerNet( 10 | input_size=784, 11 | hidden_size_list=[100, 100, 100, 100, 100], 12 | output_size=10, 13 | weight_init_std=weight_init_std, 14 | # use_batch_normalization=False 15 | use_batch_normalization=True) 16 | network = MultiLayerNet(input_size=784, 17 | hidden_size_list=[100, 100, 100, 100, 100], 18 | output_size=10, 19 | weight_init_std=weight_init_std) 20 | optimizer = SGD(lr=learning_rate) 21 | 22 | train_acc_list = [] 23 | bn_train_acc_list = [] 24 | 25 | iter_per_epoch = max(train_size / batch_size, 1) 26 | epoch_cnt = 0 27 | 28 | for i in range(1000000000): 29 | batch_mask = np.random.choice(train_size, batch_size) 30 | x_batch = x_train[batch_mask] 31 | t_batch = t_train[batch_mask] 32 | 33 | for _network in (bn_network, network): 34 | grads = _network.gradient(x_batch, t_batch) 35 | optimizer.update(_network.params, grads) 36 | 37 | if i % iter_per_epoch == 0: 38 | train_acc = network.accuracy(x_train, t_train) 39 | bn_train_acc = bn_network.accuracy(x_train, t_train) 40 | train_acc_list.append(train_acc) 41 | bn_train_acc_list.append(bn_train_acc) 42 | 43 | print(f"epoch: {epoch_cnt} | {train_acc} - {bn_train_acc}") 44 | 45 | epoch_cnt += 1 46 | if epoch_cnt >= max_epochs: 47 | break 48 | 
49 | return train_acc_list, bn_train_acc_list 50 | 51 | 52 | if __name__ == '__main__': 53 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 54 | 55 | # 减少学习数据 56 | x_train = x_train[:1000] 57 | t_train = t_train[:1000] 58 | 59 | max_epochs = 10 60 | train_size = x_train.shape[0] 61 | batch_size = 100 62 | learning_rate = 0.01 63 | 64 | # 3.绘制图形========== 65 | weight_scale_list = np.logspace(0, -4, 66 | num=16) #创建基数为10的,16个数据的从1到10的-4次方的的等比数列 67 | x = np.arange(max_epochs) 68 | 69 | for i, w in enumerate(weight_scale_list): 70 | print(f"============== {i + 1} / 16 ==============") 71 | train_acc_list, bn_train_acc_list = train(w) 72 | 73 | plt.subplot(4, 4, i + 1) 74 | plt.title(f"W: {w}") 75 | if i == 15: 76 | plt.plot(x, 77 | bn_train_acc_list, 78 | label='Batch Normalization', 79 | markevery=2) 80 | plt.plot(x, 81 | train_acc_list, 82 | linestyle="--", 83 | label='Normal(without BatchNorm)', 84 | markevery=2) 85 | else: 86 | plt.plot(x, bn_train_acc_list, markevery=2) 87 | plt.plot(x, train_acc_list, linestyle="--", markevery=2) 88 | 89 | plt.ylim(0, 1.0) 90 | if i % 4: 91 | #最终会生成4排16个图形,这样做每四个只会生成一个y轴 92 | plt.yticks([]) 93 | else: 94 | plt.ylabel("accuracy") 95 | if i < 12: 96 | #前12个图形不生成x轴 97 | plt.xticks([]) 98 | else: 99 | plt.xlabel("epochs") 100 | plt.legend(loc='lower right') 101 | 102 | plt.show() -------------------------------------------------------------------------------- /examples/10-introduce_dropout.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tinynn.full_connect_network import MultiLayerNet 4 | from tinynn.mnist import load_mnist 5 | from tinynn.trainer import Trainer 6 | 7 | if __name__ == '__main__': 8 | 9 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 10 | 11 | # 为了再现过拟合,减少学习数据 12 | x_train = x_train[:300] 13 | t_train = t_train[:300] 14 | 15 | # 设定是否使用Dropuout,以及比例 ======================== 16 | use_dropout = True # 不使用Dropout的情况下为False 17 | dropout_ratio = 0.2 18 | # ==================================================== 19 | 20 | network = MultiLayerNet(input_size=784, 21 | hidden_size_list=[100, 100, 100, 100, 100, 100], 22 | output_size=10, 23 | use_dropout=use_dropout, 24 | dropout_ration=dropout_ratio) 25 | # 封装好的训练者类,可传递网络和数据集进去,直接进行训练。 26 | trainer = Trainer(network, 27 | x_train, 28 | t_train, 29 | x_test, 30 | t_test, 31 | epochs=301, 32 | mini_batch_size=100, 33 | optimizer='sgd', 34 | optimizer_param={'lr': 0.01}, 35 | verbose=True) 36 | trainer.train() 37 | 38 | # 绘制图形========== 39 | markers = {'train': 'o', 'test': 's'} 40 | x = np.arange(len(trainer.train_acc_list)) 41 | plt.plot(x, trainer.train_acc_list, marker='o', label='train', markevery=10) 42 | plt.plot(x, trainer.test_acc_list, marker='s', label='test', markevery=10) 43 | plt.xlabel("epochs") 44 | plt.ylabel("accuracy") 45 | plt.ylim(0, 1.0) 46 | plt.legend(loc='lower right') 47 | plt.show() -------------------------------------------------------------------------------- /examples/11-hyperparam_search.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tinynn.mnist import load_mnist 4 | from tinynn.full_connect_network import MultiLayerNet 5 | from tinynn.util import shuffle_dataset 6 | from tinynn.trainer import Trainer 7 | 8 | 9 | def train(lr: float, 10 | weight_decay: float, 11 | epochs: int = 50) -> tuple[list, list]: 12 | network = 
MultiLayerNet(input_size=784, 13 | hidden_size_list=[100, 100, 100, 100, 100, 100], 14 | output_size=10, 15 | weight_decay_lambda=weight_decay) 16 | # optimizer_param指选择对应优化器时应传进去的参数 17 | trainer = Trainer(network, 18 | x_train, 19 | t_train, 20 | x_val, 21 | t_val, 22 | epochs=epochs, 23 | mini_batch_size=100, 24 | optimizer='sgd', 25 | optimizer_param={'lr': lr}, 26 | verbose=False) 27 | trainer.train() 28 | return trainer.test_acc_list, trainer.train_acc_list 29 | 30 | 31 | if __name__ == '__main__': 32 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 33 | 34 | # 为了实现高速化,减少训练数据 35 | x_train = x_train[:1000] 36 | t_train = t_train[:1000] 37 | 38 | # 分割验证数据 39 | validation_rate = 0.20 40 | #注意要加int,否则默认为float是无法作为index的 41 | validation_num = int(x_train.shape[0] * validation_rate) 42 | x_train, t_train = shuffle_dataset(x_train, t_train) #打乱数据集 43 | 44 | x_val = x_train[:validation_num] 45 | t_val = t_train[:validation_num] 46 | x_train = x_train[validation_num:] 47 | t_train = t_train[validation_num:] 48 | 49 | # ============超参数的随机搜索================= 50 | # 通过设定一个搜索范围,逐个实验,找到最优参数 51 | optimization_trial = 100 52 | results_val = {} 53 | results_train = {} 54 | for _ in range(optimization_trial): 55 | # 指定搜索的超参数的范围=============== 56 | # 我们这里搜索权重衰减值以及学习率 57 | weight_decay = 10**np.random.uniform(-8, -4) #10的负8次方到10的负4次方 58 | lr = 10**np.random.uniform(-6, -2) 59 | # ================================================ 60 | 61 | val_acc_list, train_acc_list = train(lr, weight_decay) 62 | print(f"val acc: {val_acc_list[-1]} | lr: {lr}" 63 | f", weight decay: {weight_decay}") 64 | #把随机出来的学习率和权重衰减(带值)一整个作为key 65 | key = f"lr: {lr}, weight decay: {weight_decay}" 66 | results_val[key] = val_acc_list 67 | results_train[key] = train_acc_list 68 | 69 | # 绘制图形======================================================== 70 | print("=========== Hyper-Parameter Optimization Result ===========") 71 | graph_draw_num = 20 72 | col_num = 5 73 | # np.ceil向上取整,如-0.2取整为0.0 74 | row_num = int(np.ceil(graph_draw_num / col_num)) 75 | i = 0 76 | # key=lambda x:x[1][-1]即对results_val中的第二维数据(即values),中的最后一个值 77 | # reverse=true表示由高到低进行排序 78 | for key, val_acc_list in sorted(results_val.items(), 79 | key=lambda x: x[1][-1], 80 | reverse=True): 81 | print(f"Best-{i+1} (val acc: {val_acc_list[-1]}) | {key}") 82 | 83 | plt.subplot(row_num, col_num, i + 1) 84 | plt.title(f"Best-{i+1}") 85 | plt.ylim(0.0, 1.0) 86 | #注意:i%5,i=1结果是1,i=0结果是0 87 | if i % 5: 88 | plt.yticks([]) #每五幅图创建一个y轴(当i%5结果为0是不删除y轴) 89 | plt.xticks([]) #(删除所有x轴) 90 | x = np.arange(len(val_acc_list)) 91 | plt.plot(x, val_acc_list) #图中实线表示验证集精度 92 | plt.plot(x, results_train[key], "--") #图中虚线表示训练集精度 93 | i += 1 94 | 95 | if i >= graph_draw_num: 96 | break 97 | 98 | #最后筛选出能正常进行学习的超参数,再进一步筛选 99 | plt.show() 100 | -------------------------------------------------------------------------------- /examples/12-CNN_and_digits_recognition.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from tinynn.conv_network import SimpleConvNet 4 | from tinynn.mnist import load_mnist 5 | from tinynn.trainer import Trainer 6 | 7 | if __name__ == '__main__': 8 | # 读入数据 9 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 10 | 11 | # 处理花费时间较长的情况下减少数据 12 | x_train, t_train = x_train[:5000], t_train[:5000] 13 | x_test, t_test = x_test[:1000], t_test[:1000] 14 | 15 | max_epochs = 20 16 | 17 | network = SimpleConvNet(input_dim=(1, 28, 28), 18 | conv_param={ 19 | 
'filter_num': 30, 20 | 'filter_size': 5, 21 | 'pad': 0, 22 | 'stride': 1 23 | }, 24 | hidden_size=100, 25 | output_size=10, 26 | weight_init_std=0.01) 27 | 28 | trainer = Trainer(network, 29 | x_train, 30 | t_train, 31 | x_test, 32 | t_test, 33 | epochs=max_epochs, 34 | mini_batch_size=100, 35 | optimizer='Adam', 36 | optimizer_param={'lr': 0.001}, 37 | evaluate_sample_num_per_epoch=1000) 38 | trainer.train() 39 | 40 | # 保存参数 41 | # network.save_params("params.pkl") 42 | # print("Saved Network Parameters!") 43 | 44 | # 绘制图形 45 | markers = {'train': 'o', 'test': 's'} 46 | x = np.arange(max_epochs) 47 | plt.plot(x, trainer.train_acc_list, marker='o', label='train', markevery=2) 48 | plt.plot(x, trainer.test_acc_list, marker='s', label='test', markevery=2) 49 | plt.xlabel("epochs") 50 | plt.ylabel("accuracy") 51 | plt.ylim(0, 1.0) 52 | plt.legend(loc='lower right') 53 | plt.show() 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | 5 | __version__ = os.getenv('tinynn', '0.1.0') 6 | 7 | 8 | def setup_tinynn(): 9 | requires = [ 10 | 'numpy' 11 | ] 12 | setup( 13 | name='tinynn', 14 | version=__version__, 15 | description='my deep learning study', 16 | python_requires='>=3.8', 17 | install_requires=requires, 18 | packages=find_packages( 19 | include=['tinynn', 'tinynn.*']), 20 | ) 21 | 22 | 23 | if __name__ == '__main__': 24 | setup_tinynn() 25 | -------------------------------------------------------------------------------- /tinynn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yewentao256/TinyNN/32d089aeed6d8034e98dbe6806a076cfacaf0c71/tinynn/__init__.py -------------------------------------------------------------------------------- /tinynn/conv_network.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | from collections import OrderedDict 4 | from tinynn.layers import Affine, Convolution, Layer, Pooling, Relu, SoftmaxWithLoss 5 | 6 | 7 | class SimpleConvNet: 8 | """简单的ConvNet 9 | 10 | conv - relu - pool - affine - relu - affine - softmax 11 | """ 12 | 13 | def __init__(self, 14 | input_dim: tuple = (1, 28, 28), 15 | conv_param: dict = { 16 | 'filter_num': 30, 17 | 'filter_size': 5, 18 | 'pad': 0, 19 | 'stride': 1 20 | }, 21 | hidden_size: int = 100, 22 | output_size: int = 10, 23 | weight_init_std: float = 0.01) -> None: 24 | """ 25 | Args: 26 | input_dim (tuple, optional): 输入维度. Defaults to (1, 28, 28). 27 | conv_param (_type_, optional): 卷积参数. Defaults to { 'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1 }. 28 | hidden_size (int, optional): 隐藏层维度. Defaults to 100. 29 | output_size (int, optional): 输出维度. Defaults to 10. 30 | weight_init_std (float, optional): 指定权重的标准差(e.g. 0.01). Defaults to 0.01. 
31 | """ 32 | # 读取参数字典中的参数 33 | filter_num = conv_param['filter_num'] 34 | filter_size = conv_param['filter_size'] 35 | filter_pad = conv_param['pad'] 36 | filter_stride = conv_param['stride'] 37 | input_size = input_dim[1] 38 | conv_output_size = (input_size - filter_size + 39 | 2 * filter_pad) / filter_stride + 1 40 | pool_output_size = int(filter_num * (conv_output_size / 2) * 41 | (conv_output_size / 2)) 42 | 43 | # 初始化权重 44 | self.params = {} 45 | self.params['W1'] = weight_init_std * \ 46 | np.random.randn(filter_num, input_dim[0], filter_size, filter_size) 47 | self.params['b1'] = np.zeros(filter_num) 48 | self.params['W2'] = weight_init_std * \ 49 | np.random.randn(pool_output_size, hidden_size) 50 | self.params['b2'] = np.zeros(hidden_size) 51 | self.params['W3'] = weight_init_std * \ 52 | np.random.randn(hidden_size, output_size) 53 | self.params['b3'] = np.zeros(output_size) 54 | 55 | # 在有序字典中按顺序添加层 56 | self.layers: OrderedDict[str, Layer] = OrderedDict() 57 | self.layers['Conv1'] = Convolution(self.params['W1'], 58 | self.params['b1'], 59 | conv_param['stride'], 60 | conv_param['pad']) 61 | self.layers['Relu1'] = Relu() 62 | self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2) 63 | self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2']) 64 | self.layers['Relu2'] = Relu() 65 | self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3']) 66 | 67 | self.last_layer = SoftmaxWithLoss() 68 | 69 | # 预测 70 | def predict(self, x: np.ndarray) -> np.ndarray: 71 | # 每一层都前向传导,返回最后的结果 72 | for layer in self.layers.values(): 73 | x = layer.forward(x) 74 | return x 75 | 76 | def loss(self, x: np.ndarray, t: np.ndarray) -> np.ndarray: 77 | """求损失函数 78 | 参数x是输入数据、t是标签 79 | """ 80 | y = self.predict(x) 81 | return self.last_layer.forward(y, t) 82 | 83 | def accuracy(self, 84 | x: np.ndarray, 85 | t: np.ndarray, 86 | batch_size: int = 100) -> float: 87 | if t.ndim != 1: 88 | t = np.argmax(t, axis=1) 89 | 90 | acc = 0.0 91 | 92 | for i in range(int(x.shape[0] / batch_size)): 93 | tx = x[i * batch_size:(i + 1) * batch_size] 94 | tt = t[i * batch_size:(i + 1) * batch_size] 95 | y = self.predict(tx) 96 | y = np.argmax(y, axis=1) 97 | acc += np.sum(y == tt) 98 | 99 | return acc / x.shape[0] 100 | 101 | def gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 102 | """ 求梯度——反向传播 """ 103 | # forward 104 | self.loss(x, t) 105 | 106 | # backward 107 | dout = self.last_layer.backward(1) 108 | 109 | layers = list(self.layers.values()) 110 | layers.reverse() 111 | for layer in layers: 112 | dout = layer.backward(dout) 113 | 114 | # 设定 115 | grads = {} 116 | grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers[ 117 | 'Conv1'].db 118 | grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers[ 119 | 'Affine1'].db 120 | grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers[ 121 | 'Affine2'].db 122 | 123 | return grads 124 | 125 | def save_params(self, file_path="params.pkl"): 126 | params = {} 127 | for key, val in self.params.items(): 128 | params[key] = val 129 | with open(file_path, 'wb') as f: 130 | pickle.dump(params, f) 131 | 132 | def load_params(self, file_path="params.pkl"): 133 | with open(file_path, 'rb') as f: 134 | params = pickle.load(f) 135 | for key, val in params.items(): 136 | self.params[key] = val 137 | 138 | for i, key in enumerate(['Conv1', 'Affine1', 'Affine2']): 139 | self.layers[key].W = self.params['W' + str(i + 1)] 140 | self.layers[key].b = self.params['b' + str(i + 1)] 141 | 
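# Illustrative smoke test (a sketch, not used by the library itself): build the
# network on a couple of random images and check that loss() and gradient() run
# end to end, returning one gradient entry per parameter.
if __name__ == '__main__':
    x = np.random.rand(2, 1, 28, 28)                       # two fake 1x28x28 "images"
    t = np.zeros((2, 10))
    t[np.arange(2), np.random.randint(0, 10, size=2)] = 1  # fake one-hot labels
    net = SimpleConvNet()
    print(net.loss(x, t))           # scalar cross-entropy loss
    grads = net.gradient(x, t)
    print(sorted(grads.keys()))     # ['W1', 'W2', 'W3', 'b1', 'b2', 'b3']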
-------------------------------------------------------------------------------- /tinynn/full_connect_network.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | import numpy as np 3 | from collections import OrderedDict 4 | from tinynn.gradient import numerical_gradient 5 | from tinynn.layers import Affine, BatchNormalization, Dropout, Relu, Sigmoid, SoftmaxWithLoss 6 | 7 | 8 | class MultiLayerNet: 9 | """全连接的多层神经网络 10 | 11 | 具有Weiht Decay、Dropout、Batch Normalization的功能 12 | """ 13 | 14 | def __init__(self, 15 | input_size: int, 16 | hidden_size_list: list, 17 | output_size: int, 18 | activation: str = 'relu', 19 | weight_init_std: Union[list, str] = 'relu', 20 | weight_decay_lambda: float = 0, 21 | use_dropout: bool = False, 22 | dropout_ration: int = 0.5, 23 | use_batch_normalization: bool = False) -> None: 24 | """ 25 | Args: 26 | input_size (int): 输入大小(MNIST的情况下为784) 27 | hidden_size_list (list): 隐藏层的神经元数量的列表 28 | (e.g. [100, 100, 100]) 29 | output_size (int): 输出大小(MNIST的情况下为10) 30 | activation (str, optional): 'relu' or 'sigmoid'. Defaults to 'relu'. 31 | weight_init_std (str, optional): 指定权重的标准差(e.g. 0.01) 32 | 指定'relu'或'he'的情况下设定“He的初始值” 33 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值”. 34 | Defaults to 'relu'. 35 | weight_decay_lambda (float, optional): Weight Decay(L2范数)的强度. 36 | Defaults to 0. 37 | use_dropout (bool, optional): 是否使用Dropout. Defaults to False. 38 | dropout_ration (int, optional): dropout比例. Defaults to 0.5. 39 | use_batch_normalization (bool, optional): 是否使用Batch Normalization. 40 | Defaults to False. 41 | """ 42 | self.input_size = input_size 43 | self.output_size = output_size 44 | self.hidden_size_list = hidden_size_list 45 | self.hidden_layer_num = len(hidden_size_list) 46 | self.use_dropout = use_dropout 47 | self.weight_decay_lambda = weight_decay_lambda 48 | self.use_batch_normalization = use_batch_normalization 49 | self.params = {} 50 | 51 | # 初始化权重 52 | self.__init_weight(weight_init_std) 53 | 54 | # 生成层 55 | activation_layer = {'sigmoid': Sigmoid, 'relu': Relu} 56 | self.layers = OrderedDict() 57 | for idx in range(1, self.hidden_layer_num + 1): 58 | self.layers['Affine' + str(idx)] = Affine( 59 | self.params['W' + str(idx)], self.params['b' + str(idx)]) 60 | if self.use_batch_normalization: 61 | self.params['gamma' + str(idx)] = np.ones( 62 | hidden_size_list[idx - 1]) 63 | self.params['beta' + str(idx)] = np.zeros( 64 | hidden_size_list[idx - 1]) 65 | self.layers['BatchNorm' + str(idx)] = BatchNormalization( 66 | self.params['gamma' + str(idx)], 67 | self.params['beta' + str(idx)]) 68 | 69 | self.layers['Activation_function' + 70 | str(idx)] = activation_layer[activation]() 71 | 72 | if self.use_dropout: 73 | self.layers['Dropout' + str(idx)] = Dropout(dropout_ration) 74 | 75 | idx = self.hidden_layer_num + 1 76 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 77 | self.params['b' + str(idx)]) 78 | 79 | self.last_layer = SoftmaxWithLoss() 80 | 81 | def __init_weight(self, weight_init_std: Union[str, float]) -> None: 82 | """设定权重的初始值 83 | 84 | Args: 85 | weight_init_std (Union[str, float]): 指定权重的标准差(e.g. 
0.01) 86 | 指定'relu'或'he'的情况下设定“He的初始值” 87 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 88 | """ 89 | 90 | all_size_list = [self.input_size 91 | ] + self.hidden_size_list + [self.output_size] 92 | for idx in range(1, len(all_size_list)): 93 | scale = weight_init_std 94 | if str(weight_init_std).lower() in ('relu', 'he'): 95 | scale = np.sqrt(2.0 / 96 | all_size_list[idx - 1]) # 使用ReLU的情况下推荐的初始值 97 | elif str(weight_init_std).lower() in ('sigmoid', 'xavier'): 98 | scale = np.sqrt(1.0 / 99 | all_size_list[idx - 1]) # 使用sigmoid的情况下推荐的初始值 100 | self.params['W' + str(idx)] = scale * np.random.randn( 101 | all_size_list[idx - 1], all_size_list[idx]) 102 | self.params['b' + str(idx)] = np.zeros(all_size_list[idx]) 103 | 104 | def predict(self, x: np.ndarray, train_flg: bool = False) -> np.ndarray: 105 | for key, layer in self.layers.items(): 106 | if "Dropout" in key or "BatchNorm" in key: 107 | x = layer.forward(x, train_flg) 108 | else: 109 | x = layer.forward(x) 110 | 111 | return x 112 | 113 | def loss(self, 114 | x: np.ndarray, 115 | t: np.ndarray, 116 | train_flg: bool = False) -> np.ndarray: 117 | """求损失函数 118 | 参数x是输入数据,t是教师标签 119 | """ 120 | y = self.predict(x, train_flg) 121 | 122 | weight_decay = 0 123 | for idx in range(1, self.hidden_layer_num + 2): 124 | W = self.params['W' + str(idx)] 125 | weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2) 126 | 127 | return self.last_layer.forward(y, t) + weight_decay 128 | 129 | def accuracy(self, x: np.ndarray, t: np.ndarray) -> float: 130 | y = self.predict(x, train_flg=False) 131 | y = np.argmax(y, axis=1) 132 | if t.ndim != 1: 133 | t = np.argmax(t, axis=1) 134 | 135 | return np.sum(y == t) / float(x.shape[0]) 136 | 137 | def numerical_gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 138 | """ 求梯度(数值微分) """ 139 | loss_W = lambda W: self.loss(x, t, train_flg=True) 140 | 141 | grads = {} 142 | for idx in range(1, self.hidden_layer_num + 2): 143 | grads['W' + str(idx)] = numerical_gradient( 144 | loss_W, self.params['W' + str(idx)]) 145 | grads['b' + str(idx)] = numerical_gradient( 146 | loss_W, self.params['b' + str(idx)]) 147 | 148 | if self.use_batch_normalization and idx != self.hidden_layer_num + 1: 149 | grads['gamma' + str(idx)] = numerical_gradient( 150 | loss_W, self.params['gamma' + str(idx)]) 151 | grads['beta' + str(idx)] = numerical_gradient( 152 | loss_W, self.params['beta' + str(idx)]) 153 | 154 | return grads 155 | 156 | def gradient(self, x: np.ndarray, t: np.ndarray) -> dict: 157 | # forward 158 | self.loss(x, t, train_flg=True) 159 | 160 | # backward 161 | dout = self.last_layer.backward(1) 162 | 163 | layers = list(self.layers.values()) 164 | layers.reverse() 165 | for layer in layers: 166 | dout = layer.backward(dout) 167 | 168 | grads = {} 169 | for idx in range(1, self.hidden_layer_num + 2): 170 | grads['W' + str(idx)] = self.layers['Affine' + str( 171 | idx)].dW + self.weight_decay_lambda * self.params['W' + 172 | str(idx)] 173 | grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db 174 | 175 | if self.use_batch_normalization and idx != self.hidden_layer_num + 1: 176 | grads['gamma' + str(idx)] = self.layers['BatchNorm' + 177 | str(idx)].dgamma 178 | grads['beta' + str(idx)] = self.layers['BatchNorm' + 179 | str(idx)].dbeta 180 | 181 | return grads -------------------------------------------------------------------------------- /tinynn/functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def sigmoid(x: np.ndarray) -> float: 5 | 
return 1 / (1 + np.exp(-x)) 6 | 7 | 8 | def sigmoid_grad(x: np.ndarray) -> float: 9 | # 计算sigmoid的梯度 10 | return (1.0 - sigmoid(x)) * sigmoid(x) 11 | 12 | 13 | def softmax(x: np.ndarray) -> np.ndarray: 14 | if x.ndim == 2: 15 | x = x.T 16 | x = x - np.max(x, axis=0) 17 | y = np.exp(x) / np.sum(np.exp(x), axis=0) 18 | return y.T 19 | 20 | x = x - np.max(x) # 溢出对策 21 | return np.exp(x) / np.sum(np.exp(x)) 22 | 23 | 24 | def cross_entropy_error(y: np.ndarray, t: np.ndarray) -> float: 25 | """ 交叉熵误差 """ 26 | 27 | if y.ndim == 1: 28 | t = t.reshape(1, t.size) 29 | y = y.reshape(1, y.size) 30 | 31 | # 监督数据是one-hot-vector的情况下,转换为正确解标签的索引 32 | if t.size == y.size: 33 | t = t.argmax(axis=1) 34 | 35 | batch_size = y.shape[0] 36 | return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size 37 | -------------------------------------------------------------------------------- /tinynn/gradient.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def numerical_gradient(function: object, darray: np.ndarray) -> np.ndarray: 6 | """对某个矩阵使用某函数来求梯度 7 | 8 | Args: 9 | function (object): 求微分函数 10 | darray (np.ndarray): 需处理的矩阵 11 | 12 | Returns: 13 | np.ndarray: 梯度 14 | """ 15 | 16 | minor_value = 1e-4 # 0.0001 17 | grad = np.zeros_like(darray) 18 | 19 | # np.nditer用于迭代数组 20 | it = np.nditer(darray, flags=['multi_index'], op_flags=['readwrite']) 21 | while not it.finished: 22 | idx = it.multi_index 23 | tmp_val = darray[idx] 24 | darray[idx] = float(tmp_val) + minor_value 25 | fxh1 = function(darray) # f(x+h) 26 | 27 | darray[idx] = tmp_val - minor_value 28 | fxh2 = function(darray) # f(x-h) 29 | grad[idx] = (fxh1 - fxh2) / (2*minor_value) 30 | 31 | darray[idx] = tmp_val # 还原值 32 | it.iternext() 33 | 34 | return grad 35 | -------------------------------------------------------------------------------- /tinynn/layers.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import numpy as np 3 | from tinynn.functions import cross_entropy_error, sigmoid, softmax 4 | from tinynn.util import im2col, col2im 5 | 6 | 7 | class Layer(object): 8 | """ Layer基类 """ 9 | 10 | def forward(self, x: np.ndarray, *args: list[object]) -> np.ndarray: 11 | return self._forward(x, *args) if args else self._forward(x) 12 | 13 | def _forward(self, *_) -> np.ndarray: 14 | raise NotImplementedError 15 | 16 | def backward(self, dout: np.ndarray, *args: list[object]) -> np.ndarray: 17 | return self._backward(dout, *args) if args else self._backward(dout) 18 | 19 | def _backward(self, *_) -> np.ndarray: 20 | raise NotImplementedError 21 | 22 | def __repr__(self) -> str: 23 | return f"Layer: {self.__class__.__name__}" 24 | 25 | class Relu(Layer): 26 | 27 | def __init__(self) -> None: 28 | # mask是由True/False构成的NumPy数组, 29 | # 它会把正向传播时的输入x的元素中小于等于0的地方保存为True, 30 | # 其他地方(大于0的元素)保存为False。 31 | self.mask = None 32 | 33 | def _forward(self, x: np.ndarray) -> np.ndarray: 34 | self.mask = (x <= 0) 35 | out = x.copy() 36 | out[self.mask] = 0 37 | return out 38 | 39 | def _backward(self, dout: np.ndarray) -> np.ndarray: 40 | # RELU层,x>=0的导数不变,x<=0的导数为0 41 | dout[self.mask] = 0 42 | dx = dout 43 | return dx 44 | 45 | 46 | class Sigmoid(Layer): 47 | 48 | def __init__(self) -> None: 49 | self.out = None 50 | 51 | def _forward(self, x: np.ndarray) -> np.ndarray: 52 | self.out = sigmoid(x) 53 | return self.out 54 | 55 | def _backward(self, dout: np.ndarray) -> np.ndarray: 56 | # y = 1 / (1 + 
exp(-x)) 57 | # 画出反向传播图,得到dx = dout * y^2 * e^(-x),简化后得到下式 58 | return dout * (1.0 - self.out) * self.out 59 | 60 | 61 | class Affine(Layer): 62 | 63 | def __init__(self, W: np.ndarray, b: np.ndarray) -> None: 64 | self.W = W 65 | self.b = b 66 | 67 | self.x = None 68 | self.original_x_shape = None 69 | # 权重和偏置参数的导数 70 | self.dW = None 71 | self.db = None 72 | 73 | def _forward(self, x: np.ndarray) -> np.ndarray: 74 | # y = x * W + b 75 | self.original_x_shape = x.shape 76 | # 这样操作可以支持四维张量,统一转化为二维矩阵后再计算 77 | self.x = x.reshape(x.shape[0], -1) 78 | return np.dot(self.x, self.W) + self.b 79 | 80 | def _backward(self, dout: np.ndarray) -> np.ndarray: 81 | # dx = dout * W的转置(涉及矩阵求导) 82 | dx = np.dot(dout, self.W.T) 83 | # dW = x的转置 * dout(涉及矩阵求导) 84 | self.dW = np.dot(self.x.T, dout) 85 | # db = dout第零轴方向上的和(涉及矩阵求导) 86 | self.db = np.sum(dout, axis=0) 87 | 88 | return dx.reshape(*self.original_x_shape) # 还原形状 89 | 90 | 91 | class SoftmaxWithLoss(Layer): 92 | # Softmax + Cross Entropy Error 93 | 94 | def __init__(self) -> None: 95 | self.loss: np.ndarray = None 96 | self.y: np.ndarray = None # softmax的输出 97 | self.t: np.ndarray = None # 监督数据 98 | 99 | def _forward(self, x: np.ndarray, t: np.ndarray) -> np.ndarray: 100 | self.t = t 101 | self.y = softmax(x) 102 | self.loss = cross_entropy_error(self.y, self.t) 103 | return self.loss 104 | 105 | def _backward(self, dout=1) -> np.ndarray: 106 | 107 | batch_size = self.t.shape[0] 108 | 109 | # 对应标签t是one-hot矩阵的情况 110 | # 注:一定要除以batch_size,因为偏置有一个求和的过程,使用np.sum 111 | # 会将同一轴数据加和,如batch_size为2 112 | # 求得y-t = [[0.3,-0.8,0.5],[-0.7,0.2,0.5]] 113 | # y-t恰好是softmax with loss反向传播的结果,有兴趣的同学可以推导一下 114 | # sum后变为[-0.4,-0.6,1]显然不是单个数据求出的正常值 115 | # 除以batch_size才会得到单个数据的正常结果[-0.2,-0.3,0.5] 116 | if self.t.size == self.y.size: 117 | dx = (self.y - self.t) * dout 118 | dx = dx / batch_size 119 | 120 | else: 121 | dx = self.y.copy() * dout 122 | dx[np.arange(batch_size), self.t] -= 1 123 | dx = dx / batch_size 124 | return dx 125 | 126 | 127 | class Dropout(Layer): 128 | """ 129 | dropout层,可参考http://arxiv.org/abs/1207.0580 130 | """ 131 | 132 | def __init__(self, dropout_ratio: int = 0.5) -> None: 133 | self.dropout_ratio = dropout_ratio 134 | self.mask = None 135 | 136 | def _forward(self, x: np.ndarray, train_flg: bool = True) -> np.ndarray: 137 | # train_flg用于标记是测试还是训练阶段 138 | if train_flg: 139 | # np.random.rand可以返回一个或一组服从“0~1”均匀分布的随机样本值。 140 | self.mask = np.random.rand(*x.shape) > self.dropout_ratio 141 | return x * self.mask 142 | else: 143 | # 如果非train,最终应乘上训练时删除掉的比例(如删掉10%,则最后乘以0.9) 144 | # 以实现整体的“dropout” 145 | return x * (1.0 - self.dropout_ratio) 146 | 147 | def _backward(self, dout: np.ndarray) -> np.ndarray: 148 | # 反向传播时,遇到mask数组中false的(删除的),则返回0信号(或说不返回信号) 149 | return dout * self.mask 150 | 151 | 152 | class BatchNormalization(Layer): 153 | """ 154 | 批标准化:以进行学习时的mini-batch为单位,按mini batch进行标准化 155 | 156 | 具体而言,就是进行使数据分布的均值为0、方差为1的标准化 157 | 均值 u_mean = sum(x)/len(x) 158 | 方差 var = sum((x - u_mean)^2) / len(x) 159 | 最终 x_new = (x - u_mean) / sqrt(var + 微小值) —— 减均值除标准差 160 | 161 | 此外,会将x做一个平移和缩放,一开始gamma为1,beta为0 162 | 即 y = gamma * x_new + beta 163 | 164 | 详见 http://arxiv.org/abs/1502.03167 165 | """ 166 | 167 | def __init__(self, 168 | gamma: np.ndarray, 169 | beta: np.ndarray, 170 | momentum: int = 0.9, 171 | running_mean: Optional[float] = None, 172 | running_var: Optional[float] = None) -> None: 173 | self.gamma = gamma 174 | self.beta = beta 175 | self.momentum = momentum 176 | self.input_shape = None 177 | 178 | # 测试时使用的平均值和方差 179 | 
self.running_mean = running_mean 180 | self.running_var = running_var 181 | 182 | # backward时使用的中间数据 183 | self.batch_size = None 184 | self.x_sub_mean_u = None 185 | self.std = None 186 | self.dgamma = None 187 | self.dbeta = None 188 | 189 | def _forward(self, x: np.ndarray, train_flg: bool = True) -> np.ndarray: 190 | self.input_shape = x.shape 191 | if x.ndim != 2: 192 | # 高维矩阵全部转为二维 193 | # 本项目中,Conv层的情况下为4维,全连接层的情况下为2维 194 | x = x.reshape(x.shape[0], -1) 195 | 196 | if self.running_mean is None: 197 | _, D = x.shape 198 | self.running_mean = np.zeros(D) 199 | self.running_var = np.zeros(D) 200 | 201 | # 训练时 202 | if train_flg: 203 | # 计算均值方差,然后减均值除标准差 204 | mean_u = x.mean(axis=0) 205 | x_sub_mean_u = x - mean_u 206 | var = np.mean(x_sub_mean_u**2, axis=0) 207 | std = np.sqrt(var + 10e-7) 208 | x_new = x_sub_mean_u / std 209 | 210 | self.batch_size = x.shape[0] 211 | self.x_sub_mean_u = x_sub_mean_u 212 | self.x_new = x_new 213 | self.std = std 214 | # 按momentum更新运行时均值和标准差 215 | self.running_mean = self.momentum * self.running_mean + ( 216 | 1 - self.momentum) * mean_u 217 | self.running_var = self.momentum * self.running_var + ( 218 | 1 - self.momentum) * var 219 | 220 | # 测试时 221 | else: 222 | x_sub_mean_u = x - self.running_mean 223 | x_new = x_sub_mean_u / ((np.sqrt(self.running_var + 10e-7))) 224 | 225 | out = self.gamma * x_new + self.beta 226 | 227 | return out.reshape(*self.input_shape) 228 | 229 | def _backward(self, dout: np.ndarray) -> np.ndarray: 230 | if dout.ndim != 2: 231 | dout = dout.reshape(dout.shape[0], -1) 232 | 233 | # BN的反向传播有些复杂,参考博客 234 | # Understanding the backward pass through Batch Normalization Layer 235 | self.dgamma = np.sum(self.x_new * dout, axis=0) 236 | self.dbeta = dout.sum(axis=0) 237 | 238 | dx_new = self.gamma * dout 239 | dx_sub_mean_u = dx_new / self.std 240 | dstd = -np.sum( 241 | (dx_new * self.x_sub_mean_u) / (self.std * self.std), axis=0) 242 | dvar = 0.5 * dstd / self.std 243 | dx_sub_mean_u += (2.0 / self.batch_size) * self.x_sub_mean_u * dvar 244 | dmean_u = np.sum(dx_sub_mean_u, axis=0) 245 | 246 | dx = dx_sub_mean_u - dmean_u / self.batch_size 247 | 248 | return dx.reshape(*self.input_shape) 249 | 250 | 251 | class Convolution(Layer): 252 | """ 卷积层 """ 253 | 254 | def __init__(self, 255 | W: np.ndarray, 256 | b: np.ndarray, 257 | stride: int = 1, 258 | pad: int = 0): 259 | self.W = W 260 | self.b = b 261 | self.stride = stride 262 | self.pad = pad 263 | 264 | # 中间数据(backward时使用) 265 | self.x = None 266 | self.col = None 267 | self.col_W = None 268 | 269 | # 权重和偏置参数的梯度 270 | self.dW = None 271 | self.db = None 272 | 273 | def _forward(self, x: np.ndarray) -> np.ndarray: 274 | # 卷积的前向运算,本质就是 y = W * X + b 275 | # 其中X是输入数组,一般是NCHW,W是卷积核,一般是FCHW(F为卷积核个数) 276 | self.x = x 277 | FN, _, FH, FW = self.W.shape 278 | N, _, H, W = x.shape 279 | 280 | # im2col后得到了卷积核扫过二维矩阵,两个展开矩阵运算恰好能完成卷积过程 281 | self.col = im2col(x, FH, FW, self.stride, self.pad) # 使用im2col展开图像 282 | self.col_W = self.W.reshape(FN, -1).T # 滤波器的展开,-1会自动计算个数 283 | 284 | # 计算输出形状 285 | out_h = 1 + int((H + 2 * self.pad - FH) / self.stride) 286 | out_w = 1 + int((W + 2 * self.pad - FW) / self.stride) 287 | 288 | out: np.ndarray = np.dot(self.col, self.col_W) + self.b 289 | # transpose转换轴的顺序, (N,H,W,C) → (N,C,H,W) 290 | return out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2) 291 | 292 | def _backward(self, dout: np.ndarray) -> np.ndarray: 293 | # 卷积的反向传播类似affine的逻辑,分别求出db、dW、dx即可 294 | 295 | FN, C, FH, FW = self.W.shape 296 | # transpose成NHWC,reshape为二维矩阵 297 | dout = 
dout.transpose(0, 2, 3, 1).reshape(-1, FN) 298 | 299 | # db = dout第零轴方向上的和(涉及矩阵求导) 300 | self.db = np.sum(dout, axis=0) 301 | 302 | # dW = x的转置 * dout(涉及矩阵求导) 303 | # 但这里注意还需要将dW的形状转回去 304 | self.dW = np.dot(self.col.T, dout) 305 | self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW) 306 | 307 | # dx = dout * W的转置(涉及矩阵求导) 308 | # 同样注意将形状转回去,col2im二维转四维 309 | dcol = np.dot(dout, self.col_W.T) 310 | return col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad) 311 | 312 | 313 | class Pooling(Layer): 314 | """ Max pooling """ 315 | 316 | def __init__(self, 317 | pool_h: int, 318 | pool_w: int, 319 | stride: int = 1, 320 | pad: int = 0) -> None: 321 | self.pool_h = pool_h 322 | self.pool_w = pool_w 323 | self.stride = stride 324 | self.pad = pad 325 | 326 | self.x = None 327 | self.arg_max = None # 最大值位置,用于反向传播 328 | 329 | def _forward(self, x: np.ndarray) -> np.ndarray: 330 | self.x = x 331 | 332 | N, C, H, W = x.shape 333 | # 计算输出形状 334 | out_h = int(1 + (H - self.pool_h) / self.stride) 335 | out_w = int(1 + (W - self.pool_w) / self.stride) 336 | 337 | # 二维展开image 338 | col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad) 339 | # reshape成每行为一层池化扫描过的矩阵 340 | col = col.reshape(-1, self.pool_h * self.pool_w) # N * 扫描个数 341 | 342 | # 取最大值索引并寻找最大值 343 | self.arg_max = np.argmax(col, axis=1) 344 | # out: np.ndarray = col[range(col.shape[0]), self.arg_max] 345 | # 相当于out = np.max(col, axis=1),但这样直接求值相当于算两次最大值 346 | # 这里取值方法是迭代 range => data[0, arg[0]], data[1, arg[1]]... 347 | # 也是官方文档中推荐的做法,详情见 348 | # https://numpy.org/doc/stable/user/quickstart.html#indexing-with-arrays-of-indices 349 | out: np.ndarray = col[range(col.shape[0]), self.arg_max] 350 | 351 | # 知识:NCHW中,按照[W H C N]的顺序放元素,先走W再走H最后是C和N 352 | # 以RGB为例即 'RRRRRR GGGGGG BBBBBB' 这种形式 353 | # 而NHWC按照[C W H N] 的顺序放元素,先走C再走W最后是H和N 354 | # 以RGB为例即 'RGB RGB RGB RGB RGB RGB' 这种形式 355 | 356 | # 这里我们需要NCHW的输出,但如果直接reshape成NCHW的话 357 | # 输出的并不是按卷积核扫过的顺序(有疑惑的同学可以断点调试看一下输出) 358 | # 所以我们借助NHWC中间态然后reshape成NCHW 359 | return out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2) 360 | 361 | def _backward(self, dout: np.ndarray) -> np.ndarray: 362 | """ Pooling层最大池化的反向传播 363 | 364 | 1、padding 365 | 2、映射回原最大值位置,其他取0 366 | """ 367 | # reshape成NHWC 368 | dout = dout.transpose(0, 2, 3, 1) 369 | 370 | pool_size = self.pool_h * self.pool_w 371 | 372 | # 临时二维矩阵dmax,N * 滤波核面积,例如432000 * 4 373 | dmax = np.zeros((dout.size, pool_size)) 374 | 375 | # 将dout中,对应之前arg_max的位置赋上导数值,其他都为0 376 | dmax[np.arange(self.arg_max.size), 377 | self.arg_max.flatten()] = dout.flatten() 378 | 379 | # reshape后,变为五维,NHWC + size,如(100, 12, 12, 30, 4) 380 | dmax = dmax.reshape(dout.shape + (pool_size, )) 381 | 382 | # 再使用dmax reshape成二维dcol的形状, 如(14400, 120) 383 | dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1) 384 | 385 | # 调用col2im返回最终反向结果 386 | # 为什么要转来转去呢?就可以不用或者少用低效率的for循环来处理 387 | return col2im(dcol, self.x.shape, self.pool_h, self.pool_w, 388 | self.stride, self.pad) 389 | -------------------------------------------------------------------------------- /tinynn/mnist.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import os.path 3 | import gzip 4 | import pickle 5 | import os 6 | import numpy as np 7 | 8 | 9 | def _load_label(file_path: str) -> np.ndarray: 10 | 11 | print(f"Converting {file_path} to NumPy Array ...") 12 | with gzip.open(file_path, 'rb') as f: 13 | labels = np.frombuffer(f.read(), np.uint8, offset=8) 14 | print("Done") 15 | 16 | return labels 17 | 18 | 19 | 
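# Note on the IDX format used by the raw MNIST files: the label files carry an
# 8-byte header (magic number + item count), hence offset=8 in _load_label above;
# the image files carry a 16-byte header (magic number, image count, rows, cols),
# hence offset=16 in _load_img below, followed by raw pixels that reshape to
# 784 = 28 * 28 values per image.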
def _load_img(file_path: str) -> np.ndarray: 20 | print(f"Converting {file_path} to NumPy Array ...") 21 | with gzip.open(file_path, 'rb') as f: 22 | data = np.frombuffer(f.read(), np.uint8, offset=16) 23 | data = data.reshape(-1, 784) 24 | print("Done") 25 | 26 | return data 27 | 28 | 29 | def init_mnist(dataset_dir: str) -> None: 30 | url_base = 'http://yann.lecun.com/exdb/mnist/' 31 | key_file = { 32 | 'train_img': 'train-images-idx3-ubyte.gz', 33 | 'train_label': 'train-labels-idx1-ubyte.gz', 34 | 'test_img': 't10k-images-idx3-ubyte.gz', 35 | 'test_label': 't10k-labels-idx1-ubyte.gz' 36 | } 37 | for file_name in key_file.values(): 38 | file_path = dataset_dir + file_name 39 | 40 | if not os.path.exists(file_path): 41 | print(f"Downloading {file_name}... ") 42 | urllib.request.urlretrieve(url_base + file_name, file_path) 43 | print("Done") 44 | 45 | dataset = {} 46 | dataset['train_img'] = _load_img(dataset_dir + key_file['train_img']) 47 | dataset['train_label'] = _load_label(dataset_dir + key_file['train_label']) 48 | dataset['test_img'] = _load_img(dataset_dir + key_file['test_img']) 49 | dataset['test_label'] = _load_label(dataset_dir + key_file['test_label']) 50 | 51 | print("Creating pickle file ...") 52 | with open(dataset_dir + "mnist.pkl", 'wb') as f: 53 | pickle.dump(dataset, f, -1) 54 | print("Done!") 55 | 56 | 57 | def _change_one_hot_label(labels: np.ndarray) -> np.ndarray: 58 | one_hot_labels = np.zeros((labels.size, 10)) 59 | for idx, row in enumerate(one_hot_labels): 60 | row[labels[idx]] = 1 61 | 62 | return one_hot_labels 63 | 64 | 65 | def load_mnist( 66 | normalize: bool = True, 67 | flatten: bool = True, 68 | one_hot_label: bool = False 69 | ) -> tuple[tuple[np.ndarray, np.ndarray], tuple[np.ndarray, np.ndarray]]: 70 | """读入MNIST数据集,如果没有则下载到 {workdir}/dataset目录下 71 | 72 | Args: 73 | normalize (bool, optional): 将图像的像素值正规化为0.0~1.0. 74 | Defaults to True. 75 | flatten (bool, optional): 是否将图像展开为一维数组. Defaults to True. 76 | one_hot_label (bool, optional): 标签是否作为one-hot数组返回. 77 | Defaults to False. 
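
    Example (a quick sanity check; requires network access or a previously
    downloaded {workdir}/dataset/mnist.pkl):

        (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True,
                                                           flatten=False)
        print(x_train.shape)  # (60000, 1, 28, 28)
        print(t_train.shape)  # (60000,)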
78 | 79 | Returns: 80 | Tuple[tuple, tuple]: (训练图像, 训练标签), (测试图像, 测试标签) 81 | """ 82 | dataset_dir = f'{os.getcwd()}/dataset/' 83 | if not os.path.exists(dataset_dir): 84 | os.mkdir(dataset_dir) 85 | if not os.path.exists(dataset_dir + "mnist.pkl"): 86 | init_mnist(dataset_dir) 87 | 88 | with open(dataset_dir + "mnist.pkl", 'rb') as f: 89 | dataset = pickle.load(f) 90 | 91 | if normalize: 92 | for key in ('train_img', 'test_img'): 93 | dataset[key] = dataset[key].astype(np.float32) / 255.0 94 | 95 | if one_hot_label: 96 | dataset['train_label'] = _change_one_hot_label(dataset['train_label']) 97 | dataset['test_label'] = _change_one_hot_label(dataset['test_label']) 98 | 99 | if not flatten: 100 | for key in ('train_img', 'test_img'): 101 | dataset[key] = dataset[key].reshape(-1, 1, 28, 28) 102 | 103 | return (dataset['train_img'], 104 | dataset['train_label']), (dataset['test_img'], 105 | dataset['test_label']) 106 | -------------------------------------------------------------------------------- /tinynn/optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Optimizer(object): 5 | 6 | def update(self, params: dict, grads: dict) -> None: 7 | self._update(params, grads) 8 | 9 | def _update(self, *_) -> None: 10 | raise NotImplementedError 11 | 12 | 13 | class SGD(Optimizer): 14 | """ 随机梯度下降法(Stochastic Gradient Descent) """ 15 | 16 | def __init__(self, lr: float = 0.01): 17 | self.lr = lr 18 | 19 | def _update(self, params: dict, grads: dict) -> None: 20 | # python中字典属于传引用 21 | for key in params.keys(): 22 | # W = W - lr * grads 23 | params[key] -= self.lr * grads[key] 24 | 25 | 26 | class Momentum(Optimizer): 27 | """Momentum SGD""" 28 | 29 | def __init__(self, lr: float = 0.01, momentum: float = 0.9) -> None: 30 | self.lr = lr 31 | self.momentum = momentum 32 | self.velocity: dict = None 33 | 34 | def _update(self, params: dict, grads: dict) -> None: 35 | if self.velocity is None: 36 | # 初始化velocity字典 37 | self.velocity = {} 38 | for key, val in params.items(): 39 | self.velocity[key] = np.zeros_like(val) 40 | 41 | for key in params.keys(): 42 | # v = momentum*v_old - lr * grad 43 | self.velocity[key] = self.momentum * self.velocity[ 44 | key] - self.lr * grads[key] 45 | params[key] += self.velocity[key] 46 | 47 | 48 | class AdaGrad(Optimizer): 49 | """AdaGrad,学习率自动衰减的优化器""" 50 | 51 | def __init__(self, lr: float = 0.01) -> None: 52 | self.lr = lr 53 | self.h = None 54 | 55 | def _update(self, params: dict, grads: dict) -> None: 56 | if self.h is None: 57 | self.h = {} 58 | for key, val in params.items(): 59 | self.h[key] = np.zeros_like(val) 60 | 61 | for key in params.keys(): 62 | # h += grad ^ 2 63 | # W -= lr * grad / sqrt(h) 64 | self.h[key] += grads[key] * grads[key] 65 | params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7) 66 | 67 | 68 | class RMSprop(Optimizer): 69 | """RMSprop 70 | 71 | 一般AdaGrad的平方和不断累加,就会导致越到后面更新量越小,趋于0。 72 | 73 | RMSProp方法并不是将过去所有的梯度一视同仁地相加, 74 | 而是逐渐地遗忘过去的梯度,在做加法运算时将新梯度的信息更多地反映出来 75 | """ 76 | 77 | def __init__(self, lr: float = 0.01, decay_rate: float = 0.99): 78 | self.lr = lr 79 | self.decay_rate = decay_rate 80 | self.h = None 81 | 82 | def _update(self, params: dict, grads: dict) -> None: 83 | if self.h is None: 84 | self.h = {} 85 | for key, val in params.items(): 86 | self.h[key] = np.zeros_like(val) 87 | 88 | for key in params.keys(): 89 | # h * decay_rate 让h不会无限增加 90 | self.h[key] *= self.decay_rate 91 | self.h[key] += (1 - self.decay_rate) * grads[key] * 
grads[key]
92 |             params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)
93 | 
94 | 
95 | class Adam(Optimizer):
96 |     """Adam (http://arxiv.org/abs/1412.6980v8)
97 | 
98 |     Momentum + Adagrad 的思路合并形成Adam
99 |     """
100 | 
101 |     def __init__(self,
102 |                  lr: float = 0.001,
103 |                  beta1: float = 0.9,
104 |                  beta2: float = 0.999):
105 |         self.lr = lr
106 |         self.beta1 = beta1  # 一阶矩估计的衰减系数
107 |         self.beta2 = beta2  # 二阶矩估计的衰减系数
108 |         self.iter = 0
109 |         self.m: dict = None
110 |         self.v: dict = None
111 | 
112 |     def _update(self, params: dict, grads: dict) -> None:
113 |         if self.m is None:
114 |             self.m, self.v = {}, {}
115 |             for key, val in params.items():
116 |                 self.m[key] = np.zeros_like(val)
117 |                 self.v[key] = np.zeros_like(val)
118 | 
119 |         self.iter += 1
120 |         # 新学习率 = lr * sqrt(1 - beta2**iter) / (1 - beta1**iter)
121 |         lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (
122 |             1.0 - self.beta1**self.iter)
123 | 
124 |         for key in params.keys():
125 |             # self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key]
126 |             # self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2)
127 |             self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
128 |             self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key])
129 | 
130 |             params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
131 | 
132 |             # unbias_m += (1 - self.beta1) * (grads[key] - self.m[key])  # correct bias
133 |             # unbias_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key])  # correct bias
134 |             # params[key] += self.lr * unbias_m / (np.sqrt(unbias_b) + 1e-7)
135 | 
--------------------------------------------------------------------------------
/tinynn/trainer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import numpy as np
3 | 
4 | from tinynn.optimizer import (SGD, AdaGrad, Adam, Momentum, RMSprop)
5 | 
6 | 
7 | class Trainer:
8 |     """ 辅助进行神经网络的训练的类 """
9 | 
10 |     def __init__(self,
11 |                  network: object,
12 |                  x_train: np.ndarray,
13 |                  t_train: np.ndarray,
14 |                  x_test: np.ndarray,
15 |                  t_test: np.ndarray,
16 |                  epochs: int = 20,
17 |                  mini_batch_size: int = 100,
18 |                  optimizer: str = 'SGD',
19 |                  optimizer_param: dict = {'lr': 0.01},
20 |                  evaluate_sample_num_per_epoch: Optional[int] = None,
21 |                  verbose: bool = True) -> None:
22 |         self.network = network
23 |         self.verbose = verbose
24 |         self.x_train = x_train
25 |         self.t_train = t_train
26 |         self.x_test = x_test
27 |         self.t_test = t_test
28 |         self.epochs = epochs
29 |         self.batch_size = mini_batch_size
30 |         self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch
31 | 
32 |         # optimizer
33 |         optimizer_class_dict = {
34 |             'sgd': SGD,
35 |             'momentum': Momentum,
36 |             'adagrad': AdaGrad,
37 |             'rmsprop': RMSprop,
38 |             'adam': Adam,
39 |         }
40 |         # 传入对应字符串,小写化后选择对应的优化器类
41 |         # 随后把optimizer_param作参数传进去创建优化器类
42 |         self.optimizer = optimizer_class_dict[optimizer.lower()](
43 |             **optimizer_param)
44 | 
45 |         self.train_size = x_train.shape[0]
46 |         self.iter_per_epoch = max(self.train_size / mini_batch_size, 1)
47 |         self.max_iter = int(epochs * self.iter_per_epoch)
48 |         self.current_iter = 0
49 |         self.current_epoch = 0
50 | 
51 |         self.train_loss_list = []
52 |         self.train_acc_list = []
53 |         self.test_acc_list = []
54 | 
55 |     def train_step(self) -> None:
56 |         batch_mask = np.random.choice(self.train_size, self.batch_size)
57 |         x_batch = self.x_train[batch_mask]
58 |         t_batch = self.t_train[batch_mask]
59 | 
60 |         grads = self.network.gradient(x_batch, t_batch)
61 | 
self.optimizer.update(self.network.params, grads) 62 | 63 | loss = self.network.loss(x_batch, t_batch) 64 | self.train_loss_list.append(loss) 65 | if self.verbose: 66 | print(f"train loss: {loss}") 67 | 68 | if self.current_iter % self.iter_per_epoch == 0: 69 | self.current_epoch += 1 70 | 71 | x_train_sample, t_train_sample = self.x_train, self.t_train 72 | x_test_sample, t_test_sample = self.x_test, self.t_test 73 | if not self.evaluate_sample_num_per_epoch is None: 74 | t = self.evaluate_sample_num_per_epoch 75 | x_train_sample, t_train_sample = self.x_train[: 76 | t], self.t_train[: 77 | t] 78 | x_test_sample, t_test_sample = self.x_test[:t], self.t_test[:t] 79 | 80 | train_acc = self.network.accuracy(x_train_sample, t_train_sample) 81 | test_acc = self.network.accuracy(x_test_sample, t_test_sample) 82 | self.train_acc_list.append(train_acc) 83 | self.test_acc_list.append(test_acc) 84 | 85 | if self.verbose: 86 | print( 87 | f"=== epoch: {self.current_epoch}, train acc: {train_acc}, " 88 | f"test acc: {test_acc} ===") 89 | self.current_iter += 1 90 | 91 | def train(self) -> None: 92 | for _ in range(self.max_iter): 93 | self.train_step() 94 | 95 | test_acc = self.network.accuracy(self.x_test, self.t_test) 96 | 97 | if self.verbose: 98 | print("=============== Final Test Accuracy ===============") 99 | print(f"test acc: {test_acc}") 100 | -------------------------------------------------------------------------------- /tinynn/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def smooth_curve(x: np.ndarray) -> np.ndarray: 5 | """用于使损失函数的图形变圆滑 6 | 7 | 参考:http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html 8 | """ 9 | window_len = 11 10 | s = np.r_[x[window_len - 1:0:-1], x, x[-1:-window_len:-1]] 11 | w = np.kaiser(window_len, 2) 12 | y = np.convolve(w / w.sum(), s, mode='valid') 13 | return y[5:len(y) - 5] 14 | 15 | 16 | def shuffle_dataset(x: np.ndarray, 17 | t: np.ndarray) -> tuple[np.ndarray, np.ndarray]: 18 | """ 打乱数据集 """ 19 | permutation = np.random.permutation(x.shape[0]) 20 | x = x[permutation, :] if x.ndim == 2 else x[permutation, :, :, :] 21 | t = t[permutation] 22 | return x, t 23 | 24 | 25 | def im2col(input_data: np.ndarray, 26 | filter_h: int, 27 | filter_w: int, 28 | stride: int = 1, 29 | pad: int = 0) -> np.ndarray: 30 | """将图像展开为二维矩阵,展开后可以减少几层for循环的卷积运算 31 | 32 | 其本质为将一次滤波器应用的区域的数据横向展开为一列 33 | 34 | Args: 35 | input_data (np.ndarray): 由(数据量, 通道, 高, 长)的4维数组构成的输入数据 36 | 即NCHW 37 | filter_h (int): 滤波器的高 38 | filter_w (int): 滤波器的长 39 | stride (int, optional): 步幅. Defaults to 1. 40 | pad (int, optional): 填充. Defaults to 0. 
41 | 42 | Examples 43 | 44 | x1 = np.random.rand(1, 3, 7, 7) # 批大小1,通道3,7*7数据 45 | col1 = im2col(x1, 5, 5, stride=1, pad=0) # 滤波器通道3,大小5*5 46 | print(col1.shape) # (9, 75) 47 | 48 | x2 = np.random.rand(10, 3, 7, 7) # 10个数据 49 | col2 = im2col(x2, 5, 5, stride=1, pad=0) 50 | print(col2.shape) # (90, 75) 51 | 52 | Returns: 53 | np.ndarray: 2维数组 54 | 55 | """ 56 | N, C, H, W = input_data.shape 57 | 58 | # 计算卷积处理后的形状公式 59 | out_h = (H + 2 * pad - filter_h) // stride + 1 60 | out_w = (W + 2 * pad - filter_w) // stride + 1 61 | 62 | # np.pad(constant)对input data的第三四维填充p 63 | img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 64 | 'constant') 65 | # 六维数组col (N, C, filter_h, filter_w, out_h, out_w) 66 | col = np.zeros((N, C, filter_h, filter_w, out_h, out_w)) 67 | 68 | for j in range(filter_h): 69 | j_max = j + stride * out_h 70 | for i in range(filter_w): 71 | i_max = i + stride * out_w 72 | # 知识:np.array[1:5:2]指的是从index1拿到index4(右开)的数据,stride为2 73 | # 这里即将img中j~j_max(H), i~i_max(W)的数据,以stride步长赋给col 74 | # j~j_max(H), i~i_max(W) 即一次卷积核处理扫过的所有区域 75 | col[:, :, j, i, :, :] = img[:, :, j:j_max:stride, i:i_max:stride] 76 | 77 | # transpose后,col变为 (N, out_h, out_W, C, filter_h, filter_w) 78 | # 这样调整后再reshape为 (N * out_h * out_w)行, C * filter_h * filter_w列的数据 79 | # 即每一列都为一次滤波器应用后的区域 80 | return col.transpose(0, 4, 5, 1, 2, 3).reshape(N * out_h * out_w, -1) 81 | 82 | 83 | def col2im(col: np.ndarray, 84 | input_shape: tuple, 85 | filter_h: int, 86 | filter_w: int, 87 | stride: int = 1, 88 | pad: int = 0) -> np.ndarray: 89 | """将二维矩阵展开为NCHW 90 | 91 | im2col方法的逆过程。注意:只适用于dout的输入,即backward的时候使用 92 | 93 | Args: 94 | col (np.ndarray): im2col得到的二维矩阵 95 | input_shape (tuple): 输入形状 96 | filter_h (int): 滤波器高 97 | filter_w (int): 滤波器长 98 | stride (int, optional): 步长. Defaults to 1. 99 | pad (int, optional): 填充量. Defaults to 0. 100 | 101 | Returns: 102 | np.ndarray: NCHW 103 | """ 104 | N, C, H, W = input_shape 105 | out_h = (H + 2 * pad - filter_h) // stride + 1 106 | out_w = (W + 2 * pad - filter_w) // stride + 1 107 | 108 | # 六维数组col (N, filter_h, filter_w, C, out_h, out_w) 109 | col = col.reshape(N, out_h, out_w, C, filter_h, 110 | filter_w).transpose(0, 3, 4, 5, 1, 2) 111 | 112 | # 目标img NCHW 113 | img = np.zeros((N, C, H + 2 * pad + stride - 1, W + 2 * pad + stride - 1)) 114 | 115 | for j in range(filter_h): 116 | j_max = j + stride * out_h 117 | for i in range(filter_w): 118 | i_max = i + stride * out_w 119 | # 这里即将col中j~j_max(H), i~i_max(W)的数据,以stride步长赋给img 120 | # j~j_max(H), i~i_max(W) 即一次卷积核处理扫过的所有区域 121 | # 注意此处部分区域可能会多次加和,所以只适用于backward阶段 122 | img[:, :, j:j_max:stride, i:i_max:stride] += col[:, :, j, i, :, :] 123 | 124 | # pad:H+pad 而不是 0:H,不会将pad的值算入img 125 | return img[:, :, pad:H + pad, pad:W + pad] --------------------------------------------------------------------------------
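A quick way to convince yourself that the `im2col`-based forward pass in `Convolution` really computes an ordinary convolution is to compare it against a naive quadruple loop on a small random input. The sketch below is not part of the repository; it only assumes the package has been installed so that `tinynn.util.im2col` is importable, and it uses `stride=1, pad=0` to keep the reference loop simple.

```python
import numpy as np
from tinynn.util import im2col

np.random.seed(0)

N, C, H, W = 2, 3, 7, 7      # batch, channels, height, width
FN, FH, FW = 4, 5, 5         # number of filters, filter height, filter width
stride, pad = 1, 0

x = np.random.rand(N, C, H, W)
weight = np.random.rand(FN, C, FH, FW)
bias = np.zeros(FN)

out_h = 1 + (H + 2 * pad - FH) // stride
out_w = 1 + (W + 2 * pad - FW) // stride

# im2col-based forward pass, mirroring Convolution._forward:
# every row of `col` is one receptive field, every column of `col_W` one filter
col = im2col(x, FH, FW, stride, pad)    # (N*out_h*out_w, C*FH*FW)
col_W = weight.reshape(FN, -1).T        # (C*FH*FW, FN)
out = np.dot(col, col_W) + bias
out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

# reference: slide each filter over each output position explicitly
ref = np.zeros((N, FN, out_h, out_w))
for n in range(N):
    for f in range(FN):
        for i in range(out_h):
            for j in range(out_w):
                patch = x[n, :, i:i + FH, j:j + FW]
                ref[n, f, i, j] = np.sum(patch * weight[f]) + bias[f]

print(out.shape)              # (2, 4, 3, 3)
print(np.allclose(out, ref))  # True
```

The same row-wise expansion is what lets `Pooling` replace its window loop with a single `np.argmax` over each row, and why both layers can fall back on `col2im` to scatter gradients back into image shape during the backward pass.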