├── common
│   ├── __init__.py
│   ├── functions.py
│   ├── gradient.py
│   ├── util.py
│   ├── trainer.py
│   ├── optimizer.py
│   ├── multi_layer_net.py
│   ├── multi_layer_net_extend.py
│   └── layers.py
├── dataset
│   ├── __init__.py
│   └── mnist.py
├── ch01
│   ├── hungry.py
│   ├── img_show.py
│   ├── sin_graph.py
│   ├── simple_graph.py
│   ├── man.py
│   └── sin_cos_graph.py
├── ch02
│   ├── xor_gate.py
│   ├── or_gate.py
│   ├── and_gate.py
│   └── nand_gate.py
├── ch03
│   ├── relu.py
│   ├── sigmoid.py
│   ├── step_function.py
│   ├── sig_step_compare.py
│   ├── mnist_show.py
│   ├── neuralnet_mnist.py
│   └── neuralnet_mnist_batch.py
├── ch04
│   ├── gradient_1d.py
│   ├── gradient_simplenet.py
│   ├── gradient_method.py
│   ├── gradient_2d.py
│   ├── train_neuralnet.py
│   └── two_layer_net.py
├── ch05
│   ├── buy_apple.py
│   ├── layer_naive.py
│   ├── gradient_check.py
│   ├── buy_apple_orange.py
│   ├── train_neuralnet.py
│   └── two_layer_net.py
├── ch06
│   ├── batch_norm_gradient_check.py
│   ├── weight_init_activation_histogram.py
│   ├── overfit_dropout.py
│   ├── optimizer_compare_naive.py
│   ├── weight_init_compare.py
│   ├── optimizer_compare_mnist.py
│   ├── overfit_weight_decay.py
│   ├── hyperparameter_optimization.py
│   └── batch_norm_test.py
├── ch07
│   ├── gradient_check.py
│   ├── visualize_filter.py
│   ├── train_convnet.py
│   ├── apply_filter.py
│   └── simple_convnet.py
├── ch08
│   ├── awesome_net.py
│   ├── train_deepnet.py
│   ├── half_float_network.py
│   ├── misclassified_mnist.py
│   └── deep_convnet.py
├── .gitignore
├── README.md
├── LICENSE.md
└── 深度学习入门笔记.md

/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ch01/hungry.py:
--------------------------------------------------------------------------------
1 | print("I'm hungry!")
2 |
--------------------------------------------------------------------------------
/ch08/awesome_net.py:
--------------------------------------------------------------------------------
1 | # Create your awesome net!!
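
# ——补充示例(并非书中原代码,只是一个假设性的草图)——
# awesome_net.py 原本只有上面一行注释,留给读者自己实现网络。
# 下面演示如何像 ch05/two_layer_net.py 那样,用 common/layers.py 中现成的
# Affine / Relu / SoftmaxWithLoss 层拼出一个最小的两层网络,并做一次前向传播:

import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
from collections import OrderedDict
from common.layers import Affine, Relu, SoftmaxWithLoss

layers = OrderedDict()
layers['Affine1'] = Affine(0.01 * np.random.randn(784, 100), np.zeros(100))
layers['Relu1'] = Relu()
layers['Affine2'] = Affine(0.01 * np.random.randn(100, 10), np.zeros(10))
last_layer = SoftmaxWithLoss()  # 损失层,训练时才会用到

x = np.random.rand(2, 784)      # 两个随机样本作为输入
for layer in layers.values():   # 依次前向传播
    x = layer.forward(x)
print(x.shape)                  # (2, 10)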
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | *.gz 4 | *.pyc 5 | __pycache__/ 6 | *.tar 7 | *.tgz 8 | *.png 9 | *.jpg 10 | *.pkl 11 | -------------------------------------------------------------------------------- /ch01/img_show.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import matplotlib.pyplot as plt 3 | from matplotlib.image import imread 4 | 5 | img = imread('../dataset/lena.png') #读入图像 6 | plt.imshow(img) 7 | 8 | plt.show() -------------------------------------------------------------------------------- /ch01/sin_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) 7 | y = np.sin(x) 8 | 9 | # 绘制图形 10 | plt.plot(x, y) 11 | plt.show() 12 | -------------------------------------------------------------------------------- /ch01/simple_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) # 以0.1为单位,生成0到6的数据 7 | y = np.sin(x) 8 | 9 | # 绘制图形 10 | plt.plot(x, y) 11 | plt.show() -------------------------------------------------------------------------------- /ch03/relu.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def relu(x): 7 | return np.maximum(0, x) 8 | 9 | x = np.arange(-5.0, 5.0, 0.1) 10 | y = relu(x) 11 | plt.plot(x, y) 12 | plt.ylim(-1.0, 5.5) 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch03/sigmoid.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | X = np.arange(-5.0, 5.0, 0.1) 10 | Y = sigmoid(X) 11 | plt.plot(X, Y) 12 | plt.ylim(-0.1, 1.1) 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch03/step_function.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def step_function(x): 7 | return np.array(x > 0, dtype=np.int) 8 | 9 | X = np.arange(-5.0, 5.0, 0.1) 10 | Y = step_function(X) 11 | plt.plot(X, Y) 12 | plt.ylim(-0.1, 1.1) # 指定图中绘制的y轴的范围 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch01/man.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | class Man: 3 | """示例类""" # 示例类 4 | 5 | def __init__(self, name): 6 | self.name = name 7 | print("Initilized!") 8 | 9 | def hello(self): 10 | print("Hello " + self.name + "!") 11 | 12 | def goodbye(self): 13 | print("Good-bye " + self.name + "!") 14 | 15 | m = Man("David") 16 | m.hello() 17 | m.goodbye() -------------------------------------------------------------------------------- /ch02/xor_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from and_gate import AND 3 | from or_gate import OR 4 | from 
nand_gate import NAND 5 | 6 | 7 | def XOR(x1, x2): 8 | s1 = NAND(x1, x2) 9 | s2 = OR(x1, x2) 10 | y = AND(s1, s2) 11 | return y 12 | 13 | if __name__ == '__main__': 14 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 15 | y = XOR(xs[0], xs[1]) 16 | print(str(xs) + " -> " + str(y)) -------------------------------------------------------------------------------- /ch01/sin_cos_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) # 以0.1为单位,生成0到6的数据 7 | y1 = np.sin(x) 8 | y2 = np.cos(x) 9 | 10 | # 绘制图形 11 | plt.plot(x, y1, label="sin") 12 | plt.plot(x, y2, linestyle = "--", label="cos") 13 | plt.xlabel("x") # x轴的标签 14 | plt.ylabel("y") # y轴的标签 15 | plt.title('sin & cos') 16 | plt.legend() 17 | plt.show() -------------------------------------------------------------------------------- /ch02/or_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def OR(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([0.5, 0.5]) 8 | b = -0.2 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = OR(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) -------------------------------------------------------------------------------- /ch02/and_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def AND(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([0.5, 0.5]) 8 | b = -0.7 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = AND(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) 19 | -------------------------------------------------------------------------------- /ch02/nand_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def NAND(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([-0.5, -0.5]) 8 | b = 0.7 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = NAND(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) 19 | -------------------------------------------------------------------------------- /ch03/sig_step_compare.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def step_function(x): 11 | return np.array(x > 0, dtype=np.int) 12 | 13 | x = np.arange(-5.0, 5.0, 0.1) 14 | y1 = sigmoid(x) 15 | y2 = step_function(x) 16 | 17 | plt.plot(x, y1) 18 | plt.plot(x, y2, 'k--') 19 | plt.ylim(-0.1, 1.1) #指定图中绘制的y轴的范围 20 | plt.show() 21 | -------------------------------------------------------------------------------- /ch03/mnist_show.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from PIL import Image 7 | 8 | 9 | def img_show(img): 10 | 
pil_img = Image.fromarray(np.uint8(img)) 11 | pil_img.show() 12 | 13 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False) 14 | 15 | img = x_train[0] 16 | label = t_train[0] 17 | print(label) # 5 18 | 19 | print(img.shape) # (784,) 20 | img = img.reshape(28, 28) # 把图像的形状变为原来的尺寸 21 | print(img.shape) # (28, 28) 22 | 23 | img_show(img) 24 | -------------------------------------------------------------------------------- /ch05/buy_apple.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from layer_naive import * 3 | 4 | 5 | apple = 100 6 | apple_num = 2 7 | tax = 1.1 8 | 9 | mul_apple_layer = MulLayer() 10 | mul_tax_layer = MulLayer() 11 | 12 | # forward 13 | apple_price = mul_apple_layer.forward(apple, apple_num) 14 | price = mul_tax_layer.forward(apple_price, tax) 15 | 16 | # backward 17 | dprice = 1 18 | dapple_price, dtax = mul_tax_layer.backward(dprice) 19 | dapple, dapple_num = mul_apple_layer.backward(dapple_price) 20 | 21 | print("price:", int(price)) 22 | print("dApple:", dapple) 23 | print("dApple_num:", int(dapple_num)) 24 | print("dTax:", dtax) 25 | -------------------------------------------------------------------------------- /ch07/gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | from simple_convnet import SimpleConvNet 4 | 5 | network = SimpleConvNet(input_dim=(1,10, 10), 6 | conv_param = {'filter_num':10, 'filter_size':3, 'pad':0, 'stride':1}, 7 | hidden_size=10, output_size=10, weight_init_std=0.01) 8 | 9 | X = np.random.rand(100).reshape((1, 1, 10, 10)) 10 | T = np.array([1]).reshape((1,1)) 11 | 12 | grad_num = network.numerical_gradient(X, T) 13 | grad = network.gradient(X, T) 14 | 15 | for key, val in grad_num.items(): 16 | print(key, np.abs(grad_num[key] - grad[key]).mean()) -------------------------------------------------------------------------------- /ch04/gradient_1d.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def numerical_diff(f, x): 7 | h = 1e-4 # 0.0001 8 | return (f(x+h) - f(x-h)) / (2*h) 9 | 10 | 11 | def function_1(x): 12 | return 0.01*x**2 + 0.1*x 13 | 14 | 15 | def tangent_line(f, x): 16 | d = numerical_diff(f, x) 17 | print(d) 18 | y = f(x) - d*x # 计算的截距 19 | return lambda t: d*t + y 20 | 21 | x = np.arange(0.0, 20.0, 0.1) 22 | y = function_1(x) 23 | plt.xlabel("x") 24 | plt.ylabel("f(x)") 25 | 26 | tf = tangent_line(function_1, 5) 27 | y2 = tf(x) 28 | 29 | plt.plot(x, y) 30 | plt.plot(x, y2) 31 | plt.show() 32 | -------------------------------------------------------------------------------- /ch05/layer_naive.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | class MulLayer: 5 | def __init__(self): 6 | self.x = None 7 | self.y = None 8 | 9 | def forward(self, x, y): 10 | self.x = x 11 | self.y = y 12 | out = x * y 13 | 14 | return out 15 | 16 | def backward(self, dout): 17 | dx = dout * self.y 18 | dy = dout * self.x 19 | 20 | return dx, dy 21 | 22 | 23 | class AddLayer: 24 | def __init__(self): 25 | pass 26 | 27 | def forward(self, x, y): 28 | out = x + y 29 | 30 | return out 31 | 32 | def backward(self, dout): 33 | dx = dout * 1 34 | dy = dout * 1 35 | 36 | return dx, dy 37 | -------------------------------------------------------------------------------- /ch08/train_deepnet.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from deep_convnet import DeepConvNet 8 | from common.trainer import Trainer 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 14 | epochs=20, mini_batch_size=100, 15 | optimizer='Adam', optimizer_param={'lr':0.001}, 16 | evaluate_sample_num_per_epoch=1000) 17 | trainer.train() 18 | 19 | # 保存参数 20 | network.save_params("deep_convnet_params.pkl") 21 | print("Saved Network Parameters!") 22 | -------------------------------------------------------------------------------- /ch04/gradient_simplenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录中的文件而进行的设定 4 | import numpy as np 5 | from common.functions import softmax, cross_entropy_error 6 | from common.gradient import numerical_gradient 7 | 8 | 9 | class simpleNet: 10 | def __init__(self): 11 | self.W = np.random.randn(2,3) 12 | 13 | def predict(self, x): 14 | return np.dot(x, self.W) 15 | 16 | def loss(self, x, t): 17 | z = self.predict(x) 18 | y = softmax(z) 19 | loss = cross_entropy_error(y, t) 20 | 21 | return loss 22 | 23 | x = np.array([0.6, 0.9]) 24 | t = np.array([0, 0, 1]) 25 | 26 | net = simpleNet() 27 | 28 | f = lambda w: net.loss(x, t) 29 | dW = numerical_gradient(f, net.W) 30 | 31 | print(dW) 32 | -------------------------------------------------------------------------------- /ch05/gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from two_layer_net import TwoLayerNet 7 | 8 | # 读入数据 9 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 10 | 11 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 12 | print(x_train.shape) 13 | x_batch = x_train[:3] 14 | t_batch = t_train[:3] 15 | print(x_batch.shape) 16 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 17 | #grad_backprop = network.gradient(x_batch, t_batch) 18 | 19 | #for key in grad_numerical.keys(): 20 | # diff = np.average( np.abs(grad_backprop[key] - grad_numerical[key]) ) 21 | # print(key + ":" + str(diff)) 22 | -------------------------------------------------------------------------------- /ch06/batch_norm_gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from common.multi_layer_net_extend import MultiLayerNetExtend 7 | 8 | # 读入数据 9 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 10 | 11 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100], output_size=10, 12 | use_batchnorm=True) 13 | 14 | x_batch = x_train[:1] 15 | t_batch = t_train[:1] 16 | 17 | grad_backprop = network.gradient(x_batch, t_batch) 18 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 19 | 20 | 21 | for key in grad_numerical.keys(): 22 | diff = np.average( 
np.abs(grad_backprop[key] - grad_numerical[key]) ) 23 | print(key + ":" + str(diff)) -------------------------------------------------------------------------------- /ch08/half_float_network.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from deep_convnet import DeepConvNet 7 | from dataset.mnist import load_mnist 8 | 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | network.load_params("deep_convnet_params.pkl") 14 | 15 | sampled = 10000 # 为了实现高速化 16 | x_test = x_test[:sampled] 17 | t_test = t_test[:sampled] 18 | 19 | print("caluculate accuracy (float64) ... ") 20 | print(network.accuracy(x_test, t_test)) 21 | 22 | # 转换为float16型 23 | x_test = x_test.astype(np.float16) 24 | for param in network.params.values(): 25 | param[...] = param.astype(np.float16) 26 | 27 | print("caluculate accuracy (float16) ... ") 28 | print(network.accuracy(x_test, t_test)) 29 | -------------------------------------------------------------------------------- /ch07/visualize_filter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from simple_convnet import SimpleConvNet 5 | 6 | def filter_show(filters, nx=8, margin=3, scale=10): 7 | """ 8 | c.f. https://gist.github.com/aidiary/07d530d5e08011832b12#file-draw_weight-py 9 | """ 10 | FN, C, FH, FW = filters.shape 11 | ny = int(np.ceil(FN / nx)) 12 | 13 | fig = plt.figure() 14 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 15 | 16 | for i in range(FN): 17 | ax = fig.add_subplot(ny, nx, i+1, xticks=[], yticks=[]) 18 | ax.imshow(filters[i, 0], cmap=plt.cm.gray_r, interpolation='nearest') 19 | plt.show() 20 | 21 | 22 | network = SimpleConvNet() 23 | # 随机进行初始化后的权重 24 | filter_show(network.params['W1']) 25 | 26 | # 学习后的权重 27 | network.load_params("params.pkl") 28 | filter_show(network.params['W1']) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 深度学习入门 2 | 3 | 4 | 5 | ## 文件结构 6 | 7 | |文件夹名 |说明 | 8 | |:-- |:-- | 9 | |ch01 |第1章使用的源代码 | 10 | |ch02 |第2章使用的源代码 | 11 | |... |... 
|
12 | |ch08 |第8章使用的源代码 |
13 | |common |共同使用的源代码 |
14 | |dataset |数据集用的源代码 |
15 |
16 |
17 | 源代码的解释请参考本书。
18 |
19 | ## 必要条件
20 | 执行源代码需要安装以下软件。
21 |
22 | * Python 3.x
23 | * NumPy
24 | * Matplotlib
25 |
26 | ※Python的版本为Python 3。
27 |
28 | ## 执行方法
29 |
30 | 前进到各章节的文件夹,执行Python命令。
31 |
32 | ```
33 | $ cd ch01
34 | $ python man.py
35 |
36 | $ cd ../ch05
37 | $ python train_neuralnet.py
38 | ```
39 |
40 | ## 使用许可
41 |
42 | 本源代码使用[MIT许可协议](http://www.opensource.org/licenses/MIT)。
43 | 无论是否为商业行为,均可自由使用。
44 |
45 | ## 勘误表
46 |
47 | 本书的勘误信息在以下网址中公开。读者可以在以下网址中查看和提交勘误。
48 |
49 | http://www.ituring.com.cn/book/1921
50 |
51 |
52 |
--------------------------------------------------------------------------------
/ch04/gradient_method.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | import matplotlib.pylab as plt
4 | from gradient_2d import numerical_gradient
5 |
6 |
7 | def gradient_descent(f, init_x, lr=0.01, step_num=100):
8 |     x = init_x
9 |     x_history = []
10 |
11 |     for i in range(step_num):
12 |         x_history.append( x.copy() )
13 |
14 |         grad = numerical_gradient(f, x)
15 |         x -= lr * grad
16 |
17 |     return x, np.array(x_history)
18 |
19 |
20 | def function_2(x):
21 |     return x[0]**2 + x[1]**2
22 |
23 | init_x = np.array([-3.0, 4.0])
24 |
25 | lr = 0.1
26 | step_num = 20
27 | x, x_history = gradient_descent(function_2, init_x, lr=lr, step_num=step_num)
28 |
29 | plt.plot( [-5, 5], [0,0], '--b')
30 | plt.plot( [0,0], [-5, 5], '--b')
31 | plt.plot(x_history[:,0], x_history[:,1], 'o')
32 |
33 | plt.xlim(-3.5, 3.5)
34 | plt.ylim(-4.5, 4.5)
35 | plt.xlabel("X0")
36 | plt.ylabel("X1")
37 | plt.show()
38 |
--------------------------------------------------------------------------------
/ch05/buy_apple_orange.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from layer_naive import *
3 |
4 | apple = 100
5 | apple_num = 2
6 | orange = 150
7 | orange_num = 3
8 | tax = 1.1
9 |
10 | # layer
11 | mul_apple_layer = MulLayer()
12 | mul_orange_layer = MulLayer()
13 | add_apple_orange_layer = AddLayer()
14 | mul_tax_layer = MulLayer()
15 |
16 | # forward
17 | apple_price = mul_apple_layer.forward(apple, apple_num)  # (1)
18 | orange_price = mul_orange_layer.forward(orange, orange_num)  # (2)
19 | all_price = add_apple_orange_layer.forward(apple_price, orange_price)  # (3)
20 | price = mul_tax_layer.forward(all_price, tax)  # (4)
21 |
22 | # backward
23 | dprice = 1
24 | dall_price, dtax = mul_tax_layer.backward(dprice)  # (4)
25 | dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)  # (3)
26 | dorange, dorange_num = mul_orange_layer.backward(dorange_price)  # (2)
27 | dapple, dapple_num = mul_apple_layer.backward(dapple_price)  # (1)
28 |
29 | print("price:", int(price))
30 | print("dApple:", dapple)
31 | print("dApple_num:", int(dapple_num))
32 | print("dOrange:", dorange)
33 | print("dOrange_num:", int(dorange_num))
34 | print("dTax:", dtax)
35 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Koki Saitoh
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ch03/neuralnet_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import pickle 6 | from dataset.mnist import load_mnist 7 | from common.functions import sigmoid, softmax 8 | 9 | 10 | def get_data(): 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False) 12 | return x_test, t_test 13 | 14 | 15 | def init_network(): 16 | with open("sample_weight.pkl", 'rb') as f: 17 | network = pickle.load(f) 18 | return network 19 | 20 | 21 | def predict(network, x): 22 | W1, W2, W3 = network['W1'], network['W2'], network['W3'] 23 | b1, b2, b3 = network['b1'], network['b2'], network['b3'] 24 | 25 | a1 = np.dot(x, W1) + b1 26 | z1 = sigmoid(a1) 27 | a2 = np.dot(z1, W2) + b2 28 | z2 = sigmoid(a2) 29 | a3 = np.dot(z2, W3) + b3 30 | y = softmax(a3) 31 | 32 | return y 33 | 34 | 35 | x, t = get_data() 36 | network = init_network() 37 | accuracy_cnt = 0 38 | for i in range(len(x)): 39 | y = predict(network, x[i]) 40 | p= np.argmax(y) # 获取概率最高的元素的索引 41 | if p == t[i]: 42 | accuracy_cnt += 1 43 | 44 | print("Accuracy:" + str(float(accuracy_cnt) / len(x))) -------------------------------------------------------------------------------- /ch03/neuralnet_mnist_batch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import pickle 6 | from dataset.mnist import load_mnist 7 | from common.functions import sigmoid, softmax 8 | 9 | 10 | def get_data(): 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False) 12 | return x_test, t_test 13 | 14 | 15 | def init_network(): 16 | with open("sample_weight.pkl", 'rb') as f: 17 | network = pickle.load(f) 18 | return network 19 | 20 | 21 | def predict(network, x): 22 | w1, w2, w3 = network['W1'], network['W2'], network['W3'] 23 | b1, b2, b3 = network['b1'], network['b2'], network['b3'] 24 | 25 | a1 = np.dot(x, w1) + b1 26 | z1 = sigmoid(a1) 27 | a2 = np.dot(z1, w2) + b2 28 | z2 = sigmoid(a2) 29 | a3 = np.dot(z2, w3) + b3 30 | y = softmax(a3) 31 | 32 | return y 33 | 34 | 35 | x, t = get_data() 36 | network = init_network() 37 | 38 | batch_size = 100 # 批数量 39 | accuracy_cnt = 0 40 | 41 | for i in range(0, len(x), batch_size): 42 | x_batch = x[i:i+batch_size] 43 | y_batch = predict(network, x_batch) 44 | p = np.argmax(y_batch, axis=1) 45 | accuracy_cnt += np.sum(p == t[i:i+batch_size]) 46 | 47 | 
print("Accuracy:" + str(float(accuracy_cnt) / len(x))) 48 | -------------------------------------------------------------------------------- /common/functions.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def identity_function(x): 6 | return x 7 | 8 | 9 | def step_function(x): 10 | return np.array(x > 0, dtype=np.int) 11 | 12 | 13 | def sigmoid(x): 14 | return 1 / (1 + np.exp(-x)) 15 | 16 | 17 | def sigmoid_grad(x): 18 | return (1.0 - sigmoid(x)) * sigmoid(x) 19 | 20 | 21 | def relu(x): 22 | return np.maximum(0, x) 23 | 24 | 25 | def relu_grad(x): 26 | grad = np.zeros(x) 27 | grad[x>=0] = 1 28 | return grad 29 | 30 | 31 | def softmax(x): 32 | if x.ndim == 2: 33 | x = x.T 34 | x = x - np.max(x, axis=0) 35 | y = np.exp(x) / np.sum(np.exp(x), axis=0) 36 | return y.T 37 | 38 | x = x - np.max(x) # 溢出对策 39 | return np.exp(x) / np.sum(np.exp(x)) 40 | 41 | 42 | def mean_squared_error(y, t): 43 | return 0.5 * np.sum((y-t)**2) 44 | 45 | 46 | def cross_entropy_error(y, t): 47 | if y.ndim == 1: # 这里没有变化 48 | t = t.reshape(1, t.size) 49 | y = y.reshape(1, y.size) 50 | 51 | # 监督数据是one-hot-vector的情况下,转换为正确解标签的索引 52 | if t.size == y.size: 53 | t = t.argmax(axis=1) 54 | 55 | batch_size = y.shape[0] 56 | return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size 57 | 58 | 59 | def softmax_loss(X, t): 60 | y = softmax(X) 61 | return cross_entropy_error(y, t) 62 | -------------------------------------------------------------------------------- /common/gradient.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | def _numerical_gradient_1d(f, x): 5 | h = 1e-4 # 0.0001 6 | grad = np.zeros_like(x) 7 | 8 | for idx in range(x.size): 9 | tmp_val = x[idx] 10 | x[idx] = float(tmp_val) + h 11 | fxh1 = f(x) # f(x+h) 12 | 13 | x[idx] = tmp_val - h 14 | fxh2 = f(x) # f(x-h) 15 | grad[idx] = (fxh1 - fxh2) / (2*h) 16 | 17 | x[idx] = tmp_val # 还原值 18 | 19 | return grad 20 | 21 | 22 | def numerical_gradient_2d(f, X): 23 | if X.ndim == 1: 24 | return _numerical_gradient_1d(f, X) 25 | else: 26 | grad = np.zeros_like(X) 27 | 28 | for idx, x in enumerate(X): 29 | grad[idx] = _numerical_gradient_1d(f, x) 30 | 31 | return grad 32 | 33 | 34 | def numerical_gradient(f, x): 35 | h = 1e-4 # 0.0001 36 | grad = np.zeros_like(x) 37 | 38 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 39 | while not it.finished: 40 | idx = it.multi_index 41 | tmp_val = x[idx] 42 | x[idx] = float(tmp_val) + h 43 | fxh1 = f(x) # f(x+h) 44 | 45 | x[idx] = tmp_val - h 46 | fxh2 = f(x) # f(x-h) 47 | grad[idx] = (fxh1 - fxh2) / (2*h) 48 | 49 | x[idx] = tmp_val # 还原值 50 | it.iternext() 51 | 52 | return grad -------------------------------------------------------------------------------- /ch05/train_neuralnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) 4 | 5 | import numpy as np 6 | from dataset.mnist import load_mnist 7 | from two_layer_net import TwoLayerNet 8 | 9 | # 读入数据 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 11 | 12 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 13 | 14 | iters_num = 10000 15 | train_size = x_train.shape[0] 16 | batch_size = 100 17 | learning_rate = 0.1 18 | 19 | train_loss_list = [] 20 | train_acc_list = [] 21 | test_acc_list = [] 22 | 23 | 
iter_per_epoch = max(train_size / batch_size, 1) 24 | 25 | for i in range(iters_num): 26 | batch_mask = np.random.choice(train_size, batch_size) 27 | x_batch = x_train[batch_mask] 28 | t_batch = t_train[batch_mask] 29 | 30 | # 梯度 31 | #grad = network.numerical_gradient(x_batch, t_batch) 32 | grad = network.gradient(x_batch, t_batch) 33 | 34 | # 更新 35 | for key in ('W1', 'b1', 'W2', 'b2'): 36 | network.params[key] -= learning_rate * grad[key] 37 | 38 | loss = network.loss(x_batch, t_batch) 39 | train_loss_list.append(loss) 40 | 41 | if i % iter_per_epoch == 0: 42 | train_acc = network.accuracy(x_train, t_train) 43 | test_acc = network.accuracy(x_test, t_test) 44 | train_acc_list.append(train_acc) 45 | test_acc_list.append(test_acc) 46 | print(train_acc, test_acc) 47 | -------------------------------------------------------------------------------- /ch06/weight_init_activation_histogram.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def ReLU(x): 11 | return np.maximum(0, x) 12 | 13 | 14 | def tanh(x): 15 | return np.tanh(x) 16 | 17 | input_data = np.random.randn(1000, 100) # 1000个数据 18 | node_num = 100 # 各隐藏层的节点(神经元)数 19 | hidden_layer_size = 5 # 隐藏层有5层 20 | activations = {} # 激活值的结果保存在这里 21 | 22 | x = input_data 23 | 24 | for i in range(hidden_layer_size): 25 | if i != 0: 26 | x = activations[i-1] 27 | 28 | # 改变初始值进行实验! 29 | # w = np.random.randn(node_num, node_num) * 1 30 | # w = np.random.randn(node_num, node_num) * 0.01 31 | w = np.random.randn(node_num, node_num) * np.sqrt(1.0 / node_num) # Xavier 32 | # w = np.random.randn(node_num, node_num) * np.sqrt(2.0 / node_num) # He 33 | # w = np.random.randn(node_num, node_num) * 0.0001 34 | # w = np.random.randn(node_num, node_num) * 10 35 | 36 | 37 | a = np.dot(x, w) # 38 | 39 | 40 | # 将激活函数的种类也改变,来进行实验! 
41 | # z = sigmoid(a) 42 | z = ReLU(a) 43 | # z = tanh(a) 44 | 45 | activations[i] = z 46 | 47 | # 绘制直方图 48 | for i, a in activations.items(): 49 | plt.subplot(1, len(activations), i+1) 50 | plt.title(str(i+1) + "-layer") 51 | if i != 0: plt.yticks([], []) 52 | # plt.xlim(0.1, 1) 53 | # plt.ylim(0, 7000) 54 | plt.hist(a.flatten(), 30, range=(0,1)) 55 | plt.show() 56 | -------------------------------------------------------------------------------- /ch07/train_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from simple_convnet import SimpleConvNet 8 | from common.trainer import Trainer 9 | 10 | # 读入数据 11 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 12 | 13 | # 处理花费时间较长的情况下减少数据 14 | #x_train, t_train = x_train[:5000], t_train[:5000] 15 | #x_test, t_test = x_test[:1000], t_test[:1000] 16 | 17 | max_epochs = 20 18 | 19 | network = SimpleConvNet(input_dim=(1,28,28), 20 | conv_param = {'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1}, 21 | hidden_size=100, output_size=10, weight_init_std=0.01) 22 | 23 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 24 | epochs=max_epochs, mini_batch_size=100, 25 | optimizer='Adam', optimizer_param={'lr': 0.001}, 26 | evaluate_sample_num_per_epoch=1000) 27 | trainer.train() 28 | 29 | # 保存参数 30 | network.save_params("params.pkl") 31 | print("Saved Network Parameters!") 32 | 33 | # 绘制图形 34 | markers = {'train': 'o', 'test': 's'} 35 | x = np.arange(max_epochs) 36 | plt.plot(x, trainer.train_acc_list, marker='o', label='train', markevery=2) 37 | plt.plot(x, trainer.test_acc_list, marker='s', label='test', markevery=2) 38 | plt.xlabel("epochs") 39 | plt.ylabel("accuracy") 40 | plt.ylim(0, 1.0) 41 | plt.legend(loc='lower right') 42 | plt.show() 43 | -------------------------------------------------------------------------------- /ch06/overfit_dropout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from dataset.mnist import load_mnist 8 | from common.multi_layer_net_extend import MultiLayerNetExtend 9 | from common.trainer import Trainer 10 | 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 12 | 13 | # 为了再现过拟合,减少学习数据 14 | x_train = x_train[:2000] 15 | t_train = t_train[:2000] 16 | 17 | # 设定是否使用Dropuout,以及比例 ======================== 18 | use_dropout = True # 不使用Dropout的情况下为False 19 | dropout_ratio = 0.2 20 | # ==================================================== 21 | 22 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], 23 | output_size=10, use_dropout=use_dropout, dropout_ration=dropout_ratio) 24 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 25 | epochs=301, mini_batch_size=100, 26 | optimizer='sgd', optimizer_param={'lr': 0.01}, verbose=True) 27 | trainer.train() 28 | 29 | train_acc_list, test_acc_list = trainer.train_acc_list, trainer.test_acc_list 30 | 31 | # 绘制图形========== 32 | markers = {'train': 'o', 'test': 's'} 33 | x = np.arange(len(train_acc_list)) 34 | plt.plot(x, train_acc_list, marker='o', label='train', markevery=10) 35 | plt.plot(x, test_acc_list, marker='s', label='test', markevery=10) 36 | 
plt.xlabel("epochs") 37 | plt.ylabel("accuracy") 38 | plt.ylim(0, 1.0) 39 | plt.legend(loc='lower right') 40 | plt.show() -------------------------------------------------------------------------------- /ch06/optimizer_compare_naive.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from collections import OrderedDict 7 | from common.optimizer import * 8 | 9 | 10 | def f(x, y): 11 | return x**2 / 20.0 + y**2 12 | 13 | 14 | def df(x, y): 15 | return x / 10.0, 2.0*y 16 | 17 | init_pos = (-7.0, 2.0) 18 | params = {} 19 | params['x'], params['y'] = init_pos[0], init_pos[1] 20 | grads = {} 21 | grads['x'], grads['y'] = 0, 0 22 | 23 | 24 | optimizers = OrderedDict() 25 | optimizers["SGD"] = SGD(lr=0.95) 26 | optimizers["Momentum"] = Momentum(lr=0.1) 27 | optimizers["AdaGrad"] = AdaGrad(lr=1.5) 28 | optimizers["Adam"] = Adam(lr=0.3) 29 | 30 | idx = 1 31 | 32 | for key in optimizers: 33 | optimizer = optimizers[key] 34 | x_history = [] 35 | y_history = [] 36 | params['x'], params['y'] = init_pos[0], init_pos[1] 37 | 38 | for i in range(15): 39 | x_history.append(params['x']) 40 | y_history.append(params['y']) 41 | 42 | grads['x'], grads['y'] = df(params['x'], params['y']) 43 | optimizer.update(params, grads) 44 | 45 | 46 | x = np.arange(-10, 10, 0.01) 47 | y = np.arange(-5, 5, 0.01) 48 | 49 | X, Y = np.meshgrid(x, y) 50 | Z = f(X, Y) 51 | 52 | # for simple contour line 53 | mask = Z > 7 54 | Z[mask] = 0 55 | 56 | # plot 57 | plt.subplot(2, 2, idx) 58 | idx += 1 59 | plt.plot(x_history, y_history, 'o-', color="red") 60 | plt.contour(X, Y, Z) 61 | plt.ylim(-10, 10) 62 | plt.xlim(-10, 10) 63 | plt.plot(0, 0, '+') 64 | #colorbar() 65 | #spring() 66 | plt.title(key) 67 | plt.xlabel("x") 68 | plt.ylabel("y") 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /ch07/apply_filter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from simple_convnet import SimpleConvNet 7 | from matplotlib.image import imread 8 | from common.layers import Convolution 9 | 10 | def filter_show(filters, nx=4, show_num=16): 11 | """ 12 | c.f. 
https://gist.github.com/aidiary/07d530d5e08011832b12#file-draw_weight-py 13 | """ 14 | FN, C, FH, FW = filters.shape 15 | ny = int(np.ceil(show_num / nx)) 16 | 17 | fig = plt.figure() 18 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 19 | 20 | for i in range(show_num): 21 | ax = fig.add_subplot(4, 4, i+1, xticks=[], yticks=[]) 22 | ax.imshow(filters[i, 0], cmap=plt.cm.gray_r, interpolation='nearest') 23 | 24 | 25 | network = SimpleConvNet(input_dim=(1,28,28), 26 | conv_param = {'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1}, 27 | hidden_size=100, output_size=10, weight_init_std=0.01) 28 | 29 | # 学习后的权重 30 | network.load_params("params.pkl") 31 | 32 | filter_show(network.params['W1'], 16) 33 | 34 | img = imread('../dataset/lena_gray.png') 35 | img = img.reshape(1, 1, *img.shape) 36 | 37 | fig = plt.figure() 38 | 39 | w_idx = 1 40 | 41 | for i in range(16): 42 | w = network.params['W1'][i] 43 | b = 0 # network.params['b1'][i] 44 | 45 | w = w.reshape(1, *w.shape) 46 | #b = b.reshape(1, *b.shape) 47 | conv_layer = Convolution(w, b) 48 | out = conv_layer.forward(img) 49 | out = out.reshape(out.shape[2], out.shape[3]) 50 | 51 | ax = fig.add_subplot(4, 4, i+1, xticks=[], yticks=[]) 52 | ax.imshow(out, cmap=plt.cm.gray_r, interpolation='nearest') 53 | 54 | plt.show() -------------------------------------------------------------------------------- /ch08/misclassified_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from deep_convnet import DeepConvNet 7 | from dataset.mnist import load_mnist 8 | 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | network.load_params("deep_convnet_params.pkl") 14 | 15 | print("calculating test accuracy ... 
") 16 | #sampled = 1000 17 | #x_test = x_test[:sampled] 18 | #t_test = t_test[:sampled] 19 | 20 | classified_ids = [] 21 | 22 | acc = 0.0 23 | batch_size = 100 24 | 25 | for i in range(int(x_test.shape[0] / batch_size)): 26 | tx = x_test[i*batch_size:(i+1)*batch_size] 27 | tt = t_test[i*batch_size:(i+1)*batch_size] 28 | y = network.predict(tx, train_flg=False) 29 | y = np.argmax(y, axis=1) 30 | classified_ids.append(y) 31 | acc += np.sum(y == tt) 32 | 33 | acc = acc / x_test.shape[0] 34 | print("test accuracy:" + str(acc)) 35 | 36 | classified_ids = np.array(classified_ids) 37 | classified_ids = classified_ids.flatten() 38 | 39 | max_view = 20 40 | current_view = 1 41 | 42 | fig = plt.figure() 43 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.2, wspace=0.2) 44 | 45 | mis_pairs = {} 46 | for i, val in enumerate(classified_ids == t_test): 47 | if not val: 48 | ax = fig.add_subplot(4, 5, current_view, xticks=[], yticks=[]) 49 | ax.imshow(x_test[i].reshape(28, 28), cmap=plt.cm.gray_r, interpolation='nearest') 50 | mis_pairs[current_view] = (t_test[i], classified_ids[i]) 51 | 52 | current_view += 1 53 | if current_view > max_view: 54 | break 55 | 56 | print("======= misclassified result =======") 57 | print("{view index: (label, inference), ...}") 58 | print(mis_pairs) 59 | 60 | plt.show() 61 | -------------------------------------------------------------------------------- /ch04/gradient_2d.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # cf.http://d.hatena.ne.jp/white_wheels/20100327/p3 3 | import numpy as np 4 | import matplotlib.pylab as plt 5 | from mpl_toolkits.mplot3d import Axes3D 6 | 7 | 8 | def _numerical_gradient_no_batch(f, x): 9 | h = 1e-4 # 0.0001 10 | grad = np.zeros_like(x) 11 | 12 | for idx in range(x.size): 13 | tmp_val = x[idx] 14 | x[idx] = float(tmp_val) + h 15 | fxh1 = f(x) # f(x+h) 16 | 17 | x[idx] = tmp_val - h 18 | fxh2 = f(x) # f(x-h) 19 | grad[idx] = (fxh1 - fxh2) / (2*h) 20 | 21 | x[idx] = tmp_val # 还原值 22 | 23 | return grad 24 | 25 | 26 | def numerical_gradient(f, X): 27 | if X.ndim == 1: 28 | return _numerical_gradient_no_batch(f, X) 29 | else: 30 | grad = np.zeros_like(X) 31 | 32 | for idx, x in enumerate(X): 33 | grad[idx] = _numerical_gradient_no_batch(f, x) 34 | 35 | return grad 36 | 37 | 38 | def function_2(x): 39 | if x.ndim == 1: 40 | return np.sum(x**2) 41 | else: 42 | return np.sum(x**2, axis=1) 43 | 44 | 45 | def tangent_line(f, x): 46 | d = numerical_gradient(f, x) 47 | print(d) 48 | y = f(x) - d*x 49 | return lambda t: d*t + y 50 | 51 | if __name__ == '__main__': 52 | x0 = np.arange(-2, 2.5, 0.25) 53 | x1 = np.arange(-2, 2.5, 0.25) 54 | X, Y = np.meshgrid(x0, x1) 55 | 56 | X = X.flatten() 57 | Y = Y.flatten() 58 | 59 | grad = numerical_gradient(function_2, np.array([X, Y]) ) 60 | 61 | plt.figure() 62 | plt.quiver(X, Y, -grad[0], -grad[1], angles="xy",color="#666666")#,headwidth=10,scale=40,color="#444444") 63 | plt.xlim([-2, 2]) 64 | plt.ylim([-2, 2]) 65 | plt.xlabel('x0') 66 | plt.ylabel('x1') 67 | plt.grid() 68 | plt.legend() 69 | plt.draw() 70 | plt.show() -------------------------------------------------------------------------------- /ch04/train_neuralnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from two_layer_net import TwoLayerNet 8 | 
9 | # 读入数据 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 11 | 12 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 13 | 14 | iters_num = 10000 # 适当设定循环的次数 15 | train_size = x_train.shape[0] 16 | batch_size = 100 17 | learning_rate = 0.1 18 | 19 | train_loss_list = [] 20 | train_acc_list = [] 21 | test_acc_list = [] 22 | 23 | iter_per_epoch = max(train_size / batch_size, 1) 24 | 25 | for i in range(iters_num): 26 | batch_mask = np.random.choice(train_size, batch_size) 27 | x_batch = x_train[batch_mask] 28 | t_batch = t_train[batch_mask] 29 | 30 | # 计算梯度 31 | #grad = network.numerical_gradient(x_batch, t_batch) 32 | grad = network.gradient(x_batch, t_batch) 33 | 34 | # 更新参数 35 | for key in ('W1', 'b1', 'W2', 'b2'): 36 | network.params[key] -= learning_rate * grad[key] 37 | 38 | loss = network.loss(x_batch, t_batch) 39 | train_loss_list.append(loss) 40 | 41 | if i % iter_per_epoch == 0: 42 | train_acc = network.accuracy(x_train, t_train) 43 | test_acc = network.accuracy(x_test, t_test) 44 | train_acc_list.append(train_acc) 45 | test_acc_list.append(test_acc) 46 | print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc)) 47 | 48 | # 绘制图形 49 | markers = {'train': 'o', 'test': 's'} 50 | x = np.arange(len(train_acc_list)) 51 | plt.plot(x, train_acc_list, label='train acc') 52 | plt.plot(x, test_acc_list, label='test acc', linestyle='--') 53 | plt.xlabel("epochs") 54 | plt.ylabel("accuracy") 55 | plt.ylim(0, 1.0) 56 | plt.legend(loc='lower right') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /ch06/weight_init_compare.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from dataset.mnist import load_mnist 9 | from common.util import smooth_curve 10 | from common.multi_layer_net import MultiLayerNet 11 | from common.optimizer import SGD 12 | 13 | 14 | # 0:读入MNIST数据========== 15 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 16 | 17 | train_size = x_train.shape[0] 18 | batch_size = 128 19 | max_iterations = 2000 20 | 21 | 22 | # 1:进行实验的设置========== 23 | weight_init_types = {'std=0.01': 0.01, 'Xavier': 'sigmoid', 'He': 'relu'} 24 | optimizer = SGD(lr=0.01) 25 | 26 | networks = {} 27 | train_loss = {} 28 | for key, weight_type in weight_init_types.items(): 29 | networks[key] = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100], 30 | output_size=10, weight_init_std=weight_type) 31 | train_loss[key] = [] 32 | 33 | 34 | # 2:开始训练========== 35 | for i in range(max_iterations): 36 | batch_mask = np.random.choice(train_size, batch_size) 37 | x_batch = x_train[batch_mask] 38 | t_batch = t_train[batch_mask] 39 | 40 | for key in weight_init_types.keys(): 41 | grads = networks[key].gradient(x_batch, t_batch) 42 | optimizer.update(networks[key].params, grads) 43 | 44 | loss = networks[key].loss(x_batch, t_batch) 45 | train_loss[key].append(loss) 46 | 47 | if i % 100 == 0: 48 | print("===========" + "iteration:" + str(i) + "===========") 49 | for key in weight_init_types.keys(): 50 | loss = networks[key].loss(x_batch, t_batch) 51 | print(key + ":" + str(loss)) 52 | 53 | 54 | # 3.绘制图形========== 55 | markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'} 56 | x = np.arange(max_iterations) 57 | for key in weight_init_types.keys(): 58 | plt.plot(x, 
smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key) 59 | plt.xlabel("iterations") 60 | plt.ylabel("loss") 61 | plt.ylim(0, 2.5) 62 | plt.legend() 63 | plt.show() -------------------------------------------------------------------------------- /ch06/optimizer_compare_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.util import smooth_curve 8 | from common.multi_layer_net import MultiLayerNet 9 | from common.optimizer import * 10 | 11 | 12 | # 0:读入MNIST数据========== 13 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 14 | 15 | train_size = x_train.shape[0] 16 | batch_size = 128 17 | max_iterations = 2000 18 | 19 | 20 | # 1:进行实验的设置========== 21 | optimizers = {} 22 | optimizers['SGD'] = SGD() 23 | optimizers['Momentum'] = Momentum() 24 | optimizers['AdaGrad'] = AdaGrad() 25 | optimizers['Adam'] = Adam() 26 | #optimizers['RMSprop'] = RMSprop() 27 | 28 | networks = {} 29 | train_loss = {} 30 | for key in optimizers.keys(): 31 | networks[key] = MultiLayerNet( 32 | input_size=784, hidden_size_list=[100, 100, 100, 100], 33 | output_size=10) 34 | train_loss[key] = [] 35 | 36 | 37 | # 2:开始训练========== 38 | for i in range(max_iterations): 39 | batch_mask = np.random.choice(train_size, batch_size) 40 | x_batch = x_train[batch_mask] 41 | t_batch = t_train[batch_mask] 42 | 43 | for key in optimizers.keys(): 44 | grads = networks[key].gradient(x_batch, t_batch) 45 | optimizers[key].update(networks[key].params, grads) 46 | 47 | loss = networks[key].loss(x_batch, t_batch) 48 | train_loss[key].append(loss) 49 | 50 | if i % 100 == 0: 51 | print( "===========" + "iteration:" + str(i) + "===========") 52 | for key in optimizers.keys(): 53 | loss = networks[key].loss(x_batch, t_batch) 54 | print(key + ":" + str(loss)) 55 | 56 | 57 | # 3.绘制图形========== 58 | markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D"} 59 | x = np.arange(max_iterations) 60 | for key in optimizers.keys(): 61 | plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key) 62 | plt.xlabel("iterations") 63 | plt.ylabel("loss") 64 | plt.ylim(0, 1) 65 | plt.legend() 66 | plt.show() 67 | -------------------------------------------------------------------------------- /ch06/overfit_weight_decay.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from dataset.mnist import load_mnist 9 | from common.multi_layer_net import MultiLayerNet 10 | from common.optimizer import SGD 11 | 12 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 13 | 14 | # 为了再现过拟合,减少学习数据 15 | x_train = x_train[:300] 16 | t_train = t_train[:300] 17 | 18 | # weight decay(权值衰减)的设定 ======================= 19 | # weight_decay_lambda = 0 # 不使用权值衰减的情况 20 | weight_decay_lambda = 0.1 21 | # ==================================================== 22 | 23 | network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10, 24 | weight_decay_lambda=weight_decay_lambda) 25 | optimizer = SGD(lr=0.01) 26 | 27 | max_epochs = 201 28 | train_size = x_train.shape[0] 29 | batch_size = 100 30 | 31 | train_loss_list = [] 32 | 
train_acc_list = [] 33 | test_acc_list = [] 34 | 35 | iter_per_epoch = max(train_size / batch_size, 1) 36 | epoch_cnt = 0 37 | 38 | for i in range(1000000000): 39 | batch_mask = np.random.choice(train_size, batch_size) 40 | x_batch = x_train[batch_mask] 41 | t_batch = t_train[batch_mask] 42 | 43 | grads = network.gradient(x_batch, t_batch) 44 | optimizer.update(network.params, grads) 45 | 46 | if i % iter_per_epoch == 0: 47 | train_acc = network.accuracy(x_train, t_train) 48 | test_acc = network.accuracy(x_test, t_test) 49 | train_acc_list.append(train_acc) 50 | test_acc_list.append(test_acc) 51 | 52 | print("epoch:" + str(epoch_cnt) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc)) 53 | 54 | epoch_cnt += 1 55 | if epoch_cnt >= max_epochs: 56 | break 57 | 58 | 59 | # 3.绘制图形========== 60 | markers = {'train': 'o', 'test': 's'} 61 | x = np.arange(max_epochs) 62 | plt.plot(x, train_acc_list, marker='o', label='train', markevery=15) 63 | plt.plot(x, test_acc_list, marker='s', label='test', markevery=15) 64 | plt.xlabel("epochs") 65 | plt.ylabel("accuracy") 66 | plt.ylim(0, 1.0) 67 | plt.legend(loc='lower right') 68 | plt.show() -------------------------------------------------------------------------------- /ch04/two_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | from common.functions import * 5 | from common.gradient import numerical_gradient 6 | 7 | 8 | class TwoLayerNet: 9 | 10 | def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01): 11 | # 初始化权重 12 | self.params = {} 13 | self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size) 14 | self.params['b1'] = np.zeros(hidden_size) 15 | self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 16 | self.params['b2'] = np.zeros(output_size) 17 | 18 | def predict(self, x): 19 | W1, W2 = self.params['W1'], self.params['W2'] 20 | b1, b2 = self.params['b1'], self.params['b2'] 21 | 22 | a1 = np.dot(x, W1) + b1 23 | z1 = sigmoid(a1) 24 | a2 = np.dot(z1, W2) + b2 25 | y = softmax(a2) 26 | 27 | return y 28 | 29 | # x:输入数据, t:监督数据 30 | def loss(self, x, t): 31 | y = self.predict(x) 32 | 33 | return cross_entropy_error(y, t) 34 | 35 | def accuracy(self, x, t): 36 | y = self.predict(x) 37 | y = np.argmax(y, axis=1) 38 | t = np.argmax(t, axis=1) 39 | 40 | accuracy = np.sum(y == t) / float(x.shape[0]) 41 | return accuracy 42 | 43 | # x:输入数据, t:监督数据 44 | def numerical_gradient(self, x, t): 45 | loss_W = lambda W: self.loss(x, t) 46 | 47 | grads = {} 48 | grads['W1'] = numerical_gradient(loss_W, self.params['W1']) 49 | grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 50 | grads['W2'] = numerical_gradient(loss_W, self.params['W2']) 51 | grads['b2'] = numerical_gradient(loss_W, self.params['b2']) 52 | 53 | return grads 54 | 55 | def gradient(self, x, t): 56 | W1, W2 = self.params['W1'], self.params['W2'] 57 | b1, b2 = self.params['b1'], self.params['b2'] 58 | grads = {} 59 | 60 | batch_num = x.shape[0] 61 | 62 | # forward 63 | a1 = np.dot(x, W1) + b1 64 | z1 = sigmoid(a1) 65 | a2 = np.dot(z1, W2) + b2 66 | y = softmax(a2) 67 | 68 | # backward 69 | dy = (y - t) / batch_num 70 | grads['W2'] = np.dot(z1.T, dy) 71 | grads['b2'] = np.sum(dy, axis=0) 72 | 73 | da1 = np.dot(dy, W2.T) 74 | dz1 = sigmoid_grad(a1) * da1 75 | grads['W1'] = np.dot(x.T, dz1) 76 | grads['b1'] = np.sum(dz1, axis=0) 77 | 78 | return grads 
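
# ——补充示例(并非书中原代码,只是一个假设性的草图)——
# 上面的 gradient() 用误差反向传播求梯度,其中 dy = (y - t) / batch_num
# 正是 softmax + 交叉熵损失对 a2 的导数。可以像 ch05/gradient_check.py 那样,
# 用数值梯度 numerical_gradient() 验证这一实现(各参数的平均差值应接近 0;
# 数值梯度的计算会比较慢):

if __name__ == '__main__':
    net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
    x = np.random.rand(3, 784)               # 3 个随机输入
    t = np.eye(10)[np.random.choice(10, 3)]  # 3 个 one-hot 标签
    grad_numerical = net.numerical_gradient(x, t)
    grad_backprop = net.gradient(x, t)
    for key in grad_numerical.keys():
        diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
        print(key + ":" + str(diff))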
-------------------------------------------------------------------------------- /ch05/two_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from common.layers import * 6 | from common.gradient import numerical_gradient 7 | from collections import OrderedDict 8 | 9 | 10 | class TwoLayerNet: 11 | 12 | def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01): 13 | # 初始化权重 14 | self.params = {} 15 | self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size) 16 | self.params['b1'] = np.zeros(hidden_size) 17 | self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 18 | self.params['b2'] = np.zeros(output_size) 19 | 20 | # 生成层 21 | self.layers = OrderedDict() 22 | self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1']) 23 | self.layers['Relu1'] = Relu() 24 | self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2']) 25 | 26 | self.lastLayer = SoftmaxWithLoss() 27 | 28 | def predict(self, x): 29 | for layer in self.layers.values(): 30 | x = layer.forward(x) 31 | 32 | return x 33 | 34 | # x:输入数据, t:监督数据 35 | def loss(self, x, t): 36 | y = self.predict(x) 37 | return self.lastLayer.forward(y, t) 38 | 39 | def accuracy(self, x, t): 40 | y = self.predict(x) 41 | y = np.argmax(y, axis=1) 42 | if t.ndim != 1 : t = np.argmax(t, axis=1) 43 | 44 | accuracy = np.sum(y == t) / float(x.shape[0]) 45 | return accuracy 46 | 47 | # x:输入数据, t:监督数据 48 | def numerical_gradient(self, x, t): 49 | loss_W = lambda W: self.loss(x, t) 50 | 51 | grads = {} 52 | grads['W1'] = numerical_gradient(loss_W, self.params['W1']) 53 | grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 54 | grads['W2'] = numerical_gradient(loss_W, self.params['W2']) 55 | grads['b2'] = numerical_gradient(loss_W, self.params['b2']) 56 | 57 | return grads 58 | 59 | def gradient(self, x, t): 60 | # forward 61 | self.loss(x, t) 62 | 63 | # backward 64 | dout = 1 65 | dout = self.lastLayer.backward(dout) 66 | 67 | layers = list(self.layers.values()) 68 | layers.reverse() 69 | for layer in layers: 70 | dout = layer.backward(dout) 71 | 72 | # 设定 73 | grads = {} 74 | grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db 75 | grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db 76 | 77 | return grads 78 | -------------------------------------------------------------------------------- /common/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def smooth_curve(x): 6 | """用于使损失函数的图形变圆滑 7 | 8 | 参考:http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html 9 | """ 10 | window_len = 11 11 | s = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]] 12 | w = np.kaiser(window_len, 2) 13 | y = np.convolve(w/w.sum(), s, mode='valid') 14 | return y[5:len(y)-5] 15 | 16 | 17 | def shuffle_dataset(x, t): 18 | """打乱数据集 19 | 20 | Parameters 21 | ---------- 22 | x : 训练数据 23 | t : 监督数据 24 | 25 | Returns 26 | ------- 27 | x, t : 打乱的训练数据和监督数据 28 | """ 29 | permutation = np.random.permutation(x.shape[0]) 30 | x = x[permutation,:] if x.ndim == 2 else x[permutation,:,:,:] 31 | t = t[permutation] 32 | 33 | return x, t 34 | 35 | def conv_output_size(input_size, filter_size, stride=1, pad=0): 36 | return (input_size + 2*pad - filter_size) / stride + 1 37 | 38 | 39 | def 
im2col(input_data, filter_h, filter_w, stride=1, pad=0): 40 | """ 41 | 42 | Parameters 43 | ---------- 44 | input_data : 由(数据量, 通道, 高, 长)的4维数组构成的输入数据 45 | filter_h : 滤波器的高 46 | filter_w : 滤波器的长 47 | stride : 步幅 48 | pad : 填充 49 | 50 | Returns 51 | ------- 52 | col : 2维数组 53 | """ 54 | N, C, H, W = input_data.shape 55 | out_h = (H + 2*pad - filter_h)//stride + 1 56 | out_w = (W + 2*pad - filter_w)//stride + 1 57 | 58 | img = np.pad(input_data, [(0,0), (0,0), (pad, pad), (pad, pad)], 'constant') 59 | col = np.zeros((N, C, filter_h, filter_w, out_h, out_w)) 60 | 61 | for y in range(filter_h): 62 | y_max = y + stride*out_h 63 | for x in range(filter_w): 64 | x_max = x + stride*out_w 65 | col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride] 66 | 67 | col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1) 68 | return col 69 | 70 | 71 | def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0): 72 | """ 73 | 74 | Parameters 75 | ---------- 76 | col : 77 | input_shape : 输入数据的形状(例:(10, 1, 28, 28)) 78 | filter_h : 79 | filter_w 80 | stride 81 | pad 82 | 83 | Returns 84 | ------- 85 | 86 | """ 87 | N, C, H, W = input_shape 88 | out_h = (H + 2*pad - filter_h)//stride + 1 89 | out_w = (W + 2*pad - filter_w)//stride + 1 90 | col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2) 91 | 92 | img = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1)) 93 | for y in range(filter_h): 94 | y_max = y + stride*out_h 95 | for x in range(filter_w): 96 | x_max = x + stride*out_w 97 | img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :] 98 | 99 | return img[:, :, pad:H + pad, pad:W + pad] -------------------------------------------------------------------------------- /ch06/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.multi_layer_net import MultiLayerNet 8 | from common.util import shuffle_dataset 9 | from common.trainer import Trainer 10 | 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) # 这里的x_test是没有用到 12 | 13 | # 为了实现高速化,减少训练数据 14 | x_train = x_train[:500] 15 | t_train = t_train[:500] 16 | 17 | # 分割验证数据 18 | validation_rate = 0.20 # 验证数据集和测试数据集不一样 19 | validation_num = int(x_train.shape[0] * validation_rate) 20 | x_train, t_train = shuffle_dataset(x_train, t_train) 21 | x_val = x_train[:validation_num] 22 | t_val = t_train[:validation_num] 23 | x_train = x_train[validation_num:] 24 | t_train = t_train[validation_num:] 25 | 26 | 27 | def __train(lr, weight_decay, epocs=50): 28 | network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], 29 | output_size=10, weight_decay_lambda=weight_decay) 30 | trainer = Trainer(network, x_train, t_train, x_val, t_val, 31 | epochs=epocs, mini_batch_size=100, 32 | optimizer='sgd', optimizer_param={'lr': lr}, verbose=False) 33 | trainer.train() 34 | 35 | return trainer.test_acc_list, trainer.train_acc_list 36 | 37 | 38 | # 超参数的随机搜索====================================== 39 | optimization_trial = 100 # 参数100次 40 | results_val = {} 41 | results_train = {} 42 | for _ in range(optimization_trial): 43 | # 指定搜索的超参数的范围=============== 44 | weight_decay = 10 ** np.random.uniform(-8, -4) 45 | lr = 10 ** np.random.uniform(-6, -2) 46 | # ================================================ 47 
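    # 10 ** np.random.uniform(a, b) samples the hyperparameters log-uniformly:
    # weight_decay is spread evenly over 1e-8 ~ 1e-4 and lr over 1e-6 ~ 1e-2,
    # so every order of magnitude gets roughly equal coverage during the search.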
| 48 | val_acc_list, train_acc_list = __train(lr, weight_decay) # 这里每次的训练和测试数据是固定的,也就是变量是lr,和权重衰减。 49 | print("val acc:" + str(val_acc_list[-1]) + " | lr:" + str(lr) + ", weight decay:" + str(weight_decay)) 50 | key = "lr:" + str(lr) + ", weight decay:" + str(weight_decay) 51 | # print('val acc 的内容',val_acc_list) 52 | results_val[key] = val_acc_list 53 | results_train[key] = train_acc_list 54 | 55 | # 绘制图形======================================================== 56 | print("=========== Hyper-Parameter Optimization Result ===========") 57 | graph_draw_num = 20 58 | col_num = 5 59 | row_num = int(np.ceil(graph_draw_num / col_num)) 60 | i = 0 61 | 62 | # acc list里面最后一个值是最后运行 一个loop的acc值,并不一定是最大的, 只是参数执行到最后的值 63 | for key, val_acc_list in sorted(results_val.items(), key=lambda x:x[1][-1], reverse=True): 64 | print("Best-" + str(i+1) + "(val acc:" + str(val_acc_list[-1]) + ") | " + key) 65 | 66 | plt.subplot(row_num, col_num, i+1) 67 | plt.title("Best-" + str(i+1)) 68 | plt.ylim(0.0, 1.0) 69 | if i % 5: plt.yticks([]) 70 | plt.xticks([]) 71 | x = np.arange(len(val_acc_list)) 72 | plt.plot(x, val_acc_list) 73 | plt.plot(x, results_train[key], "--") 74 | i += 1 75 | 76 | if i >= graph_draw_num: 77 | break 78 | 79 | plt.show() 80 | -------------------------------------------------------------------------------- /ch06/batch_norm_test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.multi_layer_net_extend import MultiLayerNetExtend 8 | from common.optimizer import SGD, Adam 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 11 | 12 | # 减少学习数据 13 | x_train = x_train[:1000] 14 | t_train = t_train[:1000] 15 | 16 | max_epochs = 20 17 | train_size = x_train.shape[0] 18 | batch_size = 100 19 | learning_rate = 0.01 20 | 21 | 22 | def __train(weight_init_std): 23 | bn_network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10, 24 | weight_init_std=weight_init_std, use_batchnorm=True) 25 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10, 26 | weight_init_std=weight_init_std) 27 | optimizer = SGD(lr=learning_rate) 28 | 29 | train_acc_list = [] 30 | bn_train_acc_list = [] 31 | 32 | iter_per_epoch = max(train_size / batch_size, 1) 33 | epoch_cnt = 0 34 | 35 | for i in range(1000000000): 36 | batch_mask = np.random.choice(train_size, batch_size) 37 | x_batch = x_train[batch_mask] 38 | t_batch = t_train[batch_mask] 39 | 40 | for _network in (bn_network, network): 41 | grads = _network.gradient(x_batch, t_batch) 42 | optimizer.update(_network.params, grads) 43 | 44 | # 只是为了显示作用,实际可以不用? 
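        # The block below runs once per epoch and only records training accuracy
        # for the plot; it does not affect the parameter updates. Both networks were
        # updated on the same mini-batch above, so their curves are directly comparable.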
45 | if i % iter_per_epoch == 0: 46 | train_acc = network.accuracy(x_train, t_train) 47 | bn_train_acc = bn_network.accuracy(x_train, t_train) 48 | train_acc_list.append(train_acc) 49 | bn_train_acc_list.append(bn_train_acc) 50 | 51 | print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc)) 52 | 53 | epoch_cnt += 1 54 | if epoch_cnt >= max_epochs: 55 | break 56 | 57 | return train_acc_list, bn_train_acc_list 58 | 59 | 60 | # 3.绘制图形========== 61 | weight_scale_list = np.logspace(0, -4, num=16) 62 | x = np.arange(max_epochs) 63 | 64 | for i, w in enumerate(weight_scale_list): 65 | print( "============== " + str(i+1) + "/16" + " ==============") 66 | train_acc_list, bn_train_acc_list = __train(w) 67 | 68 | plt.subplot(4,4,i+1) 69 | plt.title("W:" + str(w)) 70 | if i == 15: 71 | plt.plot(x, bn_train_acc_list, label='Batch Normalization', markevery=2) 72 | plt.plot(x, train_acc_list, linestyle = "--", label='Normal(without BatchNorm)', markevery=2) 73 | else: 74 | plt.plot(x, bn_train_acc_list, markevery=2) 75 | plt.plot(x, train_acc_list, linestyle="--", markevery=2) 76 | 77 | plt.ylim(0, 1.0) 78 | if i % 4: 79 | plt.yticks([]) 80 | else: 81 | plt.ylabel("accuracy") 82 | if i < 12: 83 | plt.xticks([]) 84 | else: 85 | plt.xlabel("epochs") 86 | plt.legend(loc='lower right') 87 | 88 | plt.show() -------------------------------------------------------------------------------- /common/trainer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from common.optimizer import * 6 | 7 | class Trainer: 8 | """进行神经网络的训练的类 9 | """ 10 | def __init__(self, network, x_train, t_train, x_test, t_test, 11 | epochs=20, mini_batch_size=100, 12 | optimizer='SGD', optimizer_param={'lr':0.01}, 13 | evaluate_sample_num_per_epoch=None, verbose=True): 14 | self.network = network 15 | self.verbose = verbose 16 | self.x_train = x_train 17 | self.t_train = t_train 18 | self.x_test = x_test 19 | self.t_test = t_test 20 | self.epochs = epochs 21 | self.batch_size = mini_batch_size 22 | self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch 23 | 24 | # optimzer 25 | optimizer_class_dict = {'sgd':SGD, 'momentum':Momentum, 'nesterov':Nesterov, 26 | 'adagrad':AdaGrad, 'rmsprpo':RMSprop, 'adam':Adam} 27 | self.optimizer = optimizer_class_dict[optimizer.lower()](**optimizer_param) 28 | 29 | self.train_size = x_train.shape[0] 30 | self.iter_per_epoch = max(self.train_size / mini_batch_size, 1) 31 | self.max_iter = int(epochs * self.iter_per_epoch) 32 | self.current_iter = 0 33 | self.current_epoch = 0 34 | 35 | self.train_loss_list = [] 36 | self.train_acc_list = [] 37 | self.test_acc_list = [] 38 | 39 | def train_step(self): 40 | batch_mask = np.random.choice(self.train_size, self.batch_size) 41 | x_batch = self.x_train[batch_mask] 42 | t_batch = self.t_train[batch_mask] 43 | 44 | grads = self.network.gradient(x_batch, t_batch) 45 | self.optimizer.update(self.network.params, grads) 46 | 47 | loss = self.network.loss(x_batch, t_batch) 48 | self.train_loss_list.append(loss) 49 | if self.verbose: print("train loss:" + str(loss)) 50 | 51 | if self.current_iter % self.iter_per_epoch == 0: 52 | self.current_epoch += 1 53 | 54 | x_train_sample, t_train_sample = self.x_train, self.t_train 55 | x_test_sample, t_test_sample = self.x_test, self.t_test 56 | if not self.evaluate_sample_num_per_epoch is None: 57 | t = self.evaluate_sample_num_per_epoch 58 | 
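                # When evaluate_sample_num_per_epoch is set, only the first t samples are
                # used for the per-epoch accuracy check, which keeps evaluation cheap on
                # large datasets; otherwise the full train/test sets assigned above are used.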
x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t] 59 | x_test_sample, t_test_sample = self.x_test[:t], self.t_test[:t] 60 | 61 | train_acc = self.network.accuracy(x_train_sample, t_train_sample) 62 | test_acc = self.network.accuracy(x_test_sample, t_test_sample) 63 | self.train_acc_list.append(train_acc) 64 | self.test_acc_list.append(test_acc) 65 | 66 | if self.verbose: print("=== epoch:" + str(self.current_epoch) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc) + " ===") 67 | self.current_iter += 1 68 | 69 | def train(self): 70 | for i in range(self.max_iter): 71 | self.train_step() # 不断更新权重值W 72 | 73 | test_acc = self.network.accuracy(self.x_test, self.t_test) 74 | 75 | if self.verbose: 76 | print("=============== Final Test Accuracy ===============") 77 | print("test acc:" + str(test_acc)) 78 | 79 | -------------------------------------------------------------------------------- /dataset/mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | try: 3 | import urllib.request 4 | except ImportError: 5 | raise ImportError('You should use Python 3.x') 6 | import os.path 7 | import gzip 8 | import pickle 9 | import os 10 | import numpy as np 11 | 12 | 13 | url_base = 'http://yann.lecun.com/exdb/mnist/' 14 | key_file = { 15 | 'train_img':'train-images-idx3-ubyte.gz', 16 | 'train_label':'train-labels-idx1-ubyte.gz', 17 | 'test_img':'t10k-images-idx3-ubyte.gz', 18 | 'test_label':'t10k-labels-idx1-ubyte.gz' 19 | } 20 | 21 | dataset_dir = os.path.dirname(os.path.abspath(__file__)) 22 | save_file = dataset_dir + "/mnist.pkl" 23 | 24 | train_num = 60000 25 | test_num = 10000 26 | img_dim = (1, 28, 28) 27 | img_size = 784 28 | 29 | 30 | def _download(file_name): 31 | file_path = dataset_dir + "/" + file_name 32 | 33 | if os.path.exists(file_path): 34 | return 35 | 36 | print("Downloading " + file_name + " ... 
") 37 | urllib.request.urlretrieve(url_base + file_name, file_path) 38 | print("Done") 39 | 40 | def download_mnist(): 41 | for v in key_file.values(): 42 | _download(v) 43 | 44 | def _load_label(file_name): 45 | file_path = dataset_dir + "/" + file_name 46 | 47 | print("Converting " + file_name + " to NumPy Array ...") 48 | with gzip.open(file_path, 'rb') as f: 49 | labels = np.frombuffer(f.read(), np.uint8, offset=8) 50 | print("Done") 51 | 52 | return labels 53 | 54 | def _load_img(file_name): 55 | file_path = dataset_dir + "/" + file_name 56 | 57 | print("Converting " + file_name + " to NumPy Array ...") 58 | with gzip.open(file_path, 'rb') as f: 59 | data = np.frombuffer(f.read(), np.uint8, offset=16) 60 | data = data.reshape(-1, img_size) 61 | print("Done") 62 | 63 | return data 64 | 65 | def _convert_numpy(): 66 | dataset = {} 67 | dataset['train_img'] = _load_img(key_file['train_img']) 68 | dataset['train_label'] = _load_label(key_file['train_label']) 69 | dataset['test_img'] = _load_img(key_file['test_img']) 70 | dataset['test_label'] = _load_label(key_file['test_label']) 71 | 72 | return dataset 73 | 74 | def init_mnist(): 75 | download_mnist() 76 | dataset = _convert_numpy() 77 | print("Creating pickle file ...") 78 | with open(save_file, 'wb') as f: 79 | pickle.dump(dataset, f, -1) 80 | print("Done!") 81 | 82 | def _change_one_hot_label(X): 83 | T = np.zeros((X.size, 10)) 84 | for idx, row in enumerate(T): 85 | row[X[idx]] = 1 86 | 87 | return T 88 | 89 | 90 | def load_mnist(normalize=True, flatten=True, one_hot_label=False): 91 | """读入MNIST数据集 92 | 93 | Parameters 94 | ---------- 95 | normalize : 将图像的像素值正规化为0.0~1.0 96 | one_hot_label : 97 | one_hot_label为True的情况下,标签作为one-hot数组返回 98 | one-hot数组是指[0,0,1,0,0,0,0,0,0,0]这样的数组 99 | flatten : 是否将图像展开为一维数组 100 | 101 | Returns 102 | ------- 103 | (训练图像, 训练标签), (测试图像, 测试标签) 104 | """ 105 | if not os.path.exists(save_file): 106 | init_mnist() 107 | 108 | with open(save_file, 'rb') as f: 109 | dataset = pickle.load(f) 110 | 111 | if normalize: 112 | for key in ('train_img', 'test_img'): 113 | dataset[key] = dataset[key].astype(np.float32) 114 | dataset[key] /= 255.0 115 | 116 | if one_hot_label: 117 | dataset['train_label'] = _change_one_hot_label(dataset['train_label']) 118 | dataset['test_label'] = _change_one_hot_label(dataset['test_label']) 119 | 120 | if not flatten: 121 | for key in ('train_img', 'test_img'): 122 | dataset[key] = dataset[key].reshape(-1, 1, 28, 28) 123 | 124 | return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label']) 125 | 126 | 127 | if __name__ == '__main__': 128 | init_mnist() 129 | -------------------------------------------------------------------------------- /common/optimizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | class SGD: 5 | 6 | """随机梯度下降法(Stochastic Gradient Descent)""" 7 | 8 | def __init__(self, lr=0.01): 9 | self.lr = lr 10 | 11 | def update(self, params, grads): 12 | for key in params.keys(): 13 | params[key] -= self.lr * grads[key] 14 | 15 | 16 | class Momentum: 17 | 18 | """Momentum SGD""" 19 | 20 | def __init__(self, lr=0.01, momentum=0.9): 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.v = None 24 | 25 | def update(self, params, grads): 26 | if self.v is None: 27 | self.v = {} 28 | for key, val in params.items(): 29 | self.v[key] = np.zeros_like(val) 30 | 31 | for key in params.keys(): 32 | self.v[key] = self.momentum*self.v[key] - self.lr*grads[key] 
33 | params[key] += self.v[key] 34 | 35 | 36 | class Nesterov: 37 | 38 | """Nesterov's Accelerated Gradient (http://arxiv.org/abs/1212.0901)""" 39 | 40 | def __init__(self, lr=0.01, momentum=0.9): 41 | self.lr = lr 42 | self.momentum = momentum 43 | self.v = None 44 | 45 | def update(self, params, grads): 46 | if self.v is None: 47 | self.v = {} 48 | for key, val in params.items(): 49 | self.v[key] = np.zeros_like(val) 50 | 51 | for key in params.keys(): 52 | self.v[key] *= self.momentum 53 | self.v[key] -= self.lr * grads[key] 54 | params[key] += self.momentum * self.momentum * self.v[key] 55 | params[key] -= (1 + self.momentum) * self.lr * grads[key] 56 | 57 | 58 | class AdaGrad: 59 | 60 | """AdaGrad""" 61 | 62 | def __init__(self, lr=0.01): 63 | self.lr = lr 64 | self.h = None 65 | 66 | def update(self, params, grads): 67 | if self.h is None: 68 | self.h = {} 69 | for key, val in params.items(): 70 | self.h[key] = np.zeros_like(val) 71 | 72 | for key in params.keys(): 73 | self.h[key] += grads[key] * grads[key] 74 | params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7) 75 | 76 | 77 | class RMSprop: 78 | 79 | """RMSprop""" 80 | 81 | def __init__(self, lr=0.01, decay_rate = 0.99): 82 | self.lr = lr 83 | self.decay_rate = decay_rate 84 | self.h = None 85 | 86 | def update(self, params, grads): 87 | if self.h is None: 88 | self.h = {} 89 | for key, val in params.items(): 90 | self.h[key] = np.zeros_like(val) 91 | 92 | for key in params.keys(): 93 | self.h[key] *= self.decay_rate 94 | self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key] 95 | params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7) 96 | 97 | 98 | class Adam: 99 | 100 | """Adam (http://arxiv.org/abs/1412.6980v8)""" 101 | 102 | def __init__(self, lr=0.001, beta1=0.9, beta2=0.999): 103 | self.lr = lr 104 | self.beta1 = beta1 105 | self.beta2 = beta2 106 | self.iter = 0 107 | self.m = None 108 | self.v = None 109 | 110 | def update(self, params, grads): 111 | if self.m is None: 112 | self.m, self.v = {}, {} 113 | for key, val in params.items(): 114 | self.m[key] = np.zeros_like(val) 115 | self.v[key] = np.zeros_like(val) 116 | 117 | self.iter += 1 118 | lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter) 119 | 120 | for key in params.keys(): 121 | #self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key] 122 | #self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2) 123 | self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key]) 124 | self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key]) 125 | 126 | params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7) 127 | 128 | #unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias 129 | #unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias 130 | #params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7) 131 | -------------------------------------------------------------------------------- /common/multi_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from collections import OrderedDict 6 | from common.layers import * 7 | from common.gradient import numerical_gradient 8 | 9 | 10 | class MultiLayerNet: 11 | """全连接的多层神经网络 12 | 13 | Parameters 14 | ---------- 15 | input_size : 输入大小(MNIST的情况下为784) 16 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. 
[100, 100, 100]) 17 | output_size : 输出大小(MNIST的情况下为10) 18 | activation : 'relu' or 'sigmoid' 19 | weight_init_std : 指定权重的标准差(e.g. 0.01) 20 | 指定'relu'或'he'的情况下设定“He的初始值” 21 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 22 | weight_decay_lambda : Weight Decay(L2范数)的强度 23 | """ 24 | def __init__(self, input_size, hidden_size_list, output_size, 25 | activation='relu', weight_init_std='relu', weight_decay_lambda=0): 26 | self.input_size = input_size 27 | self.output_size = output_size 28 | self.hidden_size_list = hidden_size_list 29 | self.hidden_layer_num = len(hidden_size_list) 30 | self.weight_decay_lambda = weight_decay_lambda 31 | self.params = {} 32 | 33 | # 初始化权重 34 | self.__init_weight(weight_init_std) 35 | 36 | # 生成层 37 | activation_layer = {'sigmoid': Sigmoid, 'relu': Relu} 38 | self.layers = OrderedDict() 39 | for idx in range(1, self.hidden_layer_num+1): # 每一层都是差不多的, 40 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 41 | self.params['b' + str(idx)]) 42 | self.layers['Activation_function' + str(idx)] = activation_layer[activation]() 43 | 44 | idx = self.hidden_layer_num + 1 # 输出层 45 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 46 | self.params['b' + str(idx)]) 47 | 48 | self.last_layer = SoftmaxWithLoss() 49 | 50 | def __init_weight(self, weight_init_std): 51 | """设定权重的初始值 52 | 53 | Parameters 54 | ---------- 55 | weight_init_std : 指定权重的标准差(e.g. 0.01) 56 | 指定'relu'或'he'的情况下设定“He的初始值” 57 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 58 | """ 59 | all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size] 60 | for idx in range(1, len(all_size_list)): 61 | scale = weight_init_std 62 | if str(weight_init_std).lower() in ('relu', 'he'): 63 | scale = np.sqrt(2.0 / all_size_list[idx - 1]) # 使用ReLU的情况下推荐的初始值 64 | elif str(weight_init_std).lower() in ('sigmoid', 'xavier'): 65 | scale = np.sqrt(1.0 / all_size_list[idx - 1]) # 使用sigmoid的情况下推荐的初始值 66 | 67 | self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx]) 68 | self.params['b' + str(idx)] = np.zeros(all_size_list[idx]) 69 | 70 | def predict(self, x): 71 | for layer in self.layers.values(): 72 | x = layer.forward(x) 73 | 74 | return x 75 | 76 | def loss(self, x, t): 77 | """求损失函数 78 | 79 | Parameters 80 | ---------- 81 | x : 输入数据 82 | t : 教师标签 83 | 84 | Returns 85 | ------- 86 | 损失函数的值 87 | """ 88 | y = self.predict(x) 89 | 90 | weight_decay = 0 91 | for idx in range(1, self.hidden_layer_num + 2): 92 | W = self.params['W' + str(idx)] 93 | weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2) 94 | 95 | return self.last_layer.forward(y, t) + weight_decay 96 | 97 | def accuracy(self, x, t): 98 | y = self.predict(x) 99 | y = np.argmax(y, axis=1) 100 | if t.ndim != 1 : t = np.argmax(t, axis=1) 101 | 102 | accuracy = np.sum(y == t) / float(x.shape[0]) 103 | return accuracy 104 | 105 | def numerical_gradient(self, x, t): 106 | """求梯度(数值微分) 107 | 108 | Parameters 109 | ---------- 110 | x : 输入数据 111 | t : 教师标签 112 | 113 | Returns 114 | ------- 115 | 具有各层的梯度的字典变量 116 | grads['W1']、grads['W2']、...是各层的权重 117 | grads['b1']、grads['b2']、...是各层的偏置 118 | """ 119 | loss_W = lambda W: self.loss(x, t) 120 | 121 | grads = {} 122 | for idx in range(1, self.hidden_layer_num+2): 123 | grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)]) 124 | grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)]) 125 | 126 | return grads 127 | 128 | def gradient(self, x, t): 129 | """求梯度(误差反向传播法) 130 | 131 | 
Parameters 132 | ---------- 133 | x : 输入数据 134 | t : 教师标签 135 | 136 | Returns 137 | ------- 138 | 具有各层的梯度的字典变量 139 | grads['W1']、grads['W2']、...是各层的权重 140 | grads['b1']、grads['b2']、...是各层的偏置 141 | """ 142 | # forward 143 | self.loss(x, t) 144 | 145 | # backward 146 | dout = 1 147 | dout = self.last_layer.backward(dout) 148 | 149 | layers = list(self.layers.values()) 150 | layers.reverse() 151 | for layer in layers: 152 | dout = layer.backward(dout) 153 | 154 | # 设定 155 | grads = {} 156 | for idx in range(1, self.hidden_layer_num+2): 157 | grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.layers['Affine' + str(idx)].W 158 | grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db 159 | 160 | return grads 161 | -------------------------------------------------------------------------------- /ch07/simple_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import pickle 5 | import numpy as np 6 | from collections import OrderedDict 7 | from common.layers import * 8 | from common.gradient import numerical_gradient 9 | 10 | 11 | class SimpleConvNet: 12 | """简单的ConvNet 13 | 14 | conv - relu - pool - affine - relu - affine - softmax 15 | 16 | Parameters 17 | ---------- 18 | input_size : 输入大小(MNIST的情况下为784) 19 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. [100, 100, 100]) 20 | output_size : 输出大小(MNIST的情况下为10) 21 | activation : 'relu' or 'sigmoid' 22 | weight_init_std : 指定权重的标准差(e.g. 0.01) 23 | 指定'relu'或'he'的情况下设定“He的初始值” 24 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 25 | """ 26 | def __init__(self, input_dim=(1, 28, 28), 27 | conv_param={'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1}, 28 | hidden_size=100, output_size=10, weight_init_std=0.01): 29 | filter_num = conv_param['filter_num'] 30 | filter_size = conv_param['filter_size'] 31 | filter_pad = conv_param['pad'] 32 | filter_stride = conv_param['stride'] 33 | input_size = input_dim[1] 34 | conv_output_size = (input_size - filter_size + 2*filter_pad) / filter_stride + 1 35 | pool_output_size = int(filter_num * (conv_output_size/2) * (conv_output_size/2)) 36 | 37 | # 初始化权重 38 | self.params = {} 39 | self.params['W1'] = weight_init_std * \ 40 | np.random.randn(filter_num, input_dim[0], filter_size, filter_size) 41 | self.params['b1'] = np.zeros(filter_num) 42 | self.params['W2'] = weight_init_std * \ 43 | np.random.randn(pool_output_size, hidden_size) 44 | self.params['b2'] = np.zeros(hidden_size) 45 | self.params['W3'] = weight_init_std * \ 46 | np.random.randn(hidden_size, output_size) 47 | self.params['b3'] = np.zeros(output_size) 48 | 49 | # 生成层 50 | self.layers = OrderedDict() 51 | self.layers['Conv1'] = Convolution(self.params['W1'], self.params['b1'], 52 | conv_param['stride'], conv_param['pad']) 53 | self.layers['Relu1'] = Relu() 54 | self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2) 55 | self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2']) 56 | self.layers['Relu2'] = Relu() 57 | self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3']) 58 | 59 | self.last_layer = SoftmaxWithLoss() 60 | 61 | def predict(self, x): 62 | for layer in self.layers.values(): 63 | x = layer.forward(x) 64 | 65 | return x 66 | 67 | def loss(self, x, t): 68 | """求损失函数 69 | 参数x是输入数据、t是教师标签 70 | """ 71 | y = self.predict(x) 72 | return self.last_layer.forward(y, t) 73 | 74 | def accuracy(self, x, t, batch_size=100): 75 | if t.ndim != 1 : t = 
np.argmax(t, axis=1) 76 | 77 | acc = 0.0 78 | 79 | for i in range(int(x.shape[0] / batch_size)): 80 | tx = x[i*batch_size:(i+1)*batch_size] 81 | tt = t[i*batch_size:(i+1)*batch_size] 82 | y = self.predict(tx) 83 | y = np.argmax(y, axis=1) 84 | acc += np.sum(y == tt) 85 | 86 | return acc / x.shape[0] 87 | 88 | def numerical_gradient(self, x, t): 89 | """求梯度(数值微分) 90 | 91 | Parameters 92 | ---------- 93 | x : 输入数据 94 | t : 教师标签 95 | 96 | Returns 97 | ------- 98 | 具有各层的梯度的字典变量 99 | grads['W1']、grads['W2']、...是各层的权重 100 | grads['b1']、grads['b2']、...是各层的偏置 101 | """ 102 | loss_w = lambda w: self.loss(x, t) 103 | 104 | grads = {} 105 | for idx in (1, 2, 3): 106 | grads['W' + str(idx)] = numerical_gradient(loss_w, self.params['W' + str(idx)]) 107 | grads['b' + str(idx)] = numerical_gradient(loss_w, self.params['b' + str(idx)]) 108 | 109 | return grads 110 | 111 | def gradient(self, x, t): 112 | """求梯度(误差反向传播法) 113 | 114 | Parameters 115 | ---------- 116 | x : 输入数据 117 | t : 教师标签 118 | 119 | Returns 120 | ------- 121 | 具有各层的梯度的字典变量 122 | grads['W1']、grads['W2']、...是各层的权重 123 | grads['b1']、grads['b2']、...是各层的偏置 124 | """ 125 | # forward 126 | self.loss(x, t) 127 | 128 | # backward 129 | dout = 1 130 | dout = self.last_layer.backward(dout) 131 | 132 | layers = list(self.layers.values()) 133 | layers.reverse() 134 | for layer in layers: 135 | dout = layer.backward(dout) 136 | 137 | # 设定 138 | grads = {} 139 | grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers['Conv1'].db 140 | grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers['Affine1'].db 141 | grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers['Affine2'].db 142 | 143 | return grads 144 | 145 | def save_params(self, file_name="params.pkl"): 146 | params = {} 147 | for key, val in self.params.items(): 148 | params[key] = val 149 | with open(file_name, 'wb') as f: 150 | pickle.dump(params, f) 151 | 152 | def load_params(self, file_name="params.pkl"): 153 | with open(file_name, 'rb') as f: 154 | params = pickle.load(f) 155 | for key, val in params.items(): 156 | self.params[key] = val 157 | 158 | for i, key in enumerate(['Conv1', 'Affine1', 'Affine2']): 159 | self.layers[key].W = self.params['W' + str(i+1)] 160 | self.layers[key].b = self.params['b' + str(i+1)] -------------------------------------------------------------------------------- /ch08/deep_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import pickle 5 | import numpy as np 6 | from collections import OrderedDict 7 | from common.layers import * 8 | 9 | 10 | class DeepConvNet: 11 | """识别率为99%以上的高精度的ConvNet 12 | 13 | 网络结构如下所示 14 | conv - relu - conv- relu - pool - 15 | conv - relu - conv- relu - pool - 16 | conv - relu - conv- relu - pool - 17 | affine - relu - dropout - affine - dropout - softmax 18 | """ 19 | def __init__(self, input_dim=(1, 28, 28), 20 | conv_param_1 = {'filter_num':16, 'filter_size':3, 'pad':1, 'stride':1}, 21 | conv_param_2 = {'filter_num':16, 'filter_size':3, 'pad':1, 'stride':1}, 22 | conv_param_3 = {'filter_num':32, 'filter_size':3, 'pad':1, 'stride':1}, 23 | conv_param_4 = {'filter_num':32, 'filter_size':3, 'pad':2, 'stride':1}, 24 | conv_param_5 = {'filter_num':64, 'filter_size':3, 'pad':1, 'stride':1}, 25 | conv_param_6 = {'filter_num':64, 'filter_size':3, 'pad':1, 'stride':1}, 26 | hidden_size=50, output_size=10): 27 | # 初始化权重=========== 28 | # 各层的神经元平均与前一层的几个神经元有连接(TODO:自动计算) 29 | 
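        # Fan-in of each layer: a conv neuron sees in_channels*3*3 inputs
        # (1*3*3 for conv1, 16*3*3 for conv2, ...), the first affine layer sees the
        # flattened 64-channel 4x4 pooling output (64*4*4), and the last sees hidden_size.
        # He initialization below scales each weight matrix by sqrt(2 / fan_in).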
pre_node_nums = np.array([1*3*3, 16*3*3, 16*3*3, 32*3*3, 32*3*3, 64*3*3, 64*4*4, hidden_size]) 30 | wight_init_scales = np.sqrt(2.0 / pre_node_nums) # 使用ReLU的情况下推荐的初始值 31 | 32 | self.params = {} 33 | pre_channel_num = input_dim[0] 34 | for idx, conv_param in enumerate([conv_param_1, conv_param_2, conv_param_3, conv_param_4, conv_param_5, conv_param_6]): 35 | self.params['W' + str(idx+1)] = wight_init_scales[idx] * np.random.randn(conv_param['filter_num'], pre_channel_num, conv_param['filter_size'], conv_param['filter_size']) 36 | self.params['b' + str(idx+1)] = np.zeros(conv_param['filter_num']) 37 | pre_channel_num = conv_param['filter_num'] 38 | self.params['W7'] = wight_init_scales[6] * np.random.randn(64*4*4, hidden_size) 39 | self.params['b7'] = np.zeros(hidden_size) 40 | self.params['W8'] = wight_init_scales[7] * np.random.randn(hidden_size, output_size) 41 | self.params['b8'] = np.zeros(output_size) 42 | 43 | # 生成层=========== 44 | self.layers = [] 45 | self.layers.append(Convolution(self.params['W1'], self.params['b1'], 46 | conv_param_1['stride'], conv_param_1['pad'])) 47 | self.layers.append(Relu()) 48 | self.layers.append(Convolution(self.params['W2'], self.params['b2'], 49 | conv_param_2['stride'], conv_param_2['pad'])) 50 | self.layers.append(Relu()) 51 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 52 | self.layers.append(Convolution(self.params['W3'], self.params['b3'], 53 | conv_param_3['stride'], conv_param_3['pad'])) 54 | self.layers.append(Relu()) 55 | self.layers.append(Convolution(self.params['W4'], self.params['b4'], 56 | conv_param_4['stride'], conv_param_4['pad'])) 57 | self.layers.append(Relu()) 58 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 59 | self.layers.append(Convolution(self.params['W5'], self.params['b5'], 60 | conv_param_5['stride'], conv_param_5['pad'])) 61 | self.layers.append(Relu()) 62 | self.layers.append(Convolution(self.params['W6'], self.params['b6'], 63 | conv_param_6['stride'], conv_param_6['pad'])) 64 | self.layers.append(Relu()) 65 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 66 | self.layers.append(Affine(self.params['W7'], self.params['b7'])) 67 | self.layers.append(Relu()) 68 | self.layers.append(Dropout(0.5)) 69 | self.layers.append(Affine(self.params['W8'], self.params['b8'])) 70 | self.layers.append(Dropout(0.5)) 71 | 72 | self.last_layer = SoftmaxWithLoss() 73 | 74 | def predict(self, x, train_flg=False): 75 | for layer in self.layers: 76 | if isinstance(layer, Dropout): 77 | x = layer.forward(x, train_flg) 78 | else: 79 | x = layer.forward(x) 80 | return x 81 | 82 | def loss(self, x, t): 83 | y = self.predict(x, train_flg=True) 84 | return self.last_layer.forward(y, t) 85 | 86 | def accuracy(self, x, t, batch_size=100): 87 | if t.ndim != 1 : t = np.argmax(t, axis=1) 88 | 89 | acc = 0.0 90 | 91 | for i in range(int(x.shape[0] / batch_size)): 92 | tx = x[i*batch_size:(i+1)*batch_size] 93 | tt = t[i*batch_size:(i+1)*batch_size] 94 | y = self.predict(tx, train_flg=False) 95 | y = np.argmax(y, axis=1) 96 | acc += np.sum(y == tt) 97 | 98 | return acc / x.shape[0] 99 | 100 | def gradient(self, x, t): 101 | # forward 102 | self.loss(x, t) 103 | 104 | # backward 105 | dout = 1 106 | dout = self.last_layer.backward(dout) 107 | 108 | tmp_layers = self.layers.copy() 109 | tmp_layers.reverse() 110 | for layer in tmp_layers: 111 | dout = layer.backward(dout) 112 | 113 | # 设定 114 | grads = {} 115 | for i, layer_idx in enumerate((0, 2, 5, 7, 10, 12, 15, 18)): 116 | grads['W' + str(i+1)] = 
self.layers[layer_idx].dW 117 | grads['b' + str(i+1)] = self.layers[layer_idx].db 118 | 119 | return grads 120 | 121 | def save_params(self, file_name="params.pkl"): 122 | params = {} 123 | for key, val in self.params.items(): 124 | params[key] = val 125 | with open(file_name, 'wb') as f: 126 | pickle.dump(params, f) 127 | 128 | def load_params(self, file_name="params.pkl"): 129 | with open(file_name, 'rb') as f: 130 | params = pickle.load(f) 131 | for key, val in params.items(): 132 | self.params[key] = val 133 | 134 | for i, layer_idx in enumerate((0, 2, 5, 7, 10, 12, 15, 18)): 135 | self.layers[layer_idx].W = self.params['W' + str(i+1)] 136 | self.layers[layer_idx].b = self.params['b' + str(i+1)] 137 | -------------------------------------------------------------------------------- /common/multi_layer_net_extend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from collections import OrderedDict 6 | from common.layers import * 7 | from common.gradient import numerical_gradient 8 | 9 | class MultiLayerNetExtend: 10 | """扩展版的全连接的多层神经网络 11 | 12 | 具有Weiht Decay、Dropout、Batch Normalization的功能 13 | 14 | Parameters 15 | ---------- 16 | input_size : 输入大小(MNIST的情况下为784) 17 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. [100, 100, 100]) 18 | output_size : 输出大小(MNIST的情况下为10) 19 | activation : 'relu' or 'sigmoid' 20 | weight_init_std : 指定权重的标准差(e.g. 0.01) 21 | 指定'relu'或'he'的情况下设定“He的初始值” 22 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 23 | weight_decay_lambda : Weight Decay(L2范数)的强度 24 | use_dropout: 是否使用Dropout 25 | dropout_ration : Dropout的比例 26 | use_batchNorm: 是否使用Batch Normalization 27 | """ 28 | def __init__(self, input_size, hidden_size_list, output_size, 29 | activation='relu', weight_init_std='relu', weight_decay_lambda=0, 30 | use_dropout = False, dropout_ration = 0.5, use_batchnorm=False): 31 | self.input_size = input_size 32 | self.output_size = output_size 33 | self.hidden_size_list = hidden_size_list 34 | self.hidden_layer_num = len(hidden_size_list) 35 | self.use_dropout = use_dropout 36 | self.weight_decay_lambda = weight_decay_lambda 37 | self.use_batchnorm = use_batchnorm 38 | self.params = {} 39 | 40 | # 初始化权重 41 | self.__init_weight(weight_init_std) 42 | 43 | # 生成层 44 | activation_layer = {'sigmoid': Sigmoid, 'relu': Relu} 45 | self.layers = OrderedDict() 46 | for idx in range(1, self.hidden_layer_num+1): 47 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 48 | self.params['b' + str(idx)]) 49 | if self.use_batchnorm: 50 | self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1]) 51 | self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx-1]) 52 | self.layers['BatchNorm' + str(idx)] = BatchNormalization(self.params['gamma' + str(idx)], self.params['beta' + str(idx)]) 53 | 54 | self.layers['Activation_function' + str(idx)] = activation_layer[activation]() 55 | 56 | if self.use_dropout: 57 | self.layers['Dropout' + str(idx)] = Dropout(dropout_ration) 58 | 59 | idx = self.hidden_layer_num + 1 60 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)]) 61 | 62 | self.last_layer = SoftmaxWithLoss() 63 | 64 | def __init_weight(self, weight_init_std): 65 | """设定权重的初始值 66 | 67 | Parameters 68 | ---------- 69 | weight_init_std : 指定权重的标准差(e.g. 
0.01) 70 | 指定'relu'或'he'的情况下设定“He的初始值” 71 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 72 | """ 73 | all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size] 74 | for idx in range(1, len(all_size_list)): 75 | scale = weight_init_std 76 | if str(weight_init_std).lower() in ('relu', 'he'): 77 | scale = np.sqrt(2.0 / all_size_list[idx - 1]) # 使用ReLU的情况下推荐的初始值 78 | elif str(weight_init_std).lower() in ('sigmoid', 'xavier'): 79 | scale = np.sqrt(1.0 / all_size_list[idx - 1]) # 使用sigmoid的情况下推荐的初始值 80 | self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx]) 81 | self.params['b' + str(idx)] = np.zeros(all_size_list[idx]) 82 | 83 | def predict(self, x, train_flg=False): 84 | for key, layer in self.layers.items(): 85 | if "Dropout" in key or "BatchNorm" in key: 86 | x = layer.forward(x, train_flg) 87 | else: 88 | x = layer.forward(x) 89 | 90 | return x 91 | 92 | def loss(self, x, t, train_flg=False): 93 | """求损失函数 94 | 参数x是输入数据,t是教师标签 95 | """ 96 | y = self.predict(x, train_flg) 97 | 98 | weight_decay = 0 99 | for idx in range(1, self.hidden_layer_num + 2): 100 | W = self.params['W' + str(idx)] 101 | weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2) 102 | 103 | return self.last_layer.forward(y, t) + weight_decay 104 | 105 | def accuracy(self, X, T): 106 | Y = self.predict(X, train_flg=False) 107 | Y = np.argmax(Y, axis=1) 108 | if T.ndim != 1 : T = np.argmax(T, axis=1) 109 | 110 | accuracy = np.sum(Y == T) / float(X.shape[0]) 111 | return accuracy 112 | 113 | def numerical_gradient(self, X, T): 114 | """求梯度(数值微分) 115 | 116 | Parameters 117 | ---------- 118 | X : 输入数据 119 | T : 教师标签 120 | 121 | Returns 122 | ------- 123 | 具有各层的梯度的字典变量 124 | grads['W1']、grads['W2']、...是各层的权重 125 | grads['b1']、grads['b2']、...是各层的偏置 126 | """ 127 | loss_W = lambda W: self.loss(X, T, train_flg=True) 128 | 129 | grads = {} 130 | for idx in range(1, self.hidden_layer_num+2): 131 | grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)]) 132 | grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)]) 133 | 134 | if self.use_batchnorm and idx != self.hidden_layer_num+1: 135 | grads['gamma' + str(idx)] = numerical_gradient(loss_W, self.params['gamma' + str(idx)]) 136 | grads['beta' + str(idx)] = numerical_gradient(loss_W, self.params['beta' + str(idx)]) 137 | 138 | return grads 139 | 140 | def gradient(self, x, t): 141 | # forward 142 | self.loss(x, t, train_flg=True) 143 | 144 | # backward 145 | dout = 1 146 | dout = self.last_layer.backward(dout) 147 | 148 | layers = list(self.layers.values()) 149 | layers.reverse() 150 | for layer in layers: 151 | dout = layer.backward(dout) 152 | 153 | # 设定 154 | grads = {} 155 | for idx in range(1, self.hidden_layer_num+2): 156 | grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.params['W' + str(idx)] 157 | grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db 158 | 159 | if self.use_batchnorm and idx != self.hidden_layer_num+1: 160 | grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma 161 | grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta 162 | 163 | return grads -------------------------------------------------------------------------------- /深度学习入门笔记.md: -------------------------------------------------------------------------------- 1 | - 我们看着真值 2 | 3 | 表这种“训练数据”,人工考虑(想到)了参数的值。而机器学习的课 4 | 5 | 题就是将这个决定参数值的工作交由计算机自动进行。 学习是确定 6 | 7 | 合适的参数的过程,而人要做的是思考感知机的构造(模型),并把 8 | 9 | 训练数据交给计算机 10 
| 11 | 12 | 13 | - 偏置和权重w1、 w2的作用是不 14 | 15 | 一样的。具体地说, w1和w2是控制输入信号的重要性的参数,而偏置是调 16 | 17 | 整神经元被激活的容易程度(输出信号为1的程度)的参数 18 | 19 | 20 | 21 | - 感知机的局限性就在于它只能表示由一条直线分割的空间。图2-8这样弯 22 | 23 | 曲的曲线无法用感知机表示 24 | 25 | 26 | 27 | - 严格地讲,应该是“单层感知机无法 28 | 29 | 表示异或门”或者“单层感知机无法分离非线性空间”。接下来,我 30 | 31 | 们将看到通过组合感知机(叠加层)就可以实现异或门 32 | 33 | 34 | 35 | - 像这样,在异或门的感知机中,工人之间不断进行零件的传送。通过这 36 | 37 | 样的结构(2层结构),感知机得以实现异或门。这可以解释为“单层感知机 38 | 39 | 无法表示的东西,通过增加一层就可以解决”。也就是说,通过叠加层(加深 40 | 41 | 层),感知机能进行更加灵活的表示 42 | 43 | 44 | 45 | - 人们一般会认为计算机内部进行的处理非常复杂,而令人惊讶的是,实 46 | 47 | 际上只需要通过与非门的组合,就能再现计算机进行的处理 48 | 49 | 50 | 51 | - sigmoid函数的平滑性对神经网络的学习具有重要意义。 52 | 53 | - 实际上,上一章介绍的感知机和接下来要介绍 54 | 55 | 的神经网络的主要区别就在于这个激活函数。 56 | 57 | 58 | 59 | - 使用线性函数时,无法发挥多层网络带来的优势。因此,为了发挥叠加层所 60 | 61 | 带来的优势,激活函数必须使用非线性函数 62 | 63 | 64 | 65 | - ReLU函数在输入大于0时,直接输出该值;在输入小于等于0时,输 66 | 67 | 出0 68 | 69 | 70 | 71 | - 在 72 | 73 | 神经网络发展的历史上, sigmoid函数很早就开始被使用了,而最近则主要 74 | 75 | 使用ReLU(Rectifed Linear Unit)函数 76 | 77 | 78 | 79 | - 输出层所用的激活函数,要根据求解问题的性质决定。一般地,回 80 | 81 | 归问题可以使用恒等函数,二元分类问题可以使用 sigmoid函数, 82 | 83 | 多元分类问题可以使用 softmax函数。关于输出层的激活函数,我 84 | 85 | 们将在下一节详细介绍 86 | 87 | 88 | 89 | - 经网络可以用在分类问题和回归问题上,不过需要根据情况改变输出 90 | 91 | 层的激活函数。一般而言,回归问题用恒等函数,分类问题用softmax函数。 92 | 93 | 94 | 95 | - 分类问题中使用的softmax函数 96 | 97 | exp(x)是表示ex的指数函数(e是纳皮尔常数2.7182 . . .)。式(3.10)表示 98 | 99 | 假设输出层共有n个神经元,计算第k个神经元的输出yk。如式(3.10)所示, 100 | 101 | softmax函数的分子是输入信号ak的指数函数,分母是所有输入信号的指数 102 | 103 | 函数的和。 104 | 105 | 106 | 107 | - 在进行 softmax 的指数函数的运算时,加上(或者减去) 108 | 109 | 某个常数并不会改变运算的结果。这里的 C  可以使用任何值,但是为了防 110 | 111 | 止溢出,一般会使用输入信号中的最大值 112 | 113 | 114 | 115 | 116 | 117 | - softmax 函数的输出是 0.0 到 1.0 之间的实数。并且,softmax 118 | 119 | 函数的输出值的总和是 1。输出总和为 1 是 softmax 函数的一个重要性质 120 | 121 | 122 | 123 | - 一般而言,神经网络只把输出值最大的神经元所对应的类别作为识别结果。 124 | 125 | 并且,即便使用 softmax 函数,输出值最大的神经元的位置也不会变。因此, 126 | 127 | 神经网络在进行分类时,输出层的 softmax 函数可以省略 128 | 129 | 130 | 131 | - 阶跃函数就像“竹筒敲石”一样,只在某个瞬间产生变化。而 sigmoid 函数, 132 | 133 | 如图 4-4 所示,不仅函数的输出(竖轴的值)是连续变化的,曲线的斜率(导数) 134 | 135 | 也是连续变化的。也就是说,sigmoid 函数的导数在任何地方都不为 0。这对 136 | 137 | 神经网络的学习非常重要。得益于这个斜率不会为 0 的性质,神经网络的学 138 | 139 | 习得以正确进行 140 | 141 | 142 | 143 | - 我们把这里讨论的有多个变量的函数的导数称为偏导数 144 | 145 | - **梯度指示的方向 146 | 147 | 是各点处的函数值减小最多的方向 A 。这是一个非常重要的性质,请一定 148 | 149 | 牢记!** 150 | 151 | 152 | 153 | - 实验结果表明,学习率过大的话,会发散成一个很大的值;反过来,学 154 | 155 | 习率过小的话,基本上没怎么更新就结束了。也就是说,设定合适的学习率 156 | 157 | 是一个很重要的问题。 158 | 159 | 160 | 161 | - 像学习率这样的参数称为超参数。这是一种和神经网络的参数(权重 162 | 163 | 和偏置)性质不同的参数。相对于神经网络的权重参数是通过训练 164 | 165 | 数据和学习算法自动获得的,学习率这样的超参数则是人工设定的。 166 | 167 | 一般来说,超参数需要尝试多个值,以便找到一种可以使学习顺利 168 | 169 | 进行的设定。 170 | 171 | 172 | 173 | - 后面我们会详细讨论权重参数的初始化,这里只需要知道,权重使用符合高斯 174 | 175 | 分 布 的 随 机 数 进 行 初 始 化,偏 置 使 用 0 进 行 初 始 化 176 | 177 | 178 | 179 | - epoch 是一个单位。一个 epoch 表示学习中所有训练数据均被使用过 180 | 181 | 一次时的更新次数 182 | 183 | 184 | 185 | - 实线表示训练数据的识别精度,虚线表示测试数据的识别精 186 | 187 | 度。如图所示,随着 epoch 的前进(学习的进行),我们发现使用训练数据和 188 | 189 | 测试数据评价的识别精度都提高了,并且,这两个识别精度基本上没有差异(两 190 | 191 | 条线基本重叠在一起)。因此,可以说这次的学习中没有发生过拟合的现象。 192 | 193 | 194 | 195 | - 数值微分虽然费时间,但是实现起来很简单。下一章中要实现的稍 196 | 197 | 微复杂一些的误差反向传播法可以高速地计算梯度 198 | 199 | 200 | 201 | - 这里的第 2 歩“从左向右进行计算”是一种正方向上的传播,简称为正 202 | 203 | 向传播 (forward propagation)。正向传播是从计算图出发点到结束点的传播。 204 | 205 | 既然有正向传播这个名称,当然也可以考虑反向(从图上看的话,就是从右向左) 206 | 207 | 的传播。实际上,这种传播称为反向传播 (backward propagation)。反向传 208 | 209 | 播将在接下来的导数计算中发挥重要作用 210 | 211 | 212 | 213 | - 几何中,仿射变换包括一次线性变换和一次平移,分别对应神经网络的加权和运算与加偏置运算 214 | 215 | 216 | 217 | - 输入数据为张量(四维数据)的情况 218 | 219 | 220 | 221 | - 数值微分的优点是实现简单,因此,一般情况下不太容易出错。而误差 222 | 223 | 反向传播法的实现很复杂,容易出错。所以,经常会比较数值微分的结果和 224 | 225 | 
误差反向传播法的结果,以确认误差反向传播法的实现是否正确。确认数值 226 | 227 | 微分求出的梯度结果和误差反向传播法求出的结果是否一致(严格地讲,是 228 | 229 | 非常相近)的操作称为梯度确认 (gradient check) 230 | 231 | 232 | 233 | - 如果我们把权重初始值全部设为 0 以减小权重的值,会怎么样呢?从结 234 | 235 | 论来说,将权重初始值设为 0 不是一个好主意。事实上,将权重初始值设为 236 | 237 | 0 的话,将无法正确进行学习。 238 | 239 | 240 | 241 | - 这里使用的 sigmoid 242 | 243 | 函数是 S 型函数,随着输出不断地靠近 0 (或者靠近 1),它的导数的值逐渐接 244 | 245 | 近 0。因此,偏向 0 和 1 的数据分布会造成反向传播中梯度的值不断变小,最 246 | 247 | 后消失。这个问题称为梯度消失 (gradient vanishing)。层次加深的深度学习 248 | 249 | 中,梯度消失的问题可能会更加严重 250 | 251 | 252 | 253 | - 各层的激活值的分布都要求有适当的广度。为什么呢?因为通过 254 | 255 | 在各层间传递多样性的数据,神经网络可以进行高效的学习。反 256 | 257 | 过来,如果传递的是有所偏向的数据,就会出现梯度消失或者“表 258 | 259 | 现力受限”的问题,导致学习可能无法顺利进行。 260 | 261 | 262 | 263 | - 机器学习中经常使用集成学习。所谓集成学习,就是让多个模型单 264 | 265 | 独进行学习,推理时再取多个模型的输出的平均值。 266 | 267 | 实验告诉我们,通过进行集成学习,神经网络的识别精度可以提高好几个百分点 268 | 269 | 270 | 271 | - 除了权重和偏置等参数,超参数 (hyper-parameter)也经 272 | 273 | 常出现。这里所说的超参数是指,比如各层的神经元数量、batch 大小、参 274 | 275 | 数更新时的学习率或权值衰减等。如果这些超参数没有设置合适的值,模型 276 | 277 | 的性能就会很差。 278 | 279 | 280 | 281 | - 不能使用测试数据评估超参数的性能。 282 | 283 | 为什么不能用测试数据评估超参数的性能呢?这是因为如果使用测试数 284 | 285 | 据调整超参数,超参数的值会对测试数据发生过拟合。换句话说,用测试数 286 | 287 | 据确认超参数的值的“好坏”,就会导致超参数的值被调整为只拟合测试数据。 288 | 289 | 这样的话,可能就会得到不能拟合其他数据、泛化能力低的模型。 290 | 291 | 292 | 293 | - 调整超参数时,必须使用超参数专用的确认数据。用于调整超参 294 | 295 | 数的数据,一般称为验证数据 (validation data)。我们使用这个验证数据来 296 | 297 | 评估超参数的好坏 298 | 299 | 300 | 301 | - 分割训练数据前,先打乱了输入数据和教师标签。这是因为数据 302 | 303 | 集的数据可能存在偏向(比如,数据从“0”到“10”按顺序排列等)。 304 | 305 | np.random.shuffle(x) 这个函数会改变x的值,重新赋值。 306 | 307 | 308 | 309 | - permutation = np.random.permutation(x.shape[0]) 返回0到x.shape[0] 的排列 310 | 311 | - 有报告 [15] 显示,在进行神经网络的超参数的最优化时,与网格搜索 312 | 313 | 等有规律的搜索相比,随机采样的搜索方式效果更好。这是因为在 314 | 315 | 多个超参数中,各个超参数对最终的识别精度的影响程度不同。 316 | 317 | 318 | 319 | - 以上就是超参数的最优化的内容,简单归纳一下,如下所示。 320 | 321 | * 步骤 0 322 | 323 | 设定超参数的范围。 324 | 325 | * 步骤 1 326 | 327 | 从设定的超参数范围中随机采样。 328 | 329 | * 步骤 2 330 | 331 | 使用步骤 1 中采样到的超参数的值进行学习,通过验证数据评估识别精 332 | 333 | 度(但是要将 epoch 设置得很小) 。 334 | 335 | * 步骤 3 336 | 337 | 重复步骤 1 和步骤 2 (100 次等),根据它们的识别精度的结果,缩小超参 338 | 339 | 数的范围。 340 | 341 | 342 | 343 | 反复进行上述操作,不断缩小超参数的范围,在缩小到一定程度时,从 344 | 345 | 该范围中选出一个超参数的值。这就是进行超参数的最优化的一种方法。 346 | 347 | 348 | 349 | * 参 数 的 更 新 方 法,除 了 SGD 之 外,还 有 Momentum、AdaGrad、 350 | 351 | Adam 等方法。 352 | 353 | * 权重初始值的赋值方法对进行正确的学习非常重要。 354 | 355 | * 作为权重初始值,Xavier 初始值、He 初始值等比较有效。 356 | 357 | * 通过使用 Batch Normalization,可以加速学习,并且对初始值变得 358 | 359 | 健壮。 360 | 361 | * 抑制过拟合的正则化技术有权值衰减、Dropout 等。 362 | 363 | * 逐渐缩小“好值”存在的范围是搜索超参数的一个有效方法 364 | 365 | 366 | 367 | - 全连接层存在什么问题呢?那就是数据的形状被“忽视”了。比如,输 368 | 369 | 入数据是图像时,图像通常是高、长、通道方向上的 3 维形状。但是,向全 370 | 371 | 连接层输入时,需要将 3 维数据拉平为 1 维数据。 372 | 373 | 374 | 375 | 图像是 3 维形状,这个形状中应该含有重要的空间信息。比如,空间上 376 | 377 | 邻近的像素为相似的值、RBG 的各个通道之间分别有密切的关联性、相距 378 | 379 | 较远的像素之间没有什么关联等,3 维形状中可能隐藏有值得提取的本质模 380 | 381 | 式。但是,因为全连接层会忽视形状,将全部的输入数据作为相同的神经元 382 | 383 | (同一维度的神经元)处理,所以无法利用与形状相关的信息。 384 | 385 | 386 | 387 | - 通过填充,大小为 (4, 4) 的输入数据变成了 (6, 6) 的形状。 388 | 389 | 然后,应用大小为 (3, 3) 的滤波器,生成了大小为 (4, 4) 的输出数据。这个例 390 | 391 | 子中将填充设成了 1,不过填充的值也可以设置成 2、 3 等任意的整数。在图 7-5 392 | 393 | 的例子中,如果将填充设为 2,则输入数据的大小变为 (8, 8);如果将填充设 394 | 395 | 为 3,则大小变为 (10, 10)。 396 | 397 | 这个填充是指 填充的个数。 填充的内容为0 398 | 399 | 400 | 401 | - 使用填充主要是为了调整输出的大小。比如,对大小为 (4, 4) 的输入 402 | 403 | 数据应用 (3, 3) 的滤波器时,输出大小变为 (2, 2),相当于输出大小 404 | 405 | 比输入大小缩小了 2 个元素。这在反复进行多次卷积运算的深度网 406 | 407 | 络中会成为问题。为什么呢?因为如果每次进行卷积运算都会缩小 408 | 409 | 空间,那么在某个时刻输出大小就有可能变为 1,导致无法再应用 410 | 411 | 卷积运算。为了避免出现这样的情况,就要使用填充。在刚才的例 412 | 413 | 子中,将填充的幅度设为 1,那么相对于输入大小 (4, 4),输出大小 414 | 415 | 也保持为原来的 (4, 4)。因此,卷积运算就可以在保持空间大小不变 416 | 417 | 
的情况下将数据传给下一层。 418 | 419 | 420 | 421 | 422 | 423 | - 在 3 维数据的卷积运算中,输入数据和滤波器的通道数 424 | 425 | 要设为相同的值。在这个例子中,输入数据和滤波器的通道数一致,均为 3。 426 | 427 | 滤波器大小可以设定为任意值(不过,每个通道的滤波器大小要全部相同)。 428 | 429 | 这个例子中滤波器大小为 (3, 3),但也可以设定为 (2, 2)、(1, 1)、(5, 5) 等任 430 | 431 | 意值。再强调一下,通道数只能设定为和输入数据的通道数相同的值(本例 432 | 433 | 中为 3)。 434 | 435 | 436 | -------------------------------------------------------------------------------- /common/layers.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | from common.functions import * 4 | from common.util import im2col, col2im 5 | import time 6 | 7 | class Relu: 8 | def __init__(self): 9 | self.mask = None 10 | 11 | def forward(self, x): 12 | self.mask = (x <= 0) 13 | out = x.copy() 14 | out[self.mask] = 0 15 | 16 | return out 17 | 18 | def backward(self, dout): 19 | dout[self.mask] = 0 20 | dx = dout 21 | 22 | return dx 23 | 24 | 25 | class Sigmoid: 26 | def __init__(self): 27 | self.out = None 28 | 29 | def forward(self, x): 30 | out = sigmoid(x) 31 | self.out = out 32 | return out 33 | 34 | def backward(self, dout): 35 | dx = dout * (1.0 - self.out) * self.out 36 | 37 | return dx 38 | 39 | 40 | class Affine: 41 | def __init__(self, W, b): 42 | self.W =W 43 | self.b = b 44 | 45 | self.x = None 46 | self.original_x_shape = None 47 | # 权重和偏置参数的导数 48 | self.dW = None 49 | self.db = None 50 | 51 | def forward(self, x): 52 | # 对应张量 53 | self.original_x_shape = x.shape 54 | #print('before ---->',x.shape) 55 | x = x.reshape(x.shape[0], -1) 56 | #print('after ---->',x.shape) 57 | self.x = x 58 | 59 | out = np.dot(self.x, self.W) + self.b 60 | 61 | return out 62 | 63 | def backward(self, dout): 64 | dx = np.dot(dout, self.W.T) 65 | self.dW = np.dot(self.x.T, dout) 66 | self.db = np.sum(dout, axis=0) 67 | 68 | dx = dx.reshape(*self.original_x_shape) # 还原输入数据的形状(对应张量) 69 | return dx 70 | 71 | 72 | class SoftmaxWithLoss: 73 | def __init__(self): 74 | self.loss = None 75 | self.y = None # softmax的输出 76 | self.t = None # 监督数据 77 | 78 | def forward(self, x, t): 79 | self.t = t 80 | self.y = softmax(x) 81 | self.loss = cross_entropy_error(self.y, self.t) 82 | 83 | return self.loss 84 | 85 | def backward(self, dout=1): 86 | batch_size = self.t.shape[0] 87 | if self.t.size == self.y.size: # 监督数据是one-hot-vector的情况 88 | dx = (self.y - self.t) / batch_size 89 | else: 90 | dx = self.y.copy() 91 | dx[np.arange(batch_size), self.t] -= 1 92 | dx = dx / batch_size 93 | 94 | return dx 95 | 96 | 97 | class Dropout: 98 | """ 99 | http://arxiv.org/abs/1207.0580 100 | """ 101 | def __init__(self, dropout_ratio=0.5): 102 | self.dropout_ratio = dropout_ratio 103 | self.mask = None 104 | 105 | def forward(self, x, train_flg=True): 106 | if train_flg: 107 | self.mask = np.random.rand(*x.shape) > self.dropout_ratio 108 | return x * self.mask 109 | else: 110 | return x * (1.0 - self.dropout_ratio) 111 | 112 | def backward(self, dout): 113 | return dout * self.mask 114 | 115 | # 归一化层 116 | class BatchNormalization: 117 | """ 118 | http://arxiv.org/abs/1502.03167 119 | """ 120 | def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None): 121 | self.gamma = gamma 122 | self.beta = beta 123 | self.momentum = momentum 124 | self.input_shape = None # Conv层的情况下为4维,全连接层的情况下为2维 125 | 126 | # 测试时使用的平均值和方差 127 | self.running_mean = running_mean 128 | self.running_var = running_var 129 | 130 | # backward时使用的中间数据 131 | self.batch_size = None 132 | self.xc = None 133 | self.std = None 134 | self.dgamma = None 135 | self.dbeta 
= None 136 | 137 | def forward(self, x, train_flg=True): 138 | self.input_shape = x.shape 139 | if x.ndim != 2: 140 | N, C, H, W = x.shape 141 | x = x.reshape(N, -1) 142 | 143 | out = self.__forward(x, train_flg) 144 | 145 | return out.reshape(*self.input_shape) 146 | 147 | def __forward(self, x, train_flg): 148 | if self.running_mean is None: 149 | N, D = x.shape 150 | self.running_mean = np.zeros(D) 151 | self.running_var = np.zeros(D) 152 | 153 | if train_flg: 154 | mu = x.mean(axis=0) 155 | xc = x - mu 156 | var = np.mean(xc**2, axis=0) 157 | std = np.sqrt(var + 10e-7) 158 | xn = xc / std 159 | 160 | self.batch_size = x.shape[0] 161 | self.xc = xc 162 | self.xn = xn 163 | self.std = std 164 | self.running_mean = self.momentum * self.running_mean + (1-self.momentum) * mu 165 | self.running_var = self.momentum * self.running_var + (1-self.momentum) * var 166 | else: 167 | xc = x - self.running_mean 168 | xn = xc / ((np.sqrt(self.running_var + 10e-7))) 169 | 170 | out = self.gamma * xn + self.beta 171 | return out 172 | 173 | def backward(self, dout): 174 | if dout.ndim != 2: 175 | N, C, H, W = dout.shape 176 | dout = dout.reshape(N, -1) 177 | 178 | dx = self.__backward(dout) 179 | 180 | dx = dx.reshape(*self.input_shape) 181 | return dx 182 | 183 | def __backward(self, dout): 184 | dbeta = dout.sum(axis=0) 185 | dgamma = np.sum(self.xn * dout, axis=0) 186 | dxn = self.gamma * dout 187 | dxc = dxn / self.std 188 | dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0) 189 | dvar = 0.5 * dstd / self.std 190 | dxc += (2.0 / self.batch_size) * self.xc * dvar 191 | dmu = np.sum(dxc, axis=0) 192 | dx = dxc - dmu / self.batch_size 193 | 194 | self.dgamma = dgamma 195 | self.dbeta = dbeta 196 | 197 | return dx 198 | 199 | 200 | class Convolution: 201 | def __init__(self, W, b, stride=1, pad=0): 202 | self.W = W 203 | self.b = b 204 | self.stride = stride 205 | self.pad = pad 206 | 207 | # 中间数据(backward时使用) 208 | self.x = None 209 | self.col = None 210 | self.col_W = None 211 | 212 | # 权重和偏置参数的梯度 213 | self.dW = None 214 | self.db = None 215 | 216 | def forward(self, x): 217 | FN, C, FH, FW = self.W.shape 218 | N, C, H, W = x.shape 219 | out_h = 1 + int((H + 2*self.pad - FH) / self.stride) 220 | out_w = 1 + int((W + 2*self.pad - FW) / self.stride) 221 | 222 | col = im2col(x, FH, FW, self.stride, self.pad) 223 | col_W = self.W.reshape(FN, -1).T 224 | 225 | out = np.dot(col, col_W) + self.b 226 | out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2) 227 | 228 | self.x = x 229 | self.col = col 230 | self.col_W = col_W 231 | 232 | return out 233 | 234 | def backward(self, dout): 235 | FN, C, FH, FW = self.W.shape 236 | dout = dout.transpose(0,2,3,1).reshape(-1, FN) 237 | 238 | self.db = np.sum(dout, axis=0) 239 | self.dW = np.dot(self.col.T, dout) 240 | self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW) 241 | 242 | dcol = np.dot(dout, self.col_W.T) 243 | dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad) 244 | 245 | return dx 246 | 247 | 248 | class Pooling: 249 | def __init__(self, pool_h, pool_w, stride=1, pad=0): 250 | self.pool_h = pool_h 251 | self.pool_w = pool_w 252 | self.stride = stride 253 | self.pad = pad 254 | 255 | self.x = None 256 | self.arg_max = None 257 | 258 | def forward(self, x): 259 | N, C, H, W = x.shape 260 | out_h = int(1 + (H - self.pool_h) / self.stride) 261 | out_w = int(1 + (W - self.pool_w) / self.stride) 262 | 263 | col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad) 264 | col = col.reshape(-1, self.pool_h*self.pool_w) 265 
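        # After the reshape, each row of col holds one pool_h*pool_w window, so the
        # row-wise max below is the pooling result; arg_max remembers which element
        # won, and backward() uses it to route dout back to that position.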
| 266 | arg_max = np.argmax(col, axis=1) 267 | out = np.max(col, axis=1) 268 | out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2) 269 | 270 | self.x = x 271 | self.arg_max = arg_max 272 | 273 | return out 274 | 275 | def backward(self, dout): 276 | dout = dout.transpose(0, 2, 3, 1) 277 | 278 | pool_size = self.pool_h * self.pool_w 279 | dmax = np.zeros((dout.size, pool_size)) 280 | dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten() 281 | dmax = dmax.reshape(dout.shape + (pool_size,)) 282 | 283 | dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1) 284 | dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad) 285 | 286 | return dx 287 | --------------------------------------------------------------------------------
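Note: the Convolution and Pooling layers above are both built on im2col/col2im. A quick sanity check of the forward shapes they produce is sketched below; the input size, filter count, and stride/pad values are arbitrary choices for illustration, and the script is assumed to be run from one of the chapter directories so that the common package is importable.

# Shape check for Convolution and Pooling on random data.
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import Convolution, Pooling

x = np.random.randn(2, 3, 28, 28)           # (N, C, H, W)
W = 0.01 * np.random.randn(5, 3, 3, 3)      # 5 filters, each 3 channels x 3 x 3
b = np.zeros(5)

conv = Convolution(W, b, stride=1, pad=1)   # pad=1 keeps the 28x28 spatial size
out = conv.forward(x)
print(out.shape)                            # -> (2, 5, 28, 28)

pool = Pooling(pool_h=2, pool_w=2, stride=2)
print(pool.forward(out).shape)              # -> (2, 5, 14, 14)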