├── common
│   ├── __init__.py
│   ├── functions.py
│   ├── gradient.py
│   ├── util.py
│   ├── trainer.py
│   ├── optimizer.py
│   ├── multi_layer_net.py
│   ├── multi_layer_net_extend.py
│   └── layers.py
├── dataset
│   ├── __init__.py
│   └── mnist.py
├── ch01
│   ├── hungry.py
│   ├── img_show.py
│   ├── sin_graph.py
│   ├── simple_graph.py
│   ├── man.py
│   └── sin_cos_graph.py
├── ch02
│   ├── xor_gate.py
│   ├── or_gate.py
│   ├── and_gate.py
│   └── nand_gate.py
├── ch03
│   ├── relu.py
│   ├── sigmoid.py
│   ├── step_function.py
│   ├── sig_step_compare.py
│   ├── mnist_show.py
│   ├── neuralnet_mnist.py
│   └── neuralnet_mnist_batch.py
├── ch04
│   ├── gradient_1d.py
│   ├── gradient_simplenet.py
│   ├── gradient_method.py
│   ├── gradient_2d.py
│   ├── train_neuralnet.py
│   └── two_layer_net.py
├── ch05
│   ├── buy_apple.py
│   ├── layer_naive.py
│   ├── gradient_check.py
│   ├── buy_apple_orange.py
│   ├── train_neuralnet.py
│   └── two_layer_net.py
├── ch06
│   ├── batch_norm_gradient_check.py
│   ├── weight_init_activation_histogram.py
│   ├── overfit_dropout.py
│   ├── optimizer_compare_naive.py
│   ├── weight_init_compare.py
│   ├── optimizer_compare_mnist.py
│   ├── overfit_weight_decay.py
│   ├── hyperparameter_optimization.py
│   └── batch_norm_test.py
├── ch07
│   ├── gradient_check.py
│   ├── visualize_filter.py
│   ├── train_convnet.py
│   ├── apply_filter.py
│   └── simple_convnet.py
├── ch08
│   ├── awesome_net.py
│   ├── train_deepnet.py
│   ├── half_float_network.py
│   ├── misclassified_mnist.py
│   └── deep_convnet.py
├── .gitignore
├── README.md
├── LICENSE.md
└── 深度学习入门笔记.md

/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ch01/hungry.py:
--------------------------------------------------------------------------------
1 | print("I'm hungry!")
2 |
--------------------------------------------------------------------------------
/ch08/awesome_net.py:
--------------------------------------------------------------------------------
1 | # Create your awesome net!!
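
# ——补充示例(并非书中原代码,只是一个假设性的草图)——
# awesome_net.py 原本只有上面一行注释,留给读者自己实现网络。
# 下面演示如何像 ch05/two_layer_net.py 那样,用 common/layers.py 中现成的
# Affine / Relu / SoftmaxWithLoss 层拼出一个最小的两层网络,并做一次前向传播:

import sys, os
sys.path.append(os.pardir)  # 为了导入父目录的文件而进行的设定
import numpy as np
from collections import OrderedDict
from common.layers import Affine, Relu, SoftmaxWithLoss

layers = OrderedDict()
layers['Affine1'] = Affine(0.01 * np.random.randn(784, 100), np.zeros(100))
layers['Relu1'] = Relu()
layers['Affine2'] = Affine(0.01 * np.random.randn(100, 10), np.zeros(10))
last_layer = SoftmaxWithLoss()  # 损失层,训练时才会用到

x = np.random.rand(2, 784)      # 两个随机样本作为输入
for layer in layers.values():   # 依次前向传播
    x = layer.forward(x)
print(x.shape)                  # (2, 10)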
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *~ 3 | *.gz 4 | *.pyc 5 | __pycache__/ 6 | *.tar 7 | *.tgz 8 | *.png 9 | *.jpg 10 | *.pkl 11 | -------------------------------------------------------------------------------- /ch01/img_show.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import matplotlib.pyplot as plt 3 | from matplotlib.image import imread 4 | 5 | img = imread('../dataset/lena.png') #读入图像 6 | plt.imshow(img) 7 | 8 | plt.show() -------------------------------------------------------------------------------- /ch01/sin_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) 7 | y = np.sin(x) 8 | 9 | # 绘制图形 10 | plt.plot(x, y) 11 | plt.show() 12 | -------------------------------------------------------------------------------- /ch01/simple_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) # 以0.1为单位,生成0到6的数据 7 | y = np.sin(x) 8 | 9 | # 绘制图形 10 | plt.plot(x, y) 11 | plt.show() -------------------------------------------------------------------------------- /ch03/relu.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def relu(x): 7 | return np.maximum(0, x) 8 | 9 | x = np.arange(-5.0, 5.0, 0.1) 10 | y = relu(x) 11 | plt.plot(x, y) 12 | plt.ylim(-1.0, 5.5) 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch03/sigmoid.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | X = np.arange(-5.0, 5.0, 0.1) 10 | Y = sigmoid(X) 11 | plt.plot(X, Y) 12 | plt.ylim(-0.1, 1.1) 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch03/step_function.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def step_function(x): 7 | return np.array(x > 0, dtype=np.int) 8 | 9 | X = np.arange(-5.0, 5.0, 0.1) 10 | Y = step_function(X) 11 | plt.plot(X, Y) 12 | plt.ylim(-0.1, 1.1) # 指定图中绘制的y轴的范围 13 | plt.show() 14 | -------------------------------------------------------------------------------- /ch01/man.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | class Man: 3 | """示例类""" # 示例类 4 | 5 | def __init__(self, name): 6 | self.name = name 7 | print("Initilized!") 8 | 9 | def hello(self): 10 | print("Hello " + self.name + "!") 11 | 12 | def goodbye(self): 13 | print("Good-bye " + self.name + "!") 14 | 15 | m = Man("David") 16 | m.hello() 17 | m.goodbye() -------------------------------------------------------------------------------- /ch02/xor_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from and_gate import AND 3 | from or_gate import OR 4 | from 
nand_gate import NAND 5 | 6 | 7 | def XOR(x1, x2): 8 | s1 = NAND(x1, x2) 9 | s2 = OR(x1, x2) 10 | y = AND(s1, s2) 11 | return y 12 | 13 | if __name__ == '__main__': 14 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 15 | y = XOR(xs[0], xs[1]) 16 | print(str(xs) + " -> " + str(y)) -------------------------------------------------------------------------------- /ch01/sin_cos_graph.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # 生成数据 6 | x = np.arange(0, 6, 0.1) # 以0.1为单位,生成0到6的数据 7 | y1 = np.sin(x) 8 | y2 = np.cos(x) 9 | 10 | # 绘制图形 11 | plt.plot(x, y1, label="sin") 12 | plt.plot(x, y2, linestyle = "--", label="cos") 13 | plt.xlabel("x") # x轴的标签 14 | plt.ylabel("y") # y轴的标签 15 | plt.title('sin & cos') 16 | plt.legend() 17 | plt.show() -------------------------------------------------------------------------------- /ch02/or_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def OR(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([0.5, 0.5]) 8 | b = -0.2 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = OR(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) -------------------------------------------------------------------------------- /ch02/and_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def AND(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([0.5, 0.5]) 8 | b = -0.7 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = AND(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) 19 | -------------------------------------------------------------------------------- /ch02/nand_gate.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def NAND(x1, x2): 6 | x = np.array([x1, x2]) 7 | w = np.array([-0.5, -0.5]) 8 | b = 0.7 9 | tmp = np.sum(w*x) + b 10 | if tmp <= 0: 11 | return 0 12 | else: 13 | return 1 14 | 15 | if __name__ == '__main__': 16 | for xs in [(0, 0), (1, 0), (0, 1), (1, 1)]: 17 | y = NAND(xs[0], xs[1]) 18 | print(str(xs) + " -> " + str(y)) 19 | -------------------------------------------------------------------------------- /ch03/sig_step_compare.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def step_function(x): 11 | return np.array(x > 0, dtype=np.int) 12 | 13 | x = np.arange(-5.0, 5.0, 0.1) 14 | y1 = sigmoid(x) 15 | y2 = step_function(x) 16 | 17 | plt.plot(x, y1) 18 | plt.plot(x, y2, 'k--') 19 | plt.ylim(-0.1, 1.1) #指定图中绘制的y轴的范围 20 | plt.show() 21 | -------------------------------------------------------------------------------- /ch03/mnist_show.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from PIL import Image 7 | 8 | 9 | def img_show(img): 10 | 
pil_img = Image.fromarray(np.uint8(img)) 11 | pil_img.show() 12 | 13 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=True, normalize=False) 14 | 15 | img = x_train[0] 16 | label = t_train[0] 17 | print(label) # 5 18 | 19 | print(img.shape) # (784,) 20 | img = img.reshape(28, 28) # 把图像的形状变为原来的尺寸 21 | print(img.shape) # (28, 28) 22 | 23 | img_show(img) 24 | -------------------------------------------------------------------------------- /ch05/buy_apple.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from layer_naive import * 3 | 4 | 5 | apple = 100 6 | apple_num = 2 7 | tax = 1.1 8 | 9 | mul_apple_layer = MulLayer() 10 | mul_tax_layer = MulLayer() 11 | 12 | # forward 13 | apple_price = mul_apple_layer.forward(apple, apple_num) 14 | price = mul_tax_layer.forward(apple_price, tax) 15 | 16 | # backward 17 | dprice = 1 18 | dapple_price, dtax = mul_tax_layer.backward(dprice) 19 | dapple, dapple_num = mul_apple_layer.backward(dapple_price) 20 | 21 | print("price:", int(price)) 22 | print("dApple:", dapple) 23 | print("dApple_num:", int(dapple_num)) 24 | print("dTax:", dtax) 25 | -------------------------------------------------------------------------------- /ch07/gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | from simple_convnet import SimpleConvNet 4 | 5 | network = SimpleConvNet(input_dim=(1,10, 10), 6 | conv_param = {'filter_num':10, 'filter_size':3, 'pad':0, 'stride':1}, 7 | hidden_size=10, output_size=10, weight_init_std=0.01) 8 | 9 | X = np.random.rand(100).reshape((1, 1, 10, 10)) 10 | T = np.array([1]).reshape((1,1)) 11 | 12 | grad_num = network.numerical_gradient(X, T) 13 | grad = network.gradient(X, T) 14 | 15 | for key, val in grad_num.items(): 16 | print(key, np.abs(grad_num[key] - grad[key]).mean()) -------------------------------------------------------------------------------- /ch04/gradient_1d.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pylab as plt 4 | 5 | 6 | def numerical_diff(f, x): 7 | h = 1e-4 # 0.0001 8 | return (f(x+h) - f(x-h)) / (2*h) 9 | 10 | 11 | def function_1(x): 12 | return 0.01*x**2 + 0.1*x 13 | 14 | 15 | def tangent_line(f, x): 16 | d = numerical_diff(f, x) 17 | print(d) 18 | y = f(x) - d*x # 计算的截距 19 | return lambda t: d*t + y 20 | 21 | x = np.arange(0.0, 20.0, 0.1) 22 | y = function_1(x) 23 | plt.xlabel("x") 24 | plt.ylabel("f(x)") 25 | 26 | tf = tangent_line(function_1, 5) 27 | y2 = tf(x) 28 | 29 | plt.plot(x, y) 30 | plt.plot(x, y2) 31 | plt.show() 32 | -------------------------------------------------------------------------------- /ch05/layer_naive.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | class MulLayer: 5 | def __init__(self): 6 | self.x = None 7 | self.y = None 8 | 9 | def forward(self, x, y): 10 | self.x = x 11 | self.y = y 12 | out = x * y 13 | 14 | return out 15 | 16 | def backward(self, dout): 17 | dx = dout * self.y 18 | dy = dout * self.x 19 | 20 | return dx, dy 21 | 22 | 23 | class AddLayer: 24 | def __init__(self): 25 | pass 26 | 27 | def forward(self, x, y): 28 | out = x + y 29 | 30 | return out 31 | 32 | def backward(self, dout): 33 | dx = dout * 1 34 | dy = dout * 1 35 | 36 | return dx, dy 37 | -------------------------------------------------------------------------------- /ch08/train_deepnet.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from deep_convnet import DeepConvNet 8 | from common.trainer import Trainer 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 14 | epochs=20, mini_batch_size=100, 15 | optimizer='Adam', optimizer_param={'lr':0.001}, 16 | evaluate_sample_num_per_epoch=1000) 17 | trainer.train() 18 | 19 | # 保存参数 20 | network.save_params("deep_convnet_params.pkl") 21 | print("Saved Network Parameters!") 22 | -------------------------------------------------------------------------------- /ch04/gradient_simplenet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录中的文件而进行的设定 4 | import numpy as np 5 | from common.functions import softmax, cross_entropy_error 6 | from common.gradient import numerical_gradient 7 | 8 | 9 | class simpleNet: 10 | def __init__(self): 11 | self.W = np.random.randn(2,3) 12 | 13 | def predict(self, x): 14 | return np.dot(x, self.W) 15 | 16 | def loss(self, x, t): 17 | z = self.predict(x) 18 | y = softmax(z) 19 | loss = cross_entropy_error(y, t) 20 | 21 | return loss 22 | 23 | x = np.array([0.6, 0.9]) 24 | t = np.array([0, 0, 1]) 25 | 26 | net = simpleNet() 27 | 28 | f = lambda w: net.loss(x, t) 29 | dW = numerical_gradient(f, net.W) 30 | 31 | print(dW) 32 | -------------------------------------------------------------------------------- /ch05/gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from two_layer_net import TwoLayerNet 7 | 8 | # 读入数据 9 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 10 | 11 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 12 | print(x_train.shape) 13 | x_batch = x_train[:3] 14 | t_batch = t_train[:3] 15 | print(x_batch.shape) 16 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 17 | #grad_backprop = network.gradient(x_batch, t_batch) 18 | 19 | #for key in grad_numerical.keys(): 20 | # diff = np.average( np.abs(grad_backprop[key] - grad_numerical[key]) ) 21 | # print(key + ":" + str(diff)) 22 | -------------------------------------------------------------------------------- /ch06/batch_norm_gradient_check.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from dataset.mnist import load_mnist 6 | from common.multi_layer_net_extend import MultiLayerNetExtend 7 | 8 | # 读入数据 9 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 10 | 11 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100], output_size=10, 12 | use_batchnorm=True) 13 | 14 | x_batch = x_train[:1] 15 | t_batch = t_train[:1] 16 | 17 | grad_backprop = network.gradient(x_batch, t_batch) 18 | grad_numerical = network.numerical_gradient(x_batch, t_batch) 19 | 20 | 21 | for key in grad_numerical.keys(): 22 | diff = np.average( 
np.abs(grad_backprop[key] - grad_numerical[key]) ) 23 | print(key + ":" + str(diff)) -------------------------------------------------------------------------------- /ch08/half_float_network.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from deep_convnet import DeepConvNet 7 | from dataset.mnist import load_mnist 8 | 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | network.load_params("deep_convnet_params.pkl") 14 | 15 | sampled = 10000 # 为了实现高速化 16 | x_test = x_test[:sampled] 17 | t_test = t_test[:sampled] 18 | 19 | print("caluculate accuracy (float64) ... ") 20 | print(network.accuracy(x_test, t_test)) 21 | 22 | # 转换为float16型 23 | x_test = x_test.astype(np.float16) 24 | for param in network.params.values(): 25 | param[...] = param.astype(np.float16) 26 | 27 | print("caluculate accuracy (float16) ... ") 28 | print(network.accuracy(x_test, t_test)) 29 | -------------------------------------------------------------------------------- /ch07/visualize_filter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from simple_convnet import SimpleConvNet 5 | 6 | def filter_show(filters, nx=8, margin=3, scale=10): 7 | """ 8 | c.f. https://gist.github.com/aidiary/07d530d5e08011832b12#file-draw_weight-py 9 | """ 10 | FN, C, FH, FW = filters.shape 11 | ny = int(np.ceil(FN / nx)) 12 | 13 | fig = plt.figure() 14 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 15 | 16 | for i in range(FN): 17 | ax = fig.add_subplot(ny, nx, i+1, xticks=[], yticks=[]) 18 | ax.imshow(filters[i, 0], cmap=plt.cm.gray_r, interpolation='nearest') 19 | plt.show() 20 | 21 | 22 | network = SimpleConvNet() 23 | # 随机进行初始化后的权重 24 | filter_show(network.params['W1']) 25 | 26 | # 学习后的权重 27 | network.load_params("params.pkl") 28 | filter_show(network.params['W1']) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 深度学习入门 2 | 3 | 4 | 5 | ## 文件结构 6 | 7 | |文件夹名 |说明 | 8 | |:-- |:-- | 9 | |ch01 |第1章使用的源代码 | 10 | |ch02 |第2章使用的源代码 | 11 | |... |... 
|
12 | |ch08 |第8章使用的源代码 |
13 | |common |共同使用的源代码 |
14 | |dataset |数据集用的源代码 |
15 |
16 |
17 | 源代码的解释请参考本书。
18 |
19 | ## 必要条件
20 | 执行源代码需要安装以下软件。
21 |
22 | * Python 3.x
23 | * NumPy
24 | * Matplotlib
25 |
26 | ※Python的版本为Python 3。
27 |
28 | ## 执行方法
29 |
30 | 前进到各章节的文件夹,执行Python命令。
31 |
32 | ```
33 | $ cd ch01
34 | $ python man.py
35 |
36 | $ cd ../ch05
37 | $ python train_neuralnet.py
38 | ```
39 |
40 | ## 使用许可
41 |
42 | 本源代码使用[MIT许可协议](http://www.opensource.org/licenses/MIT)。
43 | 无论是否为商业行为,均可自由使用。
44 |
45 | ## 勘误表
46 |
47 | 本书的勘误信息在以下网址中公开。读者可以在以下网址中查看和提交勘误。
48 |
49 | http://www.ituring.com.cn/book/1921
50 |
51 |
52 |
--------------------------------------------------------------------------------
/ch04/gradient_method.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 | import matplotlib.pylab as plt
4 | from gradient_2d import numerical_gradient
5 |
6 |
7 | def gradient_descent(f, init_x, lr=0.01, step_num=100):
8 |     x = init_x
9 |     x_history = []
10 |
11 |     for i in range(step_num):
12 |         x_history.append( x.copy() )
13 |
14 |         grad = numerical_gradient(f, x)
15 |         x -= lr * grad
16 |
17 |     return x, np.array(x_history)
18 |
19 |
20 | def function_2(x):
21 |     return x[0]**2 + x[1]**2
22 |
23 | init_x = np.array([-3.0, 4.0])
24 |
25 | lr = 0.1
26 | step_num = 20
27 | x, x_history = gradient_descent(function_2, init_x, lr=lr, step_num=step_num)
28 |
29 | plt.plot( [-5, 5], [0,0], '--b')
30 | plt.plot( [0,0], [-5, 5], '--b')
31 | plt.plot(x_history[:,0], x_history[:,1], 'o')
32 |
33 | plt.xlim(-3.5, 3.5)
34 | plt.ylim(-4.5, 4.5)
35 | plt.xlabel("X0")
36 | plt.ylabel("X1")
37 | plt.show()
38 |
--------------------------------------------------------------------------------
/ch05/buy_apple_orange.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | from layer_naive import *
3 |
4 | apple = 100
5 | apple_num = 2
6 | orange = 150
7 | orange_num = 3
8 | tax = 1.1
9 |
10 | # layer
11 | mul_apple_layer = MulLayer()
12 | mul_orange_layer = MulLayer()
13 | add_apple_orange_layer = AddLayer()
14 | mul_tax_layer = MulLayer()
15 |
16 | # forward
17 | apple_price = mul_apple_layer.forward(apple, apple_num)  # (1)
18 | orange_price = mul_orange_layer.forward(orange, orange_num)  # (2)
19 | all_price = add_apple_orange_layer.forward(apple_price, orange_price)  # (3)
20 | price = mul_tax_layer.forward(all_price, tax)  # (4)
21 |
22 | # backward
23 | dprice = 1
24 | dall_price, dtax = mul_tax_layer.backward(dprice)  # (4)
25 | dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price)  # (3)
26 | dorange, dorange_num = mul_orange_layer.backward(dorange_price)  # (2)
27 | dapple, dapple_num = mul_apple_layer.backward(dapple_price)  # (1)
28 |
29 | print("price:", int(price))
30 | print("dApple:", dapple)
31 | print("dApple_num:", int(dapple_num))
32 | print("dOrange:", dorange)
33 | print("dOrange_num:", int(dorange_num))
34 | print("dTax:", dtax)
35 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Koki Saitoh
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ch03/neuralnet_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import pickle 6 | from dataset.mnist import load_mnist 7 | from common.functions import sigmoid, softmax 8 | 9 | 10 | def get_data(): 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False) 12 | return x_test, t_test 13 | 14 | 15 | def init_network(): 16 | with open("sample_weight.pkl", 'rb') as f: 17 | network = pickle.load(f) 18 | return network 19 | 20 | 21 | def predict(network, x): 22 | W1, W2, W3 = network['W1'], network['W2'], network['W3'] 23 | b1, b2, b3 = network['b1'], network['b2'], network['b3'] 24 | 25 | a1 = np.dot(x, W1) + b1 26 | z1 = sigmoid(a1) 27 | a2 = np.dot(z1, W2) + b2 28 | z2 = sigmoid(a2) 29 | a3 = np.dot(z2, W3) + b3 30 | y = softmax(a3) 31 | 32 | return y 33 | 34 | 35 | x, t = get_data() 36 | network = init_network() 37 | accuracy_cnt = 0 38 | for i in range(len(x)): 39 | y = predict(network, x[i]) 40 | p= np.argmax(y) # 获取概率最高的元素的索引 41 | if p == t[i]: 42 | accuracy_cnt += 1 43 | 44 | print("Accuracy:" + str(float(accuracy_cnt) / len(x))) -------------------------------------------------------------------------------- /ch03/neuralnet_mnist_batch.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import pickle 6 | from dataset.mnist import load_mnist 7 | from common.functions import sigmoid, softmax 8 | 9 | 10 | def get_data(): 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, flatten=True, one_hot_label=False) 12 | return x_test, t_test 13 | 14 | 15 | def init_network(): 16 | with open("sample_weight.pkl", 'rb') as f: 17 | network = pickle.load(f) 18 | return network 19 | 20 | 21 | def predict(network, x): 22 | w1, w2, w3 = network['W1'], network['W2'], network['W3'] 23 | b1, b2, b3 = network['b1'], network['b2'], network['b3'] 24 | 25 | a1 = np.dot(x, w1) + b1 26 | z1 = sigmoid(a1) 27 | a2 = np.dot(z1, w2) + b2 28 | z2 = sigmoid(a2) 29 | a3 = np.dot(z2, w3) + b3 30 | y = softmax(a3) 31 | 32 | return y 33 | 34 | 35 | x, t = get_data() 36 | network = init_network() 37 | 38 | batch_size = 100 # 批数量 39 | accuracy_cnt = 0 40 | 41 | for i in range(0, len(x), batch_size): 42 | x_batch = x[i:i+batch_size] 43 | y_batch = predict(network, x_batch) 44 | p = np.argmax(y_batch, axis=1) 45 | accuracy_cnt += np.sum(p == t[i:i+batch_size]) 46 | 47 | 
print("Accuracy:" + str(float(accuracy_cnt) / len(x))) 48 | -------------------------------------------------------------------------------- /common/functions.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def identity_function(x): 6 | return x 7 | 8 | 9 | def step_function(x): 10 | return np.array(x > 0, dtype=np.int) 11 | 12 | 13 | def sigmoid(x): 14 | return 1 / (1 + np.exp(-x)) 15 | 16 | 17 | def sigmoid_grad(x): 18 | return (1.0 - sigmoid(x)) * sigmoid(x) 19 | 20 | 21 | def relu(x): 22 | return np.maximum(0, x) 23 | 24 | 25 | def relu_grad(x): 26 | grad = np.zeros(x) 27 | grad[x>=0] = 1 28 | return grad 29 | 30 | 31 | def softmax(x): 32 | if x.ndim == 2: 33 | x = x.T 34 | x = x - np.max(x, axis=0) 35 | y = np.exp(x) / np.sum(np.exp(x), axis=0) 36 | return y.T 37 | 38 | x = x - np.max(x) # 溢出对策 39 | return np.exp(x) / np.sum(np.exp(x)) 40 | 41 | 42 | def mean_squared_error(y, t): 43 | return 0.5 * np.sum((y-t)**2) 44 | 45 | 46 | def cross_entropy_error(y, t): 47 | if y.ndim == 1: # 这里没有变化 48 | t = t.reshape(1, t.size) 49 | y = y.reshape(1, y.size) 50 | 51 | # 监督数据是one-hot-vector的情况下,转换为正确解标签的索引 52 | if t.size == y.size: 53 | t = t.argmax(axis=1) 54 | 55 | batch_size = y.shape[0] 56 | return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size 57 | 58 | 59 | def softmax_loss(X, t): 60 | y = softmax(X) 61 | return cross_entropy_error(y, t) 62 | -------------------------------------------------------------------------------- /common/gradient.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | def _numerical_gradient_1d(f, x): 5 | h = 1e-4 # 0.0001 6 | grad = np.zeros_like(x) 7 | 8 | for idx in range(x.size): 9 | tmp_val = x[idx] 10 | x[idx] = float(tmp_val) + h 11 | fxh1 = f(x) # f(x+h) 12 | 13 | x[idx] = tmp_val - h 14 | fxh2 = f(x) # f(x-h) 15 | grad[idx] = (fxh1 - fxh2) / (2*h) 16 | 17 | x[idx] = tmp_val # 还原值 18 | 19 | return grad 20 | 21 | 22 | def numerical_gradient_2d(f, X): 23 | if X.ndim == 1: 24 | return _numerical_gradient_1d(f, X) 25 | else: 26 | grad = np.zeros_like(X) 27 | 28 | for idx, x in enumerate(X): 29 | grad[idx] = _numerical_gradient_1d(f, x) 30 | 31 | return grad 32 | 33 | 34 | def numerical_gradient(f, x): 35 | h = 1e-4 # 0.0001 36 | grad = np.zeros_like(x) 37 | 38 | it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite']) 39 | while not it.finished: 40 | idx = it.multi_index 41 | tmp_val = x[idx] 42 | x[idx] = float(tmp_val) + h 43 | fxh1 = f(x) # f(x+h) 44 | 45 | x[idx] = tmp_val - h 46 | fxh2 = f(x) # f(x-h) 47 | grad[idx] = (fxh1 - fxh2) / (2*h) 48 | 49 | x[idx] = tmp_val # 还原值 50 | it.iternext() 51 | 52 | return grad -------------------------------------------------------------------------------- /ch05/train_neuralnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) 4 | 5 | import numpy as np 6 | from dataset.mnist import load_mnist 7 | from two_layer_net import TwoLayerNet 8 | 9 | # 读入数据 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 11 | 12 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 13 | 14 | iters_num = 10000 15 | train_size = x_train.shape[0] 16 | batch_size = 100 17 | learning_rate = 0.1 18 | 19 | train_loss_list = [] 20 | train_acc_list = [] 21 | test_acc_list = [] 22 | 23 | 
iter_per_epoch = max(train_size / batch_size, 1) 24 | 25 | for i in range(iters_num): 26 | batch_mask = np.random.choice(train_size, batch_size) 27 | x_batch = x_train[batch_mask] 28 | t_batch = t_train[batch_mask] 29 | 30 | # 梯度 31 | #grad = network.numerical_gradient(x_batch, t_batch) 32 | grad = network.gradient(x_batch, t_batch) 33 | 34 | # 更新 35 | for key in ('W1', 'b1', 'W2', 'b2'): 36 | network.params[key] -= learning_rate * grad[key] 37 | 38 | loss = network.loss(x_batch, t_batch) 39 | train_loss_list.append(loss) 40 | 41 | if i % iter_per_epoch == 0: 42 | train_acc = network.accuracy(x_train, t_train) 43 | test_acc = network.accuracy(x_test, t_test) 44 | train_acc_list.append(train_acc) 45 | test_acc_list.append(test_acc) 46 | print(train_acc, test_acc) 47 | -------------------------------------------------------------------------------- /ch06/weight_init_activation_histogram.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | def sigmoid(x): 7 | return 1 / (1 + np.exp(-x)) 8 | 9 | 10 | def ReLU(x): 11 | return np.maximum(0, x) 12 | 13 | 14 | def tanh(x): 15 | return np.tanh(x) 16 | 17 | input_data = np.random.randn(1000, 100) # 1000个数据 18 | node_num = 100 # 各隐藏层的节点(神经元)数 19 | hidden_layer_size = 5 # 隐藏层有5层 20 | activations = {} # 激活值的结果保存在这里 21 | 22 | x = input_data 23 | 24 | for i in range(hidden_layer_size): 25 | if i != 0: 26 | x = activations[i-1] 27 | 28 | # 改变初始值进行实验! 29 | # w = np.random.randn(node_num, node_num) * 1 30 | # w = np.random.randn(node_num, node_num) * 0.01 31 | w = np.random.randn(node_num, node_num) * np.sqrt(1.0 / node_num) # Xavier 32 | # w = np.random.randn(node_num, node_num) * np.sqrt(2.0 / node_num) # He 33 | # w = np.random.randn(node_num, node_num) * 0.0001 34 | # w = np.random.randn(node_num, node_num) * 10 35 | 36 | 37 | a = np.dot(x, w) # 38 | 39 | 40 | # 将激活函数的种类也改变,来进行实验! 
41 | # z = sigmoid(a) 42 | z = ReLU(a) 43 | # z = tanh(a) 44 | 45 | activations[i] = z 46 | 47 | # 绘制直方图 48 | for i, a in activations.items(): 49 | plt.subplot(1, len(activations), i+1) 50 | plt.title(str(i+1) + "-layer") 51 | if i != 0: plt.yticks([], []) 52 | # plt.xlim(0.1, 1) 53 | # plt.ylim(0, 7000) 54 | plt.hist(a.flatten(), 30, range=(0,1)) 55 | plt.show() 56 | -------------------------------------------------------------------------------- /ch07/train_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from simple_convnet import SimpleConvNet 8 | from common.trainer import Trainer 9 | 10 | # 读入数据 11 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 12 | 13 | # 处理花费时间较长的情况下减少数据 14 | #x_train, t_train = x_train[:5000], t_train[:5000] 15 | #x_test, t_test = x_test[:1000], t_test[:1000] 16 | 17 | max_epochs = 20 18 | 19 | network = SimpleConvNet(input_dim=(1,28,28), 20 | conv_param = {'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1}, 21 | hidden_size=100, output_size=10, weight_init_std=0.01) 22 | 23 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 24 | epochs=max_epochs, mini_batch_size=100, 25 | optimizer='Adam', optimizer_param={'lr': 0.001}, 26 | evaluate_sample_num_per_epoch=1000) 27 | trainer.train() 28 | 29 | # 保存参数 30 | network.save_params("params.pkl") 31 | print("Saved Network Parameters!") 32 | 33 | # 绘制图形 34 | markers = {'train': 'o', 'test': 's'} 35 | x = np.arange(max_epochs) 36 | plt.plot(x, trainer.train_acc_list, marker='o', label='train', markevery=2) 37 | plt.plot(x, trainer.test_acc_list, marker='s', label='test', markevery=2) 38 | plt.xlabel("epochs") 39 | plt.ylabel("accuracy") 40 | plt.ylim(0, 1.0) 41 | plt.legend(loc='lower right') 42 | plt.show() 43 | -------------------------------------------------------------------------------- /ch06/overfit_dropout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from dataset.mnist import load_mnist 8 | from common.multi_layer_net_extend import MultiLayerNetExtend 9 | from common.trainer import Trainer 10 | 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 12 | 13 | # 为了再现过拟合,减少学习数据 14 | x_train = x_train[:2000] 15 | t_train = t_train[:2000] 16 | 17 | # 设定是否使用Dropuout,以及比例 ======================== 18 | use_dropout = True # 不使用Dropout的情况下为False 19 | dropout_ratio = 0.2 20 | # ==================================================== 21 | 22 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], 23 | output_size=10, use_dropout=use_dropout, dropout_ration=dropout_ratio) 24 | trainer = Trainer(network, x_train, t_train, x_test, t_test, 25 | epochs=301, mini_batch_size=100, 26 | optimizer='sgd', optimizer_param={'lr': 0.01}, verbose=True) 27 | trainer.train() 28 | 29 | train_acc_list, test_acc_list = trainer.train_acc_list, trainer.test_acc_list 30 | 31 | # 绘制图形========== 32 | markers = {'train': 'o', 'test': 's'} 33 | x = np.arange(len(train_acc_list)) 34 | plt.plot(x, train_acc_list, marker='o', label='train', markevery=10) 35 | plt.plot(x, test_acc_list, marker='s', label='test', markevery=10) 36 | 
plt.xlabel("epochs") 37 | plt.ylabel("accuracy") 38 | plt.ylim(0, 1.0) 39 | plt.legend(loc='lower right') 40 | plt.show() -------------------------------------------------------------------------------- /ch06/optimizer_compare_naive.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from collections import OrderedDict 7 | from common.optimizer import * 8 | 9 | 10 | def f(x, y): 11 | return x**2 / 20.0 + y**2 12 | 13 | 14 | def df(x, y): 15 | return x / 10.0, 2.0*y 16 | 17 | init_pos = (-7.0, 2.0) 18 | params = {} 19 | params['x'], params['y'] = init_pos[0], init_pos[1] 20 | grads = {} 21 | grads['x'], grads['y'] = 0, 0 22 | 23 | 24 | optimizers = OrderedDict() 25 | optimizers["SGD"] = SGD(lr=0.95) 26 | optimizers["Momentum"] = Momentum(lr=0.1) 27 | optimizers["AdaGrad"] = AdaGrad(lr=1.5) 28 | optimizers["Adam"] = Adam(lr=0.3) 29 | 30 | idx = 1 31 | 32 | for key in optimizers: 33 | optimizer = optimizers[key] 34 | x_history = [] 35 | y_history = [] 36 | params['x'], params['y'] = init_pos[0], init_pos[1] 37 | 38 | for i in range(15): 39 | x_history.append(params['x']) 40 | y_history.append(params['y']) 41 | 42 | grads['x'], grads['y'] = df(params['x'], params['y']) 43 | optimizer.update(params, grads) 44 | 45 | 46 | x = np.arange(-10, 10, 0.01) 47 | y = np.arange(-5, 5, 0.01) 48 | 49 | X, Y = np.meshgrid(x, y) 50 | Z = f(X, Y) 51 | 52 | # for simple contour line 53 | mask = Z > 7 54 | Z[mask] = 0 55 | 56 | # plot 57 | plt.subplot(2, 2, idx) 58 | idx += 1 59 | plt.plot(x_history, y_history, 'o-', color="red") 60 | plt.contour(X, Y, Z) 61 | plt.ylim(-10, 10) 62 | plt.xlim(-10, 10) 63 | plt.plot(0, 0, '+') 64 | #colorbar() 65 | #spring() 66 | plt.title(key) 67 | plt.xlabel("x") 68 | plt.ylabel("y") 69 | 70 | plt.show() 71 | -------------------------------------------------------------------------------- /ch07/apply_filter.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from simple_convnet import SimpleConvNet 7 | from matplotlib.image import imread 8 | from common.layers import Convolution 9 | 10 | def filter_show(filters, nx=4, show_num=16): 11 | """ 12 | c.f. 
https://gist.github.com/aidiary/07d530d5e08011832b12#file-draw_weight-py 13 | """ 14 | FN, C, FH, FW = filters.shape 15 | ny = int(np.ceil(show_num / nx)) 16 | 17 | fig = plt.figure() 18 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05) 19 | 20 | for i in range(show_num): 21 | ax = fig.add_subplot(4, 4, i+1, xticks=[], yticks=[]) 22 | ax.imshow(filters[i, 0], cmap=plt.cm.gray_r, interpolation='nearest') 23 | 24 | 25 | network = SimpleConvNet(input_dim=(1,28,28), 26 | conv_param = {'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1}, 27 | hidden_size=100, output_size=10, weight_init_std=0.01) 28 | 29 | # 学习后的权重 30 | network.load_params("params.pkl") 31 | 32 | filter_show(network.params['W1'], 16) 33 | 34 | img = imread('../dataset/lena_gray.png') 35 | img = img.reshape(1, 1, *img.shape) 36 | 37 | fig = plt.figure() 38 | 39 | w_idx = 1 40 | 41 | for i in range(16): 42 | w = network.params['W1'][i] 43 | b = 0 # network.params['b1'][i] 44 | 45 | w = w.reshape(1, *w.shape) 46 | #b = b.reshape(1, *b.shape) 47 | conv_layer = Convolution(w, b) 48 | out = conv_layer.forward(img) 49 | out = out.reshape(out.shape[2], out.shape[3]) 50 | 51 | ax = fig.add_subplot(4, 4, i+1, xticks=[], yticks=[]) 52 | ax.imshow(out, cmap=plt.cm.gray_r, interpolation='nearest') 53 | 54 | plt.show() -------------------------------------------------------------------------------- /ch08/misclassified_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from deep_convnet import DeepConvNet 7 | from dataset.mnist import load_mnist 8 | 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(flatten=False) 11 | 12 | network = DeepConvNet() 13 | network.load_params("deep_convnet_params.pkl") 14 | 15 | print("calculating test accuracy ... 
") 16 | #sampled = 1000 17 | #x_test = x_test[:sampled] 18 | #t_test = t_test[:sampled] 19 | 20 | classified_ids = [] 21 | 22 | acc = 0.0 23 | batch_size = 100 24 | 25 | for i in range(int(x_test.shape[0] / batch_size)): 26 | tx = x_test[i*batch_size:(i+1)*batch_size] 27 | tt = t_test[i*batch_size:(i+1)*batch_size] 28 | y = network.predict(tx, train_flg=False) 29 | y = np.argmax(y, axis=1) 30 | classified_ids.append(y) 31 | acc += np.sum(y == tt) 32 | 33 | acc = acc / x_test.shape[0] 34 | print("test accuracy:" + str(acc)) 35 | 36 | classified_ids = np.array(classified_ids) 37 | classified_ids = classified_ids.flatten() 38 | 39 | max_view = 20 40 | current_view = 1 41 | 42 | fig = plt.figure() 43 | fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.2, wspace=0.2) 44 | 45 | mis_pairs = {} 46 | for i, val in enumerate(classified_ids == t_test): 47 | if not val: 48 | ax = fig.add_subplot(4, 5, current_view, xticks=[], yticks=[]) 49 | ax.imshow(x_test[i].reshape(28, 28), cmap=plt.cm.gray_r, interpolation='nearest') 50 | mis_pairs[current_view] = (t_test[i], classified_ids[i]) 51 | 52 | current_view += 1 53 | if current_view > max_view: 54 | break 55 | 56 | print("======= misclassified result =======") 57 | print("{view index: (label, inference), ...}") 58 | print(mis_pairs) 59 | 60 | plt.show() 61 | -------------------------------------------------------------------------------- /ch04/gradient_2d.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # cf.http://d.hatena.ne.jp/white_wheels/20100327/p3 3 | import numpy as np 4 | import matplotlib.pylab as plt 5 | from mpl_toolkits.mplot3d import Axes3D 6 | 7 | 8 | def _numerical_gradient_no_batch(f, x): 9 | h = 1e-4 # 0.0001 10 | grad = np.zeros_like(x) 11 | 12 | for idx in range(x.size): 13 | tmp_val = x[idx] 14 | x[idx] = float(tmp_val) + h 15 | fxh1 = f(x) # f(x+h) 16 | 17 | x[idx] = tmp_val - h 18 | fxh2 = f(x) # f(x-h) 19 | grad[idx] = (fxh1 - fxh2) / (2*h) 20 | 21 | x[idx] = tmp_val # 还原值 22 | 23 | return grad 24 | 25 | 26 | def numerical_gradient(f, X): 27 | if X.ndim == 1: 28 | return _numerical_gradient_no_batch(f, X) 29 | else: 30 | grad = np.zeros_like(X) 31 | 32 | for idx, x in enumerate(X): 33 | grad[idx] = _numerical_gradient_no_batch(f, x) 34 | 35 | return grad 36 | 37 | 38 | def function_2(x): 39 | if x.ndim == 1: 40 | return np.sum(x**2) 41 | else: 42 | return np.sum(x**2, axis=1) 43 | 44 | 45 | def tangent_line(f, x): 46 | d = numerical_gradient(f, x) 47 | print(d) 48 | y = f(x) - d*x 49 | return lambda t: d*t + y 50 | 51 | if __name__ == '__main__': 52 | x0 = np.arange(-2, 2.5, 0.25) 53 | x1 = np.arange(-2, 2.5, 0.25) 54 | X, Y = np.meshgrid(x0, x1) 55 | 56 | X = X.flatten() 57 | Y = Y.flatten() 58 | 59 | grad = numerical_gradient(function_2, np.array([X, Y]) ) 60 | 61 | plt.figure() 62 | plt.quiver(X, Y, -grad[0], -grad[1], angles="xy",color="#666666")#,headwidth=10,scale=40,color="#444444") 63 | plt.xlim([-2, 2]) 64 | plt.ylim([-2, 2]) 65 | plt.xlabel('x0') 66 | plt.ylabel('x1') 67 | plt.grid() 68 | plt.legend() 69 | plt.draw() 70 | plt.show() -------------------------------------------------------------------------------- /ch04/train_neuralnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from two_layer_net import TwoLayerNet 8 | 
9 | # 读入数据 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True) 11 | 12 | network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10) 13 | 14 | iters_num = 10000 # 适当设定循环的次数 15 | train_size = x_train.shape[0] 16 | batch_size = 100 17 | learning_rate = 0.1 18 | 19 | train_loss_list = [] 20 | train_acc_list = [] 21 | test_acc_list = [] 22 | 23 | iter_per_epoch = max(train_size / batch_size, 1) 24 | 25 | for i in range(iters_num): 26 | batch_mask = np.random.choice(train_size, batch_size) 27 | x_batch = x_train[batch_mask] 28 | t_batch = t_train[batch_mask] 29 | 30 | # 计算梯度 31 | #grad = network.numerical_gradient(x_batch, t_batch) 32 | grad = network.gradient(x_batch, t_batch) 33 | 34 | # 更新参数 35 | for key in ('W1', 'b1', 'W2', 'b2'): 36 | network.params[key] -= learning_rate * grad[key] 37 | 38 | loss = network.loss(x_batch, t_batch) 39 | train_loss_list.append(loss) 40 | 41 | if i % iter_per_epoch == 0: 42 | train_acc = network.accuracy(x_train, t_train) 43 | test_acc = network.accuracy(x_test, t_test) 44 | train_acc_list.append(train_acc) 45 | test_acc_list.append(test_acc) 46 | print("train acc, test acc | " + str(train_acc) + ", " + str(test_acc)) 47 | 48 | # 绘制图形 49 | markers = {'train': 'o', 'test': 's'} 50 | x = np.arange(len(train_acc_list)) 51 | plt.plot(x, train_acc_list, label='train acc') 52 | plt.plot(x, test_acc_list, label='test acc', linestyle='--') 53 | plt.xlabel("epochs") 54 | plt.ylabel("accuracy") 55 | plt.ylim(0, 1.0) 56 | plt.legend(loc='lower right') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /ch06/weight_init_compare.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from dataset.mnist import load_mnist 9 | from common.util import smooth_curve 10 | from common.multi_layer_net import MultiLayerNet 11 | from common.optimizer import SGD 12 | 13 | 14 | # 0:读入MNIST数据========== 15 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 16 | 17 | train_size = x_train.shape[0] 18 | batch_size = 128 19 | max_iterations = 2000 20 | 21 | 22 | # 1:进行实验的设置========== 23 | weight_init_types = {'std=0.01': 0.01, 'Xavier': 'sigmoid', 'He': 'relu'} 24 | optimizer = SGD(lr=0.01) 25 | 26 | networks = {} 27 | train_loss = {} 28 | for key, weight_type in weight_init_types.items(): 29 | networks[key] = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100], 30 | output_size=10, weight_init_std=weight_type) 31 | train_loss[key] = [] 32 | 33 | 34 | # 2:开始训练========== 35 | for i in range(max_iterations): 36 | batch_mask = np.random.choice(train_size, batch_size) 37 | x_batch = x_train[batch_mask] 38 | t_batch = t_train[batch_mask] 39 | 40 | for key in weight_init_types.keys(): 41 | grads = networks[key].gradient(x_batch, t_batch) 42 | optimizer.update(networks[key].params, grads) 43 | 44 | loss = networks[key].loss(x_batch, t_batch) 45 | train_loss[key].append(loss) 46 | 47 | if i % 100 == 0: 48 | print("===========" + "iteration:" + str(i) + "===========") 49 | for key in weight_init_types.keys(): 50 | loss = networks[key].loss(x_batch, t_batch) 51 | print(key + ":" + str(loss)) 52 | 53 | 54 | # 3.绘制图形========== 55 | markers = {'std=0.01': 'o', 'Xavier': 's', 'He': 'D'} 56 | x = np.arange(max_iterations) 57 | for key in weight_init_types.keys(): 58 | plt.plot(x, 
smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key) 59 | plt.xlabel("iterations") 60 | plt.ylabel("loss") 61 | plt.ylim(0, 2.5) 62 | plt.legend() 63 | plt.show() -------------------------------------------------------------------------------- /ch06/optimizer_compare_mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.util import smooth_curve 8 | from common.multi_layer_net import MultiLayerNet 9 | from common.optimizer import * 10 | 11 | 12 | # 0:读入MNIST数据========== 13 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 14 | 15 | train_size = x_train.shape[0] 16 | batch_size = 128 17 | max_iterations = 2000 18 | 19 | 20 | # 1:进行实验的设置========== 21 | optimizers = {} 22 | optimizers['SGD'] = SGD() 23 | optimizers['Momentum'] = Momentum() 24 | optimizers['AdaGrad'] = AdaGrad() 25 | optimizers['Adam'] = Adam() 26 | #optimizers['RMSprop'] = RMSprop() 27 | 28 | networks = {} 29 | train_loss = {} 30 | for key in optimizers.keys(): 31 | networks[key] = MultiLayerNet( 32 | input_size=784, hidden_size_list=[100, 100, 100, 100], 33 | output_size=10) 34 | train_loss[key] = [] 35 | 36 | 37 | # 2:开始训练========== 38 | for i in range(max_iterations): 39 | batch_mask = np.random.choice(train_size, batch_size) 40 | x_batch = x_train[batch_mask] 41 | t_batch = t_train[batch_mask] 42 | 43 | for key in optimizers.keys(): 44 | grads = networks[key].gradient(x_batch, t_batch) 45 | optimizers[key].update(networks[key].params, grads) 46 | 47 | loss = networks[key].loss(x_batch, t_batch) 48 | train_loss[key].append(loss) 49 | 50 | if i % 100 == 0: 51 | print( "===========" + "iteration:" + str(i) + "===========") 52 | for key in optimizers.keys(): 53 | loss = networks[key].loss(x_batch, t_batch) 54 | print(key + ":" + str(loss)) 55 | 56 | 57 | # 3.绘制图形========== 58 | markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D"} 59 | x = np.arange(max_iterations) 60 | for key in optimizers.keys(): 61 | plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key) 62 | plt.xlabel("iterations") 63 | plt.ylabel("loss") 64 | plt.ylim(0, 1) 65 | plt.legend() 66 | plt.show() 67 | -------------------------------------------------------------------------------- /ch06/overfit_weight_decay.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import sys 4 | 5 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from dataset.mnist import load_mnist 9 | from common.multi_layer_net import MultiLayerNet 10 | from common.optimizer import SGD 11 | 12 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 13 | 14 | # 为了再现过拟合,减少学习数据 15 | x_train = x_train[:300] 16 | t_train = t_train[:300] 17 | 18 | # weight decay(权值衰减)的设定 ======================= 19 | # weight_decay_lambda = 0 # 不使用权值衰减的情况 20 | weight_decay_lambda = 0.1 21 | # ==================================================== 22 | 23 | network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], output_size=10, 24 | weight_decay_lambda=weight_decay_lambda) 25 | optimizer = SGD(lr=0.01) 26 | 27 | max_epochs = 201 28 | train_size = x_train.shape[0] 29 | batch_size = 100 30 | 31 | train_loss_list = [] 32 | 
train_acc_list = [] 33 | test_acc_list = [] 34 | 35 | iter_per_epoch = max(train_size / batch_size, 1) 36 | epoch_cnt = 0 37 | 38 | for i in range(1000000000): 39 | batch_mask = np.random.choice(train_size, batch_size) 40 | x_batch = x_train[batch_mask] 41 | t_batch = t_train[batch_mask] 42 | 43 | grads = network.gradient(x_batch, t_batch) 44 | optimizer.update(network.params, grads) 45 | 46 | if i % iter_per_epoch == 0: 47 | train_acc = network.accuracy(x_train, t_train) 48 | test_acc = network.accuracy(x_test, t_test) 49 | train_acc_list.append(train_acc) 50 | test_acc_list.append(test_acc) 51 | 52 | print("epoch:" + str(epoch_cnt) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc)) 53 | 54 | epoch_cnt += 1 55 | if epoch_cnt >= max_epochs: 56 | break 57 | 58 | 59 | # 3.绘制图形========== 60 | markers = {'train': 'o', 'test': 's'} 61 | x = np.arange(max_epochs) 62 | plt.plot(x, train_acc_list, marker='o', label='train', markevery=15) 63 | plt.plot(x, test_acc_list, marker='s', label='test', markevery=15) 64 | plt.xlabel("epochs") 65 | plt.ylabel("accuracy") 66 | plt.ylim(0, 1.0) 67 | plt.legend(loc='lower right') 68 | plt.show() -------------------------------------------------------------------------------- /ch04/two_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | from common.functions import * 5 | from common.gradient import numerical_gradient 6 | 7 | 8 | class TwoLayerNet: 9 | 10 | def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01): 11 | # 初始化权重 12 | self.params = {} 13 | self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size) 14 | self.params['b1'] = np.zeros(hidden_size) 15 | self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 16 | self.params['b2'] = np.zeros(output_size) 17 | 18 | def predict(self, x): 19 | W1, W2 = self.params['W1'], self.params['W2'] 20 | b1, b2 = self.params['b1'], self.params['b2'] 21 | 22 | a1 = np.dot(x, W1) + b1 23 | z1 = sigmoid(a1) 24 | a2 = np.dot(z1, W2) + b2 25 | y = softmax(a2) 26 | 27 | return y 28 | 29 | # x:输入数据, t:监督数据 30 | def loss(self, x, t): 31 | y = self.predict(x) 32 | 33 | return cross_entropy_error(y, t) 34 | 35 | def accuracy(self, x, t): 36 | y = self.predict(x) 37 | y = np.argmax(y, axis=1) 38 | t = np.argmax(t, axis=1) 39 | 40 | accuracy = np.sum(y == t) / float(x.shape[0]) 41 | return accuracy 42 | 43 | # x:输入数据, t:监督数据 44 | def numerical_gradient(self, x, t): 45 | loss_W = lambda W: self.loss(x, t) 46 | 47 | grads = {} 48 | grads['W1'] = numerical_gradient(loss_W, self.params['W1']) 49 | grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 50 | grads['W2'] = numerical_gradient(loss_W, self.params['W2']) 51 | grads['b2'] = numerical_gradient(loss_W, self.params['b2']) 52 | 53 | return grads 54 | 55 | def gradient(self, x, t): 56 | W1, W2 = self.params['W1'], self.params['W2'] 57 | b1, b2 = self.params['b1'], self.params['b2'] 58 | grads = {} 59 | 60 | batch_num = x.shape[0] 61 | 62 | # forward 63 | a1 = np.dot(x, W1) + b1 64 | z1 = sigmoid(a1) 65 | a2 = np.dot(z1, W2) + b2 66 | y = softmax(a2) 67 | 68 | # backward 69 | dy = (y - t) / batch_num 70 | grads['W2'] = np.dot(z1.T, dy) 71 | grads['b2'] = np.sum(dy, axis=0) 72 | 73 | da1 = np.dot(dy, W2.T) 74 | dz1 = sigmoid_grad(a1) * da1 75 | grads['W1'] = np.dot(x.T, dz1) 76 | grads['b1'] = np.sum(dz1, axis=0) 77 | 78 | return grads 
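
# ——补充示例(并非书中原代码,只是一个假设性的草图)——
# 上面的 gradient() 用误差反向传播求梯度,其中 dy = (y - t) / batch_num
# 正是 softmax + 交叉熵损失对 a2 的导数。可以像 ch05/gradient_check.py 那样,
# 用数值梯度 numerical_gradient() 验证这一实现(各参数的平均差值应接近 0;
# 数值梯度的计算会比较慢):

if __name__ == '__main__':
    net = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
    x = np.random.rand(3, 784)               # 3 个随机输入
    t = np.eye(10)[np.random.choice(10, 3)]  # 3 个 one-hot 标签
    grad_numerical = net.numerical_gradient(x, t)
    grad_backprop = net.gradient(x, t)
    for key in grad_numerical.keys():
        diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
        print(key + ":" + str(diff))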
-------------------------------------------------------------------------------- /ch05/two_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from common.layers import * 6 | from common.gradient import numerical_gradient 7 | from collections import OrderedDict 8 | 9 | 10 | class TwoLayerNet: 11 | 12 | def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01): 13 | # 初始化权重 14 | self.params = {} 15 | self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size) 16 | self.params['b1'] = np.zeros(hidden_size) 17 | self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 18 | self.params['b2'] = np.zeros(output_size) 19 | 20 | # 生成层 21 | self.layers = OrderedDict() 22 | self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1']) 23 | self.layers['Relu1'] = Relu() 24 | self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2']) 25 | 26 | self.lastLayer = SoftmaxWithLoss() 27 | 28 | def predict(self, x): 29 | for layer in self.layers.values(): 30 | x = layer.forward(x) 31 | 32 | return x 33 | 34 | # x:输入数据, t:监督数据 35 | def loss(self, x, t): 36 | y = self.predict(x) 37 | return self.lastLayer.forward(y, t) 38 | 39 | def accuracy(self, x, t): 40 | y = self.predict(x) 41 | y = np.argmax(y, axis=1) 42 | if t.ndim != 1 : t = np.argmax(t, axis=1) 43 | 44 | accuracy = np.sum(y == t) / float(x.shape[0]) 45 | return accuracy 46 | 47 | # x:输入数据, t:监督数据 48 | def numerical_gradient(self, x, t): 49 | loss_W = lambda W: self.loss(x, t) 50 | 51 | grads = {} 52 | grads['W1'] = numerical_gradient(loss_W, self.params['W1']) 53 | grads['b1'] = numerical_gradient(loss_W, self.params['b1']) 54 | grads['W2'] = numerical_gradient(loss_W, self.params['W2']) 55 | grads['b2'] = numerical_gradient(loss_W, self.params['b2']) 56 | 57 | return grads 58 | 59 | def gradient(self, x, t): 60 | # forward 61 | self.loss(x, t) 62 | 63 | # backward 64 | dout = 1 65 | dout = self.lastLayer.backward(dout) 66 | 67 | layers = list(self.layers.values()) 68 | layers.reverse() 69 | for layer in layers: 70 | dout = layer.backward(dout) 71 | 72 | # 设定 73 | grads = {} 74 | grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db 75 | grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db 76 | 77 | return grads 78 | -------------------------------------------------------------------------------- /common/util.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | def smooth_curve(x): 6 | """用于使损失函数的图形变圆滑 7 | 8 | 参考:http://glowingpython.blogspot.jp/2012/02/convolution-with-numpy.html 9 | """ 10 | window_len = 11 11 | s = np.r_[x[window_len-1:0:-1], x, x[-1:-window_len:-1]] 12 | w = np.kaiser(window_len, 2) 13 | y = np.convolve(w/w.sum(), s, mode='valid') 14 | return y[5:len(y)-5] 15 | 16 | 17 | def shuffle_dataset(x, t): 18 | """打乱数据集 19 | 20 | Parameters 21 | ---------- 22 | x : 训练数据 23 | t : 监督数据 24 | 25 | Returns 26 | ------- 27 | x, t : 打乱的训练数据和监督数据 28 | """ 29 | permutation = np.random.permutation(x.shape[0]) 30 | x = x[permutation,:] if x.ndim == 2 else x[permutation,:,:,:] 31 | t = t[permutation] 32 | 33 | return x, t 34 | 35 | def conv_output_size(input_size, filter_size, stride=1, pad=0): 36 | return (input_size + 2*pad - filter_size) / stride + 1 37 | 38 | 39 | def 
im2col(input_data, filter_h, filter_w, stride=1, pad=0): 40 | """ 41 | 42 | Parameters 43 | ---------- 44 | input_data : 由(数据量, 通道, 高, 长)的4维数组构成的输入数据 45 | filter_h : 滤波器的高 46 | filter_w : 滤波器的长 47 | stride : 步幅 48 | pad : 填充 49 | 50 | Returns 51 | ------- 52 | col : 2维数组 53 | """ 54 | N, C, H, W = input_data.shape 55 | out_h = (H + 2*pad - filter_h)//stride + 1 56 | out_w = (W + 2*pad - filter_w)//stride + 1 57 | 58 | img = np.pad(input_data, [(0,0), (0,0), (pad, pad), (pad, pad)], 'constant') 59 | col = np.zeros((N, C, filter_h, filter_w, out_h, out_w)) 60 | 61 | for y in range(filter_h): 62 | y_max = y + stride*out_h 63 | for x in range(filter_w): 64 | x_max = x + stride*out_w 65 | col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride] 66 | 67 | col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1) 68 | return col 69 | 70 | 71 | def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0): 72 | """ 73 | 74 | Parameters 75 | ---------- 76 | col : 77 | input_shape : 输入数据的形状(例:(10, 1, 28, 28)) 78 | filter_h : 79 | filter_w 80 | stride 81 | pad 82 | 83 | Returns 84 | ------- 85 | 86 | """ 87 | N, C, H, W = input_shape 88 | out_h = (H + 2*pad - filter_h)//stride + 1 89 | out_w = (W + 2*pad - filter_w)//stride + 1 90 | col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2) 91 | 92 | img = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1)) 93 | for y in range(filter_h): 94 | y_max = y + stride*out_h 95 | for x in range(filter_w): 96 | x_max = x + stride*out_w 97 | img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :] 98 | 99 | return img[:, :, pad:H + pad, pad:W + pad] -------------------------------------------------------------------------------- /ch06/hyperparameter_optimization.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.multi_layer_net import MultiLayerNet 8 | from common.util import shuffle_dataset 9 | from common.trainer import Trainer 10 | 11 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) # 这里的x_test是没有用到 12 | 13 | # 为了实现高速化,减少训练数据 14 | x_train = x_train[:500] 15 | t_train = t_train[:500] 16 | 17 | # 分割验证数据 18 | validation_rate = 0.20 # 验证数据集和测试数据集不一样 19 | validation_num = int(x_train.shape[0] * validation_rate) 20 | x_train, t_train = shuffle_dataset(x_train, t_train) 21 | x_val = x_train[:validation_num] 22 | t_val = t_train[:validation_num] 23 | x_train = x_train[validation_num:] 24 | t_train = t_train[validation_num:] 25 | 26 | 27 | def __train(lr, weight_decay, epocs=50): 28 | network = MultiLayerNet(input_size=784, hidden_size_list=[100, 100, 100, 100, 100, 100], 29 | output_size=10, weight_decay_lambda=weight_decay) 30 | trainer = Trainer(network, x_train, t_train, x_val, t_val, 31 | epochs=epocs, mini_batch_size=100, 32 | optimizer='sgd', optimizer_param={'lr': lr}, verbose=False) 33 | trainer.train() 34 | 35 | return trainer.test_acc_list, trainer.train_acc_list 36 | 37 | 38 | # 超参数的随机搜索====================================== 39 | optimization_trial = 100 # 参数100次 40 | results_val = {} 41 | results_train = {} 42 | for _ in range(optimization_trial): 43 | # 指定搜索的超参数的范围=============== 44 | weight_decay = 10 ** np.random.uniform(-8, -4) 45 | lr = 10 ** np.random.uniform(-6, -2) 46 | # ================================================ 47 
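    # 10 ** np.random.uniform(a, b) samples the hyperparameters log-uniformly:
    # weight_decay is spread evenly over 1e-8 ~ 1e-4 and lr over 1e-6 ~ 1e-2,
    # so every order of magnitude gets roughly equal coverage during the search.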
| 48 | val_acc_list, train_acc_list = __train(lr, weight_decay) # 这里每次的训练和测试数据是固定的,也就是变量是lr,和权重衰减。 49 | print("val acc:" + str(val_acc_list[-1]) + " | lr:" + str(lr) + ", weight decay:" + str(weight_decay)) 50 | key = "lr:" + str(lr) + ", weight decay:" + str(weight_decay) 51 | # print('val acc 的内容',val_acc_list) 52 | results_val[key] = val_acc_list 53 | results_train[key] = train_acc_list 54 | 55 | # 绘制图形======================================================== 56 | print("=========== Hyper-Parameter Optimization Result ===========") 57 | graph_draw_num = 20 58 | col_num = 5 59 | row_num = int(np.ceil(graph_draw_num / col_num)) 60 | i = 0 61 | 62 | # acc list里面最后一个值是最后运行 一个loop的acc值,并不一定是最大的, 只是参数执行到最后的值 63 | for key, val_acc_list in sorted(results_val.items(), key=lambda x:x[1][-1], reverse=True): 64 | print("Best-" + str(i+1) + "(val acc:" + str(val_acc_list[-1]) + ") | " + key) 65 | 66 | plt.subplot(row_num, col_num, i+1) 67 | plt.title("Best-" + str(i+1)) 68 | plt.ylim(0.0, 1.0) 69 | if i % 5: plt.yticks([]) 70 | plt.xticks([]) 71 | x = np.arange(len(val_acc_list)) 72 | plt.plot(x, val_acc_list) 73 | plt.plot(x, results_train[key], "--") 74 | i += 1 75 | 76 | if i >= graph_draw_num: 77 | break 78 | 79 | plt.show() 80 | -------------------------------------------------------------------------------- /ch06/batch_norm_test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from dataset.mnist import load_mnist 7 | from common.multi_layer_net_extend import MultiLayerNetExtend 8 | from common.optimizer import SGD, Adam 9 | 10 | (x_train, t_train), (x_test, t_test) = load_mnist(normalize=True) 11 | 12 | # 减少学习数据 13 | x_train = x_train[:1000] 14 | t_train = t_train[:1000] 15 | 16 | max_epochs = 20 17 | train_size = x_train.shape[0] 18 | batch_size = 100 19 | learning_rate = 0.01 20 | 21 | 22 | def __train(weight_init_std): 23 | bn_network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10, 24 | weight_init_std=weight_init_std, use_batchnorm=True) 25 | network = MultiLayerNetExtend(input_size=784, hidden_size_list=[100, 100, 100, 100, 100], output_size=10, 26 | weight_init_std=weight_init_std) 27 | optimizer = SGD(lr=learning_rate) 28 | 29 | train_acc_list = [] 30 | bn_train_acc_list = [] 31 | 32 | iter_per_epoch = max(train_size / batch_size, 1) 33 | epoch_cnt = 0 34 | 35 | for i in range(1000000000): 36 | batch_mask = np.random.choice(train_size, batch_size) 37 | x_batch = x_train[batch_mask] 38 | t_batch = t_train[batch_mask] 39 | 40 | for _network in (bn_network, network): 41 | grads = _network.gradient(x_batch, t_batch) 42 | optimizer.update(_network.params, grads) 43 | 44 | # 只是为了显示作用,实际可以不用? 
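        # The block below runs once per epoch and only records training accuracy
        # for the plot; it does not affect the parameter updates. Both networks were
        # updated on the same mini-batch above, so their curves are directly comparable.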
45 | if i % iter_per_epoch == 0: 46 | train_acc = network.accuracy(x_train, t_train) 47 | bn_train_acc = bn_network.accuracy(x_train, t_train) 48 | train_acc_list.append(train_acc) 49 | bn_train_acc_list.append(bn_train_acc) 50 | 51 | print("epoch:" + str(epoch_cnt) + " | " + str(train_acc) + " - " + str(bn_train_acc)) 52 | 53 | epoch_cnt += 1 54 | if epoch_cnt >= max_epochs: 55 | break 56 | 57 | return train_acc_list, bn_train_acc_list 58 | 59 | 60 | # 3.绘制图形========== 61 | weight_scale_list = np.logspace(0, -4, num=16) 62 | x = np.arange(max_epochs) 63 | 64 | for i, w in enumerate(weight_scale_list): 65 | print( "============== " + str(i+1) + "/16" + " ==============") 66 | train_acc_list, bn_train_acc_list = __train(w) 67 | 68 | plt.subplot(4,4,i+1) 69 | plt.title("W:" + str(w)) 70 | if i == 15: 71 | plt.plot(x, bn_train_acc_list, label='Batch Normalization', markevery=2) 72 | plt.plot(x, train_acc_list, linestyle = "--", label='Normal(without BatchNorm)', markevery=2) 73 | else: 74 | plt.plot(x, bn_train_acc_list, markevery=2) 75 | plt.plot(x, train_acc_list, linestyle="--", markevery=2) 76 | 77 | plt.ylim(0, 1.0) 78 | if i % 4: 79 | plt.yticks([]) 80 | else: 81 | plt.ylabel("accuracy") 82 | if i < 12: 83 | plt.xticks([]) 84 | else: 85 | plt.xlabel("epochs") 86 | plt.legend(loc='lower right') 87 | 88 | plt.show() -------------------------------------------------------------------------------- /common/trainer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from common.optimizer import * 6 | 7 | class Trainer: 8 | """进行神经网络的训练的类 9 | """ 10 | def __init__(self, network, x_train, t_train, x_test, t_test, 11 | epochs=20, mini_batch_size=100, 12 | optimizer='SGD', optimizer_param={'lr':0.01}, 13 | evaluate_sample_num_per_epoch=None, verbose=True): 14 | self.network = network 15 | self.verbose = verbose 16 | self.x_train = x_train 17 | self.t_train = t_train 18 | self.x_test = x_test 19 | self.t_test = t_test 20 | self.epochs = epochs 21 | self.batch_size = mini_batch_size 22 | self.evaluate_sample_num_per_epoch = evaluate_sample_num_per_epoch 23 | 24 | # optimzer 25 | optimizer_class_dict = {'sgd':SGD, 'momentum':Momentum, 'nesterov':Nesterov, 26 | 'adagrad':AdaGrad, 'rmsprpo':RMSprop, 'adam':Adam} 27 | self.optimizer = optimizer_class_dict[optimizer.lower()](**optimizer_param) 28 | 29 | self.train_size = x_train.shape[0] 30 | self.iter_per_epoch = max(self.train_size / mini_batch_size, 1) 31 | self.max_iter = int(epochs * self.iter_per_epoch) 32 | self.current_iter = 0 33 | self.current_epoch = 0 34 | 35 | self.train_loss_list = [] 36 | self.train_acc_list = [] 37 | self.test_acc_list = [] 38 | 39 | def train_step(self): 40 | batch_mask = np.random.choice(self.train_size, self.batch_size) 41 | x_batch = self.x_train[batch_mask] 42 | t_batch = self.t_train[batch_mask] 43 | 44 | grads = self.network.gradient(x_batch, t_batch) 45 | self.optimizer.update(self.network.params, grads) 46 | 47 | loss = self.network.loss(x_batch, t_batch) 48 | self.train_loss_list.append(loss) 49 | if self.verbose: print("train loss:" + str(loss)) 50 | 51 | if self.current_iter % self.iter_per_epoch == 0: 52 | self.current_epoch += 1 53 | 54 | x_train_sample, t_train_sample = self.x_train, self.t_train 55 | x_test_sample, t_test_sample = self.x_test, self.t_test 56 | if not self.evaluate_sample_num_per_epoch is None: 57 | t = self.evaluate_sample_num_per_epoch 58 | 
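                # When evaluate_sample_num_per_epoch is set, only the first t samples are
                # used for the per-epoch accuracy check, which keeps evaluation cheap on
                # large datasets; otherwise the full train/test sets assigned above are used.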
x_train_sample, t_train_sample = self.x_train[:t], self.t_train[:t] 59 | x_test_sample, t_test_sample = self.x_test[:t], self.t_test[:t] 60 | 61 | train_acc = self.network.accuracy(x_train_sample, t_train_sample) 62 | test_acc = self.network.accuracy(x_test_sample, t_test_sample) 63 | self.train_acc_list.append(train_acc) 64 | self.test_acc_list.append(test_acc) 65 | 66 | if self.verbose: print("=== epoch:" + str(self.current_epoch) + ", train acc:" + str(train_acc) + ", test acc:" + str(test_acc) + " ===") 67 | self.current_iter += 1 68 | 69 | def train(self): 70 | for i in range(self.max_iter): 71 | self.train_step() # 不断更新权重值W 72 | 73 | test_acc = self.network.accuracy(self.x_test, self.t_test) 74 | 75 | if self.verbose: 76 | print("=============== Final Test Accuracy ===============") 77 | print("test acc:" + str(test_acc)) 78 | 79 | -------------------------------------------------------------------------------- /dataset/mnist.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | try: 3 | import urllib.request 4 | except ImportError: 5 | raise ImportError('You should use Python 3.x') 6 | import os.path 7 | import gzip 8 | import pickle 9 | import os 10 | import numpy as np 11 | 12 | 13 | url_base = 'http://yann.lecun.com/exdb/mnist/' 14 | key_file = { 15 | 'train_img':'train-images-idx3-ubyte.gz', 16 | 'train_label':'train-labels-idx1-ubyte.gz', 17 | 'test_img':'t10k-images-idx3-ubyte.gz', 18 | 'test_label':'t10k-labels-idx1-ubyte.gz' 19 | } 20 | 21 | dataset_dir = os.path.dirname(os.path.abspath(__file__)) 22 | save_file = dataset_dir + "/mnist.pkl" 23 | 24 | train_num = 60000 25 | test_num = 10000 26 | img_dim = (1, 28, 28) 27 | img_size = 784 28 | 29 | 30 | def _download(file_name): 31 | file_path = dataset_dir + "/" + file_name 32 | 33 | if os.path.exists(file_path): 34 | return 35 | 36 | print("Downloading " + file_name + " ... 
") 37 | urllib.request.urlretrieve(url_base + file_name, file_path) 38 | print("Done") 39 | 40 | def download_mnist(): 41 | for v in key_file.values(): 42 | _download(v) 43 | 44 | def _load_label(file_name): 45 | file_path = dataset_dir + "/" + file_name 46 | 47 | print("Converting " + file_name + " to NumPy Array ...") 48 | with gzip.open(file_path, 'rb') as f: 49 | labels = np.frombuffer(f.read(), np.uint8, offset=8) 50 | print("Done") 51 | 52 | return labels 53 | 54 | def _load_img(file_name): 55 | file_path = dataset_dir + "/" + file_name 56 | 57 | print("Converting " + file_name + " to NumPy Array ...") 58 | with gzip.open(file_path, 'rb') as f: 59 | data = np.frombuffer(f.read(), np.uint8, offset=16) 60 | data = data.reshape(-1, img_size) 61 | print("Done") 62 | 63 | return data 64 | 65 | def _convert_numpy(): 66 | dataset = {} 67 | dataset['train_img'] = _load_img(key_file['train_img']) 68 | dataset['train_label'] = _load_label(key_file['train_label']) 69 | dataset['test_img'] = _load_img(key_file['test_img']) 70 | dataset['test_label'] = _load_label(key_file['test_label']) 71 | 72 | return dataset 73 | 74 | def init_mnist(): 75 | download_mnist() 76 | dataset = _convert_numpy() 77 | print("Creating pickle file ...") 78 | with open(save_file, 'wb') as f: 79 | pickle.dump(dataset, f, -1) 80 | print("Done!") 81 | 82 | def _change_one_hot_label(X): 83 | T = np.zeros((X.size, 10)) 84 | for idx, row in enumerate(T): 85 | row[X[idx]] = 1 86 | 87 | return T 88 | 89 | 90 | def load_mnist(normalize=True, flatten=True, one_hot_label=False): 91 | """读入MNIST数据集 92 | 93 | Parameters 94 | ---------- 95 | normalize : 将图像的像素值正规化为0.0~1.0 96 | one_hot_label : 97 | one_hot_label为True的情况下,标签作为one-hot数组返回 98 | one-hot数组是指[0,0,1,0,0,0,0,0,0,0]这样的数组 99 | flatten : 是否将图像展开为一维数组 100 | 101 | Returns 102 | ------- 103 | (训练图像, 训练标签), (测试图像, 测试标签) 104 | """ 105 | if not os.path.exists(save_file): 106 | init_mnist() 107 | 108 | with open(save_file, 'rb') as f: 109 | dataset = pickle.load(f) 110 | 111 | if normalize: 112 | for key in ('train_img', 'test_img'): 113 | dataset[key] = dataset[key].astype(np.float32) 114 | dataset[key] /= 255.0 115 | 116 | if one_hot_label: 117 | dataset['train_label'] = _change_one_hot_label(dataset['train_label']) 118 | dataset['test_label'] = _change_one_hot_label(dataset['test_label']) 119 | 120 | if not flatten: 121 | for key in ('train_img', 'test_img'): 122 | dataset[key] = dataset[key].reshape(-1, 1, 28, 28) 123 | 124 | return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label']) 125 | 126 | 127 | if __name__ == '__main__': 128 | init_mnist() 129 | -------------------------------------------------------------------------------- /common/optimizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | class SGD: 5 | 6 | """随机梯度下降法(Stochastic Gradient Descent)""" 7 | 8 | def __init__(self, lr=0.01): 9 | self.lr = lr 10 | 11 | def update(self, params, grads): 12 | for key in params.keys(): 13 | params[key] -= self.lr * grads[key] 14 | 15 | 16 | class Momentum: 17 | 18 | """Momentum SGD""" 19 | 20 | def __init__(self, lr=0.01, momentum=0.9): 21 | self.lr = lr 22 | self.momentum = momentum 23 | self.v = None 24 | 25 | def update(self, params, grads): 26 | if self.v is None: 27 | self.v = {} 28 | for key, val in params.items(): 29 | self.v[key] = np.zeros_like(val) 30 | 31 | for key in params.keys(): 32 | self.v[key] = self.momentum*self.v[key] - self.lr*grads[key] 
33 | params[key] += self.v[key] 34 | 35 | 36 | class Nesterov: 37 | 38 | """Nesterov's Accelerated Gradient (http://arxiv.org/abs/1212.0901)""" 39 | 40 | def __init__(self, lr=0.01, momentum=0.9): 41 | self.lr = lr 42 | self.momentum = momentum 43 | self.v = None 44 | 45 | def update(self, params, grads): 46 | if self.v is None: 47 | self.v = {} 48 | for key, val in params.items(): 49 | self.v[key] = np.zeros_like(val) 50 | 51 | for key in params.keys(): 52 | self.v[key] *= self.momentum 53 | self.v[key] -= self.lr * grads[key] 54 | params[key] += self.momentum * self.momentum * self.v[key] 55 | params[key] -= (1 + self.momentum) * self.lr * grads[key] 56 | 57 | 58 | class AdaGrad: 59 | 60 | """AdaGrad""" 61 | 62 | def __init__(self, lr=0.01): 63 | self.lr = lr 64 | self.h = None 65 | 66 | def update(self, params, grads): 67 | if self.h is None: 68 | self.h = {} 69 | for key, val in params.items(): 70 | self.h[key] = np.zeros_like(val) 71 | 72 | for key in params.keys(): 73 | self.h[key] += grads[key] * grads[key] 74 | params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7) 75 | 76 | 77 | class RMSprop: 78 | 79 | """RMSprop""" 80 | 81 | def __init__(self, lr=0.01, decay_rate = 0.99): 82 | self.lr = lr 83 | self.decay_rate = decay_rate 84 | self.h = None 85 | 86 | def update(self, params, grads): 87 | if self.h is None: 88 | self.h = {} 89 | for key, val in params.items(): 90 | self.h[key] = np.zeros_like(val) 91 | 92 | for key in params.keys(): 93 | self.h[key] *= self.decay_rate 94 | self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key] 95 | params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7) 96 | 97 | 98 | class Adam: 99 | 100 | """Adam (http://arxiv.org/abs/1412.6980v8)""" 101 | 102 | def __init__(self, lr=0.001, beta1=0.9, beta2=0.999): 103 | self.lr = lr 104 | self.beta1 = beta1 105 | self.beta2 = beta2 106 | self.iter = 0 107 | self.m = None 108 | self.v = None 109 | 110 | def update(self, params, grads): 111 | if self.m is None: 112 | self.m, self.v = {}, {} 113 | for key, val in params.items(): 114 | self.m[key] = np.zeros_like(val) 115 | self.v[key] = np.zeros_like(val) 116 | 117 | self.iter += 1 118 | lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter) 119 | 120 | for key in params.keys(): 121 | #self.m[key] = self.beta1*self.m[key] + (1-self.beta1)*grads[key] 122 | #self.v[key] = self.beta2*self.v[key] + (1-self.beta2)*(grads[key]**2) 123 | self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key]) 124 | self.v[key] += (1 - self.beta2) * (grads[key]**2 - self.v[key]) 125 | 126 | params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7) 127 | 128 | #unbias_m += (1 - self.beta1) * (grads[key] - self.m[key]) # correct bias 129 | #unbisa_b += (1 - self.beta2) * (grads[key]*grads[key] - self.v[key]) # correct bias 130 | #params[key] += self.lr * unbias_m / (np.sqrt(unbisa_b) + 1e-7) 131 | -------------------------------------------------------------------------------- /common/multi_layer_net.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from collections import OrderedDict 6 | from common.layers import * 7 | from common.gradient import numerical_gradient 8 | 9 | 10 | class MultiLayerNet: 11 | """全连接的多层神经网络 12 | 13 | Parameters 14 | ---------- 15 | input_size : 输入大小(MNIST的情况下为784) 16 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. 
[100, 100, 100]) 17 | output_size : 输出大小(MNIST的情况下为10) 18 | activation : 'relu' or 'sigmoid' 19 | weight_init_std : 指定权重的标准差(e.g. 0.01) 20 | 指定'relu'或'he'的情况下设定“He的初始值” 21 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 22 | weight_decay_lambda : Weight Decay(L2范数)的强度 23 | """ 24 | def __init__(self, input_size, hidden_size_list, output_size, 25 | activation='relu', weight_init_std='relu', weight_decay_lambda=0): 26 | self.input_size = input_size 27 | self.output_size = output_size 28 | self.hidden_size_list = hidden_size_list 29 | self.hidden_layer_num = len(hidden_size_list) 30 | self.weight_decay_lambda = weight_decay_lambda 31 | self.params = {} 32 | 33 | # 初始化权重 34 | self.__init_weight(weight_init_std) 35 | 36 | # 生成层 37 | activation_layer = {'sigmoid': Sigmoid, 'relu': Relu} 38 | self.layers = OrderedDict() 39 | for idx in range(1, self.hidden_layer_num+1): # 每一层都是差不多的, 40 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 41 | self.params['b' + str(idx)]) 42 | self.layers['Activation_function' + str(idx)] = activation_layer[activation]() 43 | 44 | idx = self.hidden_layer_num + 1 # 输出层 45 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 46 | self.params['b' + str(idx)]) 47 | 48 | self.last_layer = SoftmaxWithLoss() 49 | 50 | def __init_weight(self, weight_init_std): 51 | """设定权重的初始值 52 | 53 | Parameters 54 | ---------- 55 | weight_init_std : 指定权重的标准差(e.g. 0.01) 56 | 指定'relu'或'he'的情况下设定“He的初始值” 57 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 58 | """ 59 | all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size] 60 | for idx in range(1, len(all_size_list)): 61 | scale = weight_init_std 62 | if str(weight_init_std).lower() in ('relu', 'he'): 63 | scale = np.sqrt(2.0 / all_size_list[idx - 1]) # 使用ReLU的情况下推荐的初始值 64 | elif str(weight_init_std).lower() in ('sigmoid', 'xavier'): 65 | scale = np.sqrt(1.0 / all_size_list[idx - 1]) # 使用sigmoid的情况下推荐的初始值 66 | 67 | self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx]) 68 | self.params['b' + str(idx)] = np.zeros(all_size_list[idx]) 69 | 70 | def predict(self, x): 71 | for layer in self.layers.values(): 72 | x = layer.forward(x) 73 | 74 | return x 75 | 76 | def loss(self, x, t): 77 | """求损失函数 78 | 79 | Parameters 80 | ---------- 81 | x : 输入数据 82 | t : 教师标签 83 | 84 | Returns 85 | ------- 86 | 损失函数的值 87 | """ 88 | y = self.predict(x) 89 | 90 | weight_decay = 0 91 | for idx in range(1, self.hidden_layer_num + 2): 92 | W = self.params['W' + str(idx)] 93 | weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2) 94 | 95 | return self.last_layer.forward(y, t) + weight_decay 96 | 97 | def accuracy(self, x, t): 98 | y = self.predict(x) 99 | y = np.argmax(y, axis=1) 100 | if t.ndim != 1 : t = np.argmax(t, axis=1) 101 | 102 | accuracy = np.sum(y == t) / float(x.shape[0]) 103 | return accuracy 104 | 105 | def numerical_gradient(self, x, t): 106 | """求梯度(数值微分) 107 | 108 | Parameters 109 | ---------- 110 | x : 输入数据 111 | t : 教师标签 112 | 113 | Returns 114 | ------- 115 | 具有各层的梯度的字典变量 116 | grads['W1']、grads['W2']、...是各层的权重 117 | grads['b1']、grads['b2']、...是各层的偏置 118 | """ 119 | loss_W = lambda W: self.loss(x, t) 120 | 121 | grads = {} 122 | for idx in range(1, self.hidden_layer_num+2): 123 | grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)]) 124 | grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)]) 125 | 126 | return grads 127 | 128 | def gradient(self, x, t): 129 | """求梯度(误差反向传播法) 130 | 131 | 
Parameters 132 | ---------- 133 | x : 输入数据 134 | t : 教师标签 135 | 136 | Returns 137 | ------- 138 | 具有各层的梯度的字典变量 139 | grads['W1']、grads['W2']、...是各层的权重 140 | grads['b1']、grads['b2']、...是各层的偏置 141 | """ 142 | # forward 143 | self.loss(x, t) 144 | 145 | # backward 146 | dout = 1 147 | dout = self.last_layer.backward(dout) 148 | 149 | layers = list(self.layers.values()) 150 | layers.reverse() 151 | for layer in layers: 152 | dout = layer.backward(dout) 153 | 154 | # 设定 155 | grads = {} 156 | for idx in range(1, self.hidden_layer_num+2): 157 | grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.layers['Affine' + str(idx)].W 158 | grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db 159 | 160 | return grads 161 | -------------------------------------------------------------------------------- /ch07/simple_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import pickle 5 | import numpy as np 6 | from collections import OrderedDict 7 | from common.layers import * 8 | from common.gradient import numerical_gradient 9 | 10 | 11 | class SimpleConvNet: 12 | """简单的ConvNet 13 | 14 | conv - relu - pool - affine - relu - affine - softmax 15 | 16 | Parameters 17 | ---------- 18 | input_size : 输入大小(MNIST的情况下为784) 19 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. [100, 100, 100]) 20 | output_size : 输出大小(MNIST的情况下为10) 21 | activation : 'relu' or 'sigmoid' 22 | weight_init_std : 指定权重的标准差(e.g. 0.01) 23 | 指定'relu'或'he'的情况下设定“He的初始值” 24 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 25 | """ 26 | def __init__(self, input_dim=(1, 28, 28), 27 | conv_param={'filter_num':30, 'filter_size':5, 'pad':0, 'stride':1}, 28 | hidden_size=100, output_size=10, weight_init_std=0.01): 29 | filter_num = conv_param['filter_num'] 30 | filter_size = conv_param['filter_size'] 31 | filter_pad = conv_param['pad'] 32 | filter_stride = conv_param['stride'] 33 | input_size = input_dim[1] 34 | conv_output_size = (input_size - filter_size + 2*filter_pad) / filter_stride + 1 35 | pool_output_size = int(filter_num * (conv_output_size/2) * (conv_output_size/2)) 36 | 37 | # 初始化权重 38 | self.params = {} 39 | self.params['W1'] = weight_init_std * \ 40 | np.random.randn(filter_num, input_dim[0], filter_size, filter_size) 41 | self.params['b1'] = np.zeros(filter_num) 42 | self.params['W2'] = weight_init_std * \ 43 | np.random.randn(pool_output_size, hidden_size) 44 | self.params['b2'] = np.zeros(hidden_size) 45 | self.params['W3'] = weight_init_std * \ 46 | np.random.randn(hidden_size, output_size) 47 | self.params['b3'] = np.zeros(output_size) 48 | 49 | # 生成层 50 | self.layers = OrderedDict() 51 | self.layers['Conv1'] = Convolution(self.params['W1'], self.params['b1'], 52 | conv_param['stride'], conv_param['pad']) 53 | self.layers['Relu1'] = Relu() 54 | self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2) 55 | self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2']) 56 | self.layers['Relu2'] = Relu() 57 | self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3']) 58 | 59 | self.last_layer = SoftmaxWithLoss() 60 | 61 | def predict(self, x): 62 | for layer in self.layers.values(): 63 | x = layer.forward(x) 64 | 65 | return x 66 | 67 | def loss(self, x, t): 68 | """求损失函数 69 | 参数x是输入数据、t是教师标签 70 | """ 71 | y = self.predict(x) 72 | return self.last_layer.forward(y, t) 73 | 74 | def accuracy(self, x, t, batch_size=100): 75 | if t.ndim != 1 : t = 
np.argmax(t, axis=1) 76 | 77 | acc = 0.0 78 | 79 | for i in range(int(x.shape[0] / batch_size)): 80 | tx = x[i*batch_size:(i+1)*batch_size] 81 | tt = t[i*batch_size:(i+1)*batch_size] 82 | y = self.predict(tx) 83 | y = np.argmax(y, axis=1) 84 | acc += np.sum(y == tt) 85 | 86 | return acc / x.shape[0] 87 | 88 | def numerical_gradient(self, x, t): 89 | """求梯度(数值微分) 90 | 91 | Parameters 92 | ---------- 93 | x : 输入数据 94 | t : 教师标签 95 | 96 | Returns 97 | ------- 98 | 具有各层的梯度的字典变量 99 | grads['W1']、grads['W2']、...是各层的权重 100 | grads['b1']、grads['b2']、...是各层的偏置 101 | """ 102 | loss_w = lambda w: self.loss(x, t) 103 | 104 | grads = {} 105 | for idx in (1, 2, 3): 106 | grads['W' + str(idx)] = numerical_gradient(loss_w, self.params['W' + str(idx)]) 107 | grads['b' + str(idx)] = numerical_gradient(loss_w, self.params['b' + str(idx)]) 108 | 109 | return grads 110 | 111 | def gradient(self, x, t): 112 | """求梯度(误差反向传播法) 113 | 114 | Parameters 115 | ---------- 116 | x : 输入数据 117 | t : 教师标签 118 | 119 | Returns 120 | ------- 121 | 具有各层的梯度的字典变量 122 | grads['W1']、grads['W2']、...是各层的权重 123 | grads['b1']、grads['b2']、...是各层的偏置 124 | """ 125 | # forward 126 | self.loss(x, t) 127 | 128 | # backward 129 | dout = 1 130 | dout = self.last_layer.backward(dout) 131 | 132 | layers = list(self.layers.values()) 133 | layers.reverse() 134 | for layer in layers: 135 | dout = layer.backward(dout) 136 | 137 | # 设定 138 | grads = {} 139 | grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers['Conv1'].db 140 | grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers['Affine1'].db 141 | grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers['Affine2'].db 142 | 143 | return grads 144 | 145 | def save_params(self, file_name="params.pkl"): 146 | params = {} 147 | for key, val in self.params.items(): 148 | params[key] = val 149 | with open(file_name, 'wb') as f: 150 | pickle.dump(params, f) 151 | 152 | def load_params(self, file_name="params.pkl"): 153 | with open(file_name, 'rb') as f: 154 | params = pickle.load(f) 155 | for key, val in params.items(): 156 | self.params[key] = val 157 | 158 | for i, key in enumerate(['Conv1', 'Affine1', 'Affine2']): 159 | self.layers[key].W = self.params['W' + str(i+1)] 160 | self.layers[key].b = self.params['b' + str(i+1)] -------------------------------------------------------------------------------- /ch08/deep_convnet.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import pickle 5 | import numpy as np 6 | from collections import OrderedDict 7 | from common.layers import * 8 | 9 | 10 | class DeepConvNet: 11 | """识别率为99%以上的高精度的ConvNet 12 | 13 | 网络结构如下所示 14 | conv - relu - conv- relu - pool - 15 | conv - relu - conv- relu - pool - 16 | conv - relu - conv- relu - pool - 17 | affine - relu - dropout - affine - dropout - softmax 18 | """ 19 | def __init__(self, input_dim=(1, 28, 28), 20 | conv_param_1 = {'filter_num':16, 'filter_size':3, 'pad':1, 'stride':1}, 21 | conv_param_2 = {'filter_num':16, 'filter_size':3, 'pad':1, 'stride':1}, 22 | conv_param_3 = {'filter_num':32, 'filter_size':3, 'pad':1, 'stride':1}, 23 | conv_param_4 = {'filter_num':32, 'filter_size':3, 'pad':2, 'stride':1}, 24 | conv_param_5 = {'filter_num':64, 'filter_size':3, 'pad':1, 'stride':1}, 25 | conv_param_6 = {'filter_num':64, 'filter_size':3, 'pad':1, 'stride':1}, 26 | hidden_size=50, output_size=10): 27 | # 初始化权重=========== 28 | # 各层的神经元平均与前一层的几个神经元有连接(TODO:自动计算) 29 | 
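        # Fan-in of each layer: a conv neuron sees in_channels*3*3 inputs
        # (1*3*3 for conv1, 16*3*3 for conv2, ...), the first affine layer sees the
        # flattened 64-channel 4x4 pooling output (64*4*4), and the last sees hidden_size.
        # He initialization below scales each weight matrix by sqrt(2 / fan_in).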
pre_node_nums = np.array([1*3*3, 16*3*3, 16*3*3, 32*3*3, 32*3*3, 64*3*3, 64*4*4, hidden_size]) 30 | wight_init_scales = np.sqrt(2.0 / pre_node_nums) # 使用ReLU的情况下推荐的初始值 31 | 32 | self.params = {} 33 | pre_channel_num = input_dim[0] 34 | for idx, conv_param in enumerate([conv_param_1, conv_param_2, conv_param_3, conv_param_4, conv_param_5, conv_param_6]): 35 | self.params['W' + str(idx+1)] = wight_init_scales[idx] * np.random.randn(conv_param['filter_num'], pre_channel_num, conv_param['filter_size'], conv_param['filter_size']) 36 | self.params['b' + str(idx+1)] = np.zeros(conv_param['filter_num']) 37 | pre_channel_num = conv_param['filter_num'] 38 | self.params['W7'] = wight_init_scales[6] * np.random.randn(64*4*4, hidden_size) 39 | self.params['b7'] = np.zeros(hidden_size) 40 | self.params['W8'] = wight_init_scales[7] * np.random.randn(hidden_size, output_size) 41 | self.params['b8'] = np.zeros(output_size) 42 | 43 | # 生成层=========== 44 | self.layers = [] 45 | self.layers.append(Convolution(self.params['W1'], self.params['b1'], 46 | conv_param_1['stride'], conv_param_1['pad'])) 47 | self.layers.append(Relu()) 48 | self.layers.append(Convolution(self.params['W2'], self.params['b2'], 49 | conv_param_2['stride'], conv_param_2['pad'])) 50 | self.layers.append(Relu()) 51 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 52 | self.layers.append(Convolution(self.params['W3'], self.params['b3'], 53 | conv_param_3['stride'], conv_param_3['pad'])) 54 | self.layers.append(Relu()) 55 | self.layers.append(Convolution(self.params['W4'], self.params['b4'], 56 | conv_param_4['stride'], conv_param_4['pad'])) 57 | self.layers.append(Relu()) 58 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 59 | self.layers.append(Convolution(self.params['W5'], self.params['b5'], 60 | conv_param_5['stride'], conv_param_5['pad'])) 61 | self.layers.append(Relu()) 62 | self.layers.append(Convolution(self.params['W6'], self.params['b6'], 63 | conv_param_6['stride'], conv_param_6['pad'])) 64 | self.layers.append(Relu()) 65 | self.layers.append(Pooling(pool_h=2, pool_w=2, stride=2)) 66 | self.layers.append(Affine(self.params['W7'], self.params['b7'])) 67 | self.layers.append(Relu()) 68 | self.layers.append(Dropout(0.5)) 69 | self.layers.append(Affine(self.params['W8'], self.params['b8'])) 70 | self.layers.append(Dropout(0.5)) 71 | 72 | self.last_layer = SoftmaxWithLoss() 73 | 74 | def predict(self, x, train_flg=False): 75 | for layer in self.layers: 76 | if isinstance(layer, Dropout): 77 | x = layer.forward(x, train_flg) 78 | else: 79 | x = layer.forward(x) 80 | return x 81 | 82 | def loss(self, x, t): 83 | y = self.predict(x, train_flg=True) 84 | return self.last_layer.forward(y, t) 85 | 86 | def accuracy(self, x, t, batch_size=100): 87 | if t.ndim != 1 : t = np.argmax(t, axis=1) 88 | 89 | acc = 0.0 90 | 91 | for i in range(int(x.shape[0] / batch_size)): 92 | tx = x[i*batch_size:(i+1)*batch_size] 93 | tt = t[i*batch_size:(i+1)*batch_size] 94 | y = self.predict(tx, train_flg=False) 95 | y = np.argmax(y, axis=1) 96 | acc += np.sum(y == tt) 97 | 98 | return acc / x.shape[0] 99 | 100 | def gradient(self, x, t): 101 | # forward 102 | self.loss(x, t) 103 | 104 | # backward 105 | dout = 1 106 | dout = self.last_layer.backward(dout) 107 | 108 | tmp_layers = self.layers.copy() 109 | tmp_layers.reverse() 110 | for layer in tmp_layers: 111 | dout = layer.backward(dout) 112 | 113 | # 设定 114 | grads = {} 115 | for i, layer_idx in enumerate((0, 2, 5, 7, 10, 12, 15, 18)): 116 | grads['W' + str(i+1)] = 
self.layers[layer_idx].dW 117 | grads['b' + str(i+1)] = self.layers[layer_idx].db 118 | 119 | return grads 120 | 121 | def save_params(self, file_name="params.pkl"): 122 | params = {} 123 | for key, val in self.params.items(): 124 | params[key] = val 125 | with open(file_name, 'wb') as f: 126 | pickle.dump(params, f) 127 | 128 | def load_params(self, file_name="params.pkl"): 129 | with open(file_name, 'rb') as f: 130 | params = pickle.load(f) 131 | for key, val in params.items(): 132 | self.params[key] = val 133 | 134 | for i, layer_idx in enumerate((0, 2, 5, 7, 10, 12, 15, 18)): 135 | self.layers[layer_idx].W = self.params['W' + str(i+1)] 136 | self.layers[layer_idx].b = self.params['b' + str(i+1)] 137 | -------------------------------------------------------------------------------- /common/multi_layer_net_extend.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys, os 3 | sys.path.append(os.pardir) # 为了导入父目录的文件而进行的设定 4 | import numpy as np 5 | from collections import OrderedDict 6 | from common.layers import * 7 | from common.gradient import numerical_gradient 8 | 9 | class MultiLayerNetExtend: 10 | """扩展版的全连接的多层神经网络 11 | 12 | 具有Weiht Decay、Dropout、Batch Normalization的功能 13 | 14 | Parameters 15 | ---------- 16 | input_size : 输入大小(MNIST的情况下为784) 17 | hidden_size_list : 隐藏层的神经元数量的列表(e.g. [100, 100, 100]) 18 | output_size : 输出大小(MNIST的情况下为10) 19 | activation : 'relu' or 'sigmoid' 20 | weight_init_std : 指定权重的标准差(e.g. 0.01) 21 | 指定'relu'或'he'的情况下设定“He的初始值” 22 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 23 | weight_decay_lambda : Weight Decay(L2范数)的强度 24 | use_dropout: 是否使用Dropout 25 | dropout_ration : Dropout的比例 26 | use_batchNorm: 是否使用Batch Normalization 27 | """ 28 | def __init__(self, input_size, hidden_size_list, output_size, 29 | activation='relu', weight_init_std='relu', weight_decay_lambda=0, 30 | use_dropout = False, dropout_ration = 0.5, use_batchnorm=False): 31 | self.input_size = input_size 32 | self.output_size = output_size 33 | self.hidden_size_list = hidden_size_list 34 | self.hidden_layer_num = len(hidden_size_list) 35 | self.use_dropout = use_dropout 36 | self.weight_decay_lambda = weight_decay_lambda 37 | self.use_batchnorm = use_batchnorm 38 | self.params = {} 39 | 40 | # 初始化权重 41 | self.__init_weight(weight_init_std) 42 | 43 | # 生成层 44 | activation_layer = {'sigmoid': Sigmoid, 'relu': Relu} 45 | self.layers = OrderedDict() 46 | for idx in range(1, self.hidden_layer_num+1): 47 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], 48 | self.params['b' + str(idx)]) 49 | if self.use_batchnorm: 50 | self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx-1]) 51 | self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx-1]) 52 | self.layers['BatchNorm' + str(idx)] = BatchNormalization(self.params['gamma' + str(idx)], self.params['beta' + str(idx)]) 53 | 54 | self.layers['Activation_function' + str(idx)] = activation_layer[activation]() 55 | 56 | if self.use_dropout: 57 | self.layers['Dropout' + str(idx)] = Dropout(dropout_ration) 58 | 59 | idx = self.hidden_layer_num + 1 60 | self.layers['Affine' + str(idx)] = Affine(self.params['W' + str(idx)], self.params['b' + str(idx)]) 61 | 62 | self.last_layer = SoftmaxWithLoss() 63 | 64 | def __init_weight(self, weight_init_std): 65 | """设定权重的初始值 66 | 67 | Parameters 68 | ---------- 69 | weight_init_std : 指定权重的标准差(e.g. 
0.01) 70 | 指定'relu'或'he'的情况下设定“He的初始值” 71 | 指定'sigmoid'或'xavier'的情况下设定“Xavier的初始值” 72 | """ 73 | all_size_list = [self.input_size] + self.hidden_size_list + [self.output_size] 74 | for idx in range(1, len(all_size_list)): 75 | scale = weight_init_std 76 | if str(weight_init_std).lower() in ('relu', 'he'): 77 | scale = np.sqrt(2.0 / all_size_list[idx - 1]) # 使用ReLU的情况下推荐的初始值 78 | elif str(weight_init_std).lower() in ('sigmoid', 'xavier'): 79 | scale = np.sqrt(1.0 / all_size_list[idx - 1]) # 使用sigmoid的情况下推荐的初始值 80 | self.params['W' + str(idx)] = scale * np.random.randn(all_size_list[idx-1], all_size_list[idx]) 81 | self.params['b' + str(idx)] = np.zeros(all_size_list[idx]) 82 | 83 | def predict(self, x, train_flg=False): 84 | for key, layer in self.layers.items(): 85 | if "Dropout" in key or "BatchNorm" in key: 86 | x = layer.forward(x, train_flg) 87 | else: 88 | x = layer.forward(x) 89 | 90 | return x 91 | 92 | def loss(self, x, t, train_flg=False): 93 | """求损失函数 94 | 参数x是输入数据,t是教师标签 95 | """ 96 | y = self.predict(x, train_flg) 97 | 98 | weight_decay = 0 99 | for idx in range(1, self.hidden_layer_num + 2): 100 | W = self.params['W' + str(idx)] 101 | weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W**2) 102 | 103 | return self.last_layer.forward(y, t) + weight_decay 104 | 105 | def accuracy(self, X, T): 106 | Y = self.predict(X, train_flg=False) 107 | Y = np.argmax(Y, axis=1) 108 | if T.ndim != 1 : T = np.argmax(T, axis=1) 109 | 110 | accuracy = np.sum(Y == T) / float(X.shape[0]) 111 | return accuracy 112 | 113 | def numerical_gradient(self, X, T): 114 | """求梯度(数值微分) 115 | 116 | Parameters 117 | ---------- 118 | X : 输入数据 119 | T : 教师标签 120 | 121 | Returns 122 | ------- 123 | 具有各层的梯度的字典变量 124 | grads['W1']、grads['W2']、...是各层的权重 125 | grads['b1']、grads['b2']、...是各层的偏置 126 | """ 127 | loss_W = lambda W: self.loss(X, T, train_flg=True) 128 | 129 | grads = {} 130 | for idx in range(1, self.hidden_layer_num+2): 131 | grads['W' + str(idx)] = numerical_gradient(loss_W, self.params['W' + str(idx)]) 132 | grads['b' + str(idx)] = numerical_gradient(loss_W, self.params['b' + str(idx)]) 133 | 134 | if self.use_batchnorm and idx != self.hidden_layer_num+1: 135 | grads['gamma' + str(idx)] = numerical_gradient(loss_W, self.params['gamma' + str(idx)]) 136 | grads['beta' + str(idx)] = numerical_gradient(loss_W, self.params['beta' + str(idx)]) 137 | 138 | return grads 139 | 140 | def gradient(self, x, t): 141 | # forward 142 | self.loss(x, t, train_flg=True) 143 | 144 | # backward 145 | dout = 1 146 | dout = self.last_layer.backward(dout) 147 | 148 | layers = list(self.layers.values()) 149 | layers.reverse() 150 | for layer in layers: 151 | dout = layer.backward(dout) 152 | 153 | # 设定 154 | grads = {} 155 | for idx in range(1, self.hidden_layer_num+2): 156 | grads['W' + str(idx)] = self.layers['Affine' + str(idx)].dW + self.weight_decay_lambda * self.params['W' + str(idx)] 157 | grads['b' + str(idx)] = self.layers['Affine' + str(idx)].db 158 | 159 | if self.use_batchnorm and idx != self.hidden_layer_num+1: 160 | grads['gamma' + str(idx)] = self.layers['BatchNorm' + str(idx)].dgamma 161 | grads['beta' + str(idx)] = self.layers['BatchNorm' + str(idx)].dbeta 162 | 163 | return grads -------------------------------------------------------------------------------- /深度学习入门笔记.md: -------------------------------------------------------------------------------- 1 | - 我们看着真值 2 | 3 | 表这种“训练数据”,人工考虑(想到)了参数的值。而机器学习的课 4 | 5 | 题就是将这个决定参数值的工作交由计算机自动进行。 学习是确定 6 | 7 | 合适的参数的过程,而人要做的是思考感知机的构造(模型),并把 8 | 9 | 训练数据交给计算机 10 
| 11 | 12 | 13 | - 偏置和权重w1、 w2的作用是不 14 | 15 | 一样的。具体地说, w1和w2是控制输入信号的重要性的参数,而偏置是调 16 | 17 | 整神经元被激活的容易程度(输出信号为1的程度)的参数 18 | 19 | 20 | 21 | - 感知机的局限性就在于它只能表示由一条直线分割的空间。图2-8这样弯 22 | 23 | 曲的曲线无法用感知机表示 24 | 25 | 26 | 27 | - 严格地讲,应该是“单层感知机无法 28 | 29 | 表示异或门”或者“单层感知机无法分离非线性空间”。接下来,我 30 | 31 | 们将看到通过组合感知机(叠加层)就可以实现异或门 32 | 33 | 34 | 35 | - 像这样,在异或门的感知机中,工人之间不断进行零件的传送。通过这 36 | 37 | 样的结构(2层结构),感知机得以实现异或门。这可以解释为“单层感知机 38 | 39 | 无法表示的东西,通过增加一层就可以解决”。也就是说,通过叠加层(加深 40 | 41 | 层),感知机能进行更加灵活的表示 42 | 43 | 44 | 45 | - 人们一般会认为计算机内部进行的处理非常复杂,而令人惊讶的是,实 46 | 47 | 际上只需要通过与非门的组合,就能再现计算机进行的处理 48 | 49 | 50 | 51 | - sigmoid函数的平滑性对神经网络的学习具有重要意义。 52 | 53 | - 实际上,上一章介绍的感知机和接下来要介绍 54 | 55 | 的神经网络的主要区别就在于这个激活函数。 56 | 57 | 58 | 59 | - 使用线性函数时,无法发挥多层网络带来的优势。因此,为了发挥叠加层所 60 | 61 | 带来的优势,激活函数必须使用非线性函数 62 | 63 | 64 | 65 | - ReLU函数在输入大于0时,直接输出该值;在输入小于等于0时,输 66 | 67 | 出0 68 | 69 | 70 | 71 | - 在 72 | 73 | 神经网络发展的历史上, sigmoid函数很早就开始被使用了,而最近则主要 74 | 75 | 使用ReLU(Rectifed Linear Unit)函数 76 | 77 | 78 | 79 | - 输出层所用的激活函数,要根据求解问题的性质决定。一般地,回 80 | 81 | 归问题可以使用恒等函数,二元分类问题可以使用 sigmoid函数, 82 | 83 | 多元分类问题可以使用 softmax函数。关于输出层的激活函数,我 84 | 85 | 们将在下一节详细介绍 86 | 87 | 88 | 89 | - 经网络可以用在分类问题和回归问题上,不过需要根据情况改变输出 90 | 91 | 层的激活函数。一般而言,回归问题用恒等函数,分类问题用softmax函数。 92 | 93 | 94 | 95 | - 分类问题中使用的softmax函数 96 | 97 | exp(x)是表示ex的指数函数(e是纳皮尔常数2.7182 . . .)。式(3.10)表示 98 | 99 | 假设输出层共有n个神经元,计算第k个神经元的输出yk。如式(3.10)所示, 100 | 101 | softmax函数的分子是输入信号ak的指数函数,分母是所有输入信号的指数 102 | 103 | 函数的和。 104 | 105 | 106 | 107 | - 在进行 softmax 的指数函数的运算时,加上(或者减去) 108 | 109 | 某个常数并不会改变运算的结果。这里的 C  可以使用任何值,但是为了防 110 | 111 | 止溢出,一般会使用输入信号中的最大值 112 | 113 | 114 | 115 | 116 | 117 | - softmax 函数的输出是 0.0 到 1.0 之间的实数。并且,softmax 118 | 119 | 函数的输出值的总和是 1。输出总和为 1 是 softmax 函数的一个重要性质 120 | 121 | 122 | 123 | - 一般而言,神经网络只把输出值最大的神经元所对应的类别作为识别结果。 124 | 125 | 并且,即便使用 softmax 函数,输出值最大的神经元的位置也不会变。因此, 126 | 127 | 神经网络在进行分类时,输出层的 softmax 函数可以省略 128 | 129 | 130 | 131 | - 阶跃函数就像“竹筒敲石”一样,只在某个瞬间产生变化。而 sigmoid 函数, 132 | 133 | 如图 4-4 所示,不仅函数的输出(竖轴的值)是连续变化的,曲线的斜率(导数) 134 | 135 | 也是连续变化的。也就是说,sigmoid 函数的导数在任何地方都不为 0。这对 136 | 137 | 神经网络的学习非常重要。得益于这个斜率不会为 0 的性质,神经网络的学 138 | 139 | 习得以正确进行 140 | 141 | 142 | 143 | - 我们把这里讨论的有多个变量的函数的导数称为偏导数 144 | 145 | - **梯度指示的方向 146 | 147 | 是各点处的函数值减小最多的方向 A 。这是一个非常重要的性质,请一定 148 | 149 | 牢记!** 150 | 151 | 152 | 153 | - 实验结果表明,学习率过大的话,会发散成一个很大的值;反过来,学 154 | 155 | 习率过小的话,基本上没怎么更新就结束了。也就是说,设定合适的学习率 156 | 157 | 是一个很重要的问题。 158 | 159 | 160 | 161 | - 像学习率这样的参数称为超参数。这是一种和神经网络的参数(权重 162 | 163 | 和偏置)性质不同的参数。相对于神经网络的权重参数是通过训练 164 | 165 | 数据和学习算法自动获得的,学习率这样的超参数则是人工设定的。 166 | 167 | 一般来说,超参数需要尝试多个值,以便找到一种可以使学习顺利 168 | 169 | 进行的设定。 170 | 171 | 172 | 173 | - 后面我们会详细讨论权重参数的初始化,这里只需要知道,权重使用符合高斯 174 | 175 | 分 布 的 随 机 数 进 行 初 始 化,偏 置 使 用 0 进 行 初 始 化 176 | 177 | 178 | 179 | - epoch 是一个单位。一个 epoch 表示学习中所有训练数据均被使用过 180 | 181 | 一次时的更新次数 182 | 183 | 184 | 185 | - 实线表示训练数据的识别精度,虚线表示测试数据的识别精 186 | 187 | 度。如图所示,随着 epoch 的前进(学习的进行),我们发现使用训练数据和 188 | 189 | 测试数据评价的识别精度都提高了,并且,这两个识别精度基本上没有差异(两 190 | 191 | 条线基本重叠在一起)。因此,可以说这次的学习中没有发生过拟合的现象。 192 | 193 | 194 | 195 | - 数值微分虽然费时间,但是实现起来很简单。下一章中要实现的稍 196 | 197 | 微复杂一些的误差反向传播法可以高速地计算梯度 198 | 199 | 200 | 201 | - 这里的第 2 歩“从左向右进行计算”是一种正方向上的传播,简称为正 202 | 203 | 向传播 (forward propagation)。正向传播是从计算图出发点到结束点的传播。 204 | 205 | 既然有正向传播这个名称,当然也可以考虑反向(从图上看的话,就是从右向左) 206 | 207 | 的传播。实际上,这种传播称为反向传播 (backward propagation)。反向传 208 | 209 | 播将在接下来的导数计算中发挥重要作用 210 | 211 | 212 | 213 | - 几何中,仿射变换包括一次线性变换和一次平移,分别对应神经网络的加权和运算与加偏置运算 214 | 215 | 216 | 217 | - 输入数据为张量(四维数据)的情况 218 | 219 | 220 | 221 | - 数值微分的优点是实现简单,因此,一般情况下不太容易出错。而误差 222 | 223 | 反向传播法的实现很复杂,容易出错。所以,经常会比较数值微分的结果和 224 | 225 | 
误差反向传播法的结果,以确认误差反向传播法的实现是否正确。确认数值 226 | 227 | 微分求出的梯度结果和误差反向传播法求出的结果是否一致(严格地讲,是 228 | 229 | 非常相近)的操作称为梯度确认 (gradient check) 230 | 231 | 232 | 233 | - 如果我们把权重初始值全部设为 0 以减小权重的值,会怎么样呢?从结 234 | 235 | 论来说,将权重初始值设为 0 不是一个好主意。事实上,将权重初始值设为 236 | 237 | 0 的话,将无法正确进行学习。 238 | 239 | 240 | 241 | - 这里使用的 sigmoid 242 | 243 | 函数是 S 型函数,随着输出不断地靠近 0 (或者靠近 1),它的导数的值逐渐接 244 | 245 | 近 0。因此,偏向 0 和 1 的数据分布会造成反向传播中梯度的值不断变小,最 246 | 247 | 后消失。这个问题称为梯度消失 (gradient vanishing)。层次加深的深度学习 248 | 249 | 中,梯度消失的问题可能会更加严重 250 | 251 | 252 | 253 | - 各层的激活值的分布都要求有适当的广度。为什么呢?因为通过 254 | 255 | 在各层间传递多样性的数据,神经网络可以进行高效的学习。反 256 | 257 | 过来,如果传递的是有所偏向的数据,就会出现梯度消失或者“表 258 | 259 | 现力受限”的问题,导致学习可能无法顺利进行。 260 | 261 | 262 | 263 | - 机器学习中经常使用集成学习。所谓集成学习,就是让多个模型单 264 | 265 | 独进行学习,推理时再取多个模型的输出的平均值。 266 | 267 | 实验告诉我们,通过进行集成学习,神经网络的识别精度可以提高好几个百分点 268 | 269 | 270 | 271 | - 除了权重和偏置等参数,超参数 (hyper-parameter)也经 272 | 273 | 常出现。这里所说的超参数是指,比如各层的神经元数量、batch 大小、参 274 | 275 | 数更新时的学习率或权值衰减等。如果这些超参数没有设置合适的值,模型 276 | 277 | 的性能就会很差。 278 | 279 | 280 | 281 | - 不能使用测试数据评估超参数的性能。 282 | 283 | 为什么不能用测试数据评估超参数的性能呢?这是因为如果使用测试数 284 | 285 | 据调整超参数,超参数的值会对测试数据发生过拟合。换句话说,用测试数 286 | 287 | 据确认超参数的值的“好坏”,就会导致超参数的值被调整为只拟合测试数据。 288 | 289 | 这样的话,可能就会得到不能拟合其他数据、泛化能力低的模型。 290 | 291 | 292 | 293 | - 调整超参数时,必须使用超参数专用的确认数据。用于调整超参 294 | 295 | 数的数据,一般称为验证数据 (validation data)。我们使用这个验证数据来 296 | 297 | 评估超参数的好坏 298 | 299 | 300 | 301 | - 分割训练数据前,先打乱了输入数据和教师标签。这是因为数据 302 | 303 | 集的数据可能存在偏向(比如,数据从“0”到“10”按顺序排列等)。 304 | 305 | np.random.shuffle(x) 这个函数会改变x的值,重新赋值。 306 | 307 | 308 | 309 | - permutation = np.random.permutation(x.shape[0]) 返回0到x.shape[0] 的排列 310 | 311 | - 有报告 [15] 显示,在进行神经网络的超参数的最优化时,与网格搜索 312 | 313 | 等有规律的搜索相比,随机采样的搜索方式效果更好。这是因为在 314 | 315 | 多个超参数中,各个超参数对最终的识别精度的影响程度不同。 316 | 317 | 318 | 319 | - 以上就是超参数的最优化的内容,简单归纳一下,如下所示。 320 | 321 | * 步骤 0 322 | 323 | 设定超参数的范围。 324 | 325 | * 步骤 1 326 | 327 | 从设定的超参数范围中随机采样。 328 | 329 | * 步骤 2 330 | 331 | 使用步骤 1 中采样到的超参数的值进行学习,通过验证数据评估识别精 332 | 333 | 度(但是要将 epoch 设置得很小) 。 334 | 335 | * 步骤 3 336 | 337 | 重复步骤 1 和步骤 2 (100 次等),根据它们的识别精度的结果,缩小超参 338 | 339 | 数的范围。 340 | 341 | 342 | 343 | 反复进行上述操作,不断缩小超参数的范围,在缩小到一定程度时,从 344 | 345 | 该范围中选出一个超参数的值。这就是进行超参数的最优化的一种方法。 346 | 347 | 348 | 349 | * 参 数 的 更 新 方 法,除 了 SGD 之 外,还 有 Momentum、AdaGrad、 350 | 351 | Adam 等方法。 352 | 353 | * 权重初始值的赋值方法对进行正确的学习非常重要。 354 | 355 | * 作为权重初始值,Xavier 初始值、He 初始值等比较有效。 356 | 357 | * 通过使用 Batch Normalization,可以加速学习,并且对初始值变得 358 | 359 | 健壮。 360 | 361 | * 抑制过拟合的正则化技术有权值衰减、Dropout 等。 362 | 363 | * 逐渐缩小“好值”存在的范围是搜索超参数的一个有效方法 364 | 365 | 366 | 367 | - 全连接层存在什么问题呢?那就是数据的形状被“忽视”了。比如,输 368 | 369 | 入数据是图像时,图像通常是高、长、通道方向上的 3 维形状。但是,向全 370 | 371 | 连接层输入时,需要将 3 维数据拉平为 1 维数据。 372 | 373 | 374 | 375 | 图像是 3 维形状,这个形状中应该含有重要的空间信息。比如,空间上 376 | 377 | 邻近的像素为相似的值、RBG 的各个通道之间分别有密切的关联性、相距 378 | 379 | 较远的像素之间没有什么关联等,3 维形状中可能隐藏有值得提取的本质模 380 | 381 | 式。但是,因为全连接层会忽视形状,将全部的输入数据作为相同的神经元 382 | 383 | (同一维度的神经元)处理,所以无法利用与形状相关的信息。 384 | 385 | 386 | 387 | - 通过填充,大小为 (4, 4) 的输入数据变成了 (6, 6) 的形状。 388 | 389 | 然后,应用大小为 (3, 3) 的滤波器,生成了大小为 (4, 4) 的输出数据。这个例 390 | 391 | 子中将填充设成了 1,不过填充的值也可以设置成 2、 3 等任意的整数。在图 7-5 392 | 393 | 的例子中,如果将填充设为 2,则输入数据的大小变为 (8, 8);如果将填充设 394 | 395 | 为 3,则大小变为 (10, 10)。 396 | 397 | 这个填充是指 填充的个数。 填充的内容为0 398 | 399 | 400 | 401 | - 使用填充主要是为了调整输出的大小。比如,对大小为 (4, 4) 的输入 402 | 403 | 数据应用 (3, 3) 的滤波器时,输出大小变为 (2, 2),相当于输出大小 404 | 405 | 比输入大小缩小了 2 个元素。这在反复进行多次卷积运算的深度网 406 | 407 | 络中会成为问题。为什么呢?因为如果每次进行卷积运算都会缩小 408 | 409 | 空间,那么在某个时刻输出大小就有可能变为 1,导致无法再应用 410 | 411 | 卷积运算。为了避免出现这样的情况,就要使用填充。在刚才的例 412 | 413 | 子中,将填充的幅度设为 1,那么相对于输入大小 (4, 4),输出大小 414 | 415 | 也保持为原来的 (4, 4)。因此,卷积运算就可以在保持空间大小不变 416 | 417 | 
的情况下将数据传给下一层。 418 | 419 | 420 | 421 | 422 | 423 | - 在 3 维数据的卷积运算中,输入数据和滤波器的通道数 424 | 425 | 要设为相同的值。在这个例子中,输入数据和滤波器的通道数一致,均为 3。 426 | 427 | 滤波器大小可以设定为任意值(不过,每个通道的滤波器大小要全部相同)。 428 | 429 | 这个例子中滤波器大小为 (3, 3),但也可以设定为 (2, 2)、(1, 1)、(5, 5) 等任 430 | 431 | 意值。再强调一下,通道数只能设定为和输入数据的通道数相同的值(本例 432 | 433 | 中为 3)。 434 | 435 | 436 | -------------------------------------------------------------------------------- /common/layers.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | from common.functions import * 4 | from common.util import im2col, col2im 5 | import time 6 | 7 | class Relu: 8 | def __init__(self): 9 | self.mask = None 10 | 11 | def forward(self, x): 12 | self.mask = (x <= 0) 13 | out = x.copy() 14 | out[self.mask] = 0 15 | 16 | return out 17 | 18 | def backward(self, dout): 19 | dout[self.mask] = 0 20 | dx = dout 21 | 22 | return dx 23 | 24 | 25 | class Sigmoid: 26 | def __init__(self): 27 | self.out = None 28 | 29 | def forward(self, x): 30 | out = sigmoid(x) 31 | self.out = out 32 | return out 33 | 34 | def backward(self, dout): 35 | dx = dout * (1.0 - self.out) * self.out 36 | 37 | return dx 38 | 39 | 40 | class Affine: 41 | def __init__(self, W, b): 42 | self.W =W 43 | self.b = b 44 | 45 | self.x = None 46 | self.original_x_shape = None 47 | # 权重和偏置参数的导数 48 | self.dW = None 49 | self.db = None 50 | 51 | def forward(self, x): 52 | # 对应张量 53 | self.original_x_shape = x.shape 54 | #print('before ---->',x.shape) 55 | x = x.reshape(x.shape[0], -1) 56 | #print('after ---->',x.shape) 57 | self.x = x 58 | 59 | out = np.dot(self.x, self.W) + self.b 60 | 61 | return out 62 | 63 | def backward(self, dout): 64 | dx = np.dot(dout, self.W.T) 65 | self.dW = np.dot(self.x.T, dout) 66 | self.db = np.sum(dout, axis=0) 67 | 68 | dx = dx.reshape(*self.original_x_shape) # 还原输入数据的形状(对应张量) 69 | return dx 70 | 71 | 72 | class SoftmaxWithLoss: 73 | def __init__(self): 74 | self.loss = None 75 | self.y = None # softmax的输出 76 | self.t = None # 监督数据 77 | 78 | def forward(self, x, t): 79 | self.t = t 80 | self.y = softmax(x) 81 | self.loss = cross_entropy_error(self.y, self.t) 82 | 83 | return self.loss 84 | 85 | def backward(self, dout=1): 86 | batch_size = self.t.shape[0] 87 | if self.t.size == self.y.size: # 监督数据是one-hot-vector的情况 88 | dx = (self.y - self.t) / batch_size 89 | else: 90 | dx = self.y.copy() 91 | dx[np.arange(batch_size), self.t] -= 1 92 | dx = dx / batch_size 93 | 94 | return dx 95 | 96 | 97 | class Dropout: 98 | """ 99 | http://arxiv.org/abs/1207.0580 100 | """ 101 | def __init__(self, dropout_ratio=0.5): 102 | self.dropout_ratio = dropout_ratio 103 | self.mask = None 104 | 105 | def forward(self, x, train_flg=True): 106 | if train_flg: 107 | self.mask = np.random.rand(*x.shape) > self.dropout_ratio 108 | return x * self.mask 109 | else: 110 | return x * (1.0 - self.dropout_ratio) 111 | 112 | def backward(self, dout): 113 | return dout * self.mask 114 | 115 | # 归一化层 116 | class BatchNormalization: 117 | """ 118 | http://arxiv.org/abs/1502.03167 119 | """ 120 | def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None): 121 | self.gamma = gamma 122 | self.beta = beta 123 | self.momentum = momentum 124 | self.input_shape = None # Conv层的情况下为4维,全连接层的情况下为2维 125 | 126 | # 测试时使用的平均值和方差 127 | self.running_mean = running_mean 128 | self.running_var = running_var 129 | 130 | # backward时使用的中间数据 131 | self.batch_size = None 132 | self.xc = None 133 | self.std = None 134 | self.dgamma = None 135 | self.dbeta 
= None 136 | 137 | def forward(self, x, train_flg=True): 138 | self.input_shape = x.shape 139 | if x.ndim != 2: 140 | N, C, H, W = x.shape 141 | x = x.reshape(N, -1) 142 | 143 | out = self.__forward(x, train_flg) 144 | 145 | return out.reshape(*self.input_shape) 146 | 147 | def __forward(self, x, train_flg): 148 | if self.running_mean is None: 149 | N, D = x.shape 150 | self.running_mean = np.zeros(D) 151 | self.running_var = np.zeros(D) 152 | 153 | if train_flg: 154 | mu = x.mean(axis=0) 155 | xc = x - mu 156 | var = np.mean(xc**2, axis=0) 157 | std = np.sqrt(var + 10e-7) 158 | xn = xc / std 159 | 160 | self.batch_size = x.shape[0] 161 | self.xc = xc 162 | self.xn = xn 163 | self.std = std 164 | self.running_mean = self.momentum * self.running_mean + (1-self.momentum) * mu 165 | self.running_var = self.momentum * self.running_var + (1-self.momentum) * var 166 | else: 167 | xc = x - self.running_mean 168 | xn = xc / ((np.sqrt(self.running_var + 10e-7))) 169 | 170 | out = self.gamma * xn + self.beta 171 | return out 172 | 173 | def backward(self, dout): 174 | if dout.ndim != 2: 175 | N, C, H, W = dout.shape 176 | dout = dout.reshape(N, -1) 177 | 178 | dx = self.__backward(dout) 179 | 180 | dx = dx.reshape(*self.input_shape) 181 | return dx 182 | 183 | def __backward(self, dout): 184 | dbeta = dout.sum(axis=0) 185 | dgamma = np.sum(self.xn * dout, axis=0) 186 | dxn = self.gamma * dout 187 | dxc = dxn / self.std 188 | dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0) 189 | dvar = 0.5 * dstd / self.std 190 | dxc += (2.0 / self.batch_size) * self.xc * dvar 191 | dmu = np.sum(dxc, axis=0) 192 | dx = dxc - dmu / self.batch_size 193 | 194 | self.dgamma = dgamma 195 | self.dbeta = dbeta 196 | 197 | return dx 198 | 199 | 200 | class Convolution: 201 | def __init__(self, W, b, stride=1, pad=0): 202 | self.W = W 203 | self.b = b 204 | self.stride = stride 205 | self.pad = pad 206 | 207 | # 中间数据(backward时使用) 208 | self.x = None 209 | self.col = None 210 | self.col_W = None 211 | 212 | # 权重和偏置参数的梯度 213 | self.dW = None 214 | self.db = None 215 | 216 | def forward(self, x): 217 | FN, C, FH, FW = self.W.shape 218 | N, C, H, W = x.shape 219 | out_h = 1 + int((H + 2*self.pad - FH) / self.stride) 220 | out_w = 1 + int((W + 2*self.pad - FW) / self.stride) 221 | 222 | col = im2col(x, FH, FW, self.stride, self.pad) 223 | col_W = self.W.reshape(FN, -1).T 224 | 225 | out = np.dot(col, col_W) + self.b 226 | out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2) 227 | 228 | self.x = x 229 | self.col = col 230 | self.col_W = col_W 231 | 232 | return out 233 | 234 | def backward(self, dout): 235 | FN, C, FH, FW = self.W.shape 236 | dout = dout.transpose(0,2,3,1).reshape(-1, FN) 237 | 238 | self.db = np.sum(dout, axis=0) 239 | self.dW = np.dot(self.col.T, dout) 240 | self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW) 241 | 242 | dcol = np.dot(dout, self.col_W.T) 243 | dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad) 244 | 245 | return dx 246 | 247 | 248 | class Pooling: 249 | def __init__(self, pool_h, pool_w, stride=1, pad=0): 250 | self.pool_h = pool_h 251 | self.pool_w = pool_w 252 | self.stride = stride 253 | self.pad = pad 254 | 255 | self.x = None 256 | self.arg_max = None 257 | 258 | def forward(self, x): 259 | N, C, H, W = x.shape 260 | out_h = int(1 + (H - self.pool_h) / self.stride) 261 | out_w = int(1 + (W - self.pool_w) / self.stride) 262 | 263 | col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad) 264 | col = col.reshape(-1, self.pool_h*self.pool_w) 265 
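        # After the reshape, each row of col holds one pool_h*pool_w window, so the
        # row-wise max below is the pooling result; arg_max remembers which element
        # won, and backward() uses it to route dout back to that position.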
| 266 | arg_max = np.argmax(col, axis=1) 267 | out = np.max(col, axis=1) 268 | out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2) 269 | 270 | self.x = x 271 | self.arg_max = arg_max 272 | 273 | return out 274 | 275 | def backward(self, dout): 276 | dout = dout.transpose(0, 2, 3, 1) 277 | 278 | pool_size = self.pool_h * self.pool_w 279 | dmax = np.zeros((dout.size, pool_size)) 280 | dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten() 281 | dmax = dmax.reshape(dout.shape + (pool_size,)) 282 | 283 | dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1) 284 | dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad) 285 | 286 | return dx 287 | --------------------------------------------------------------------------------
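Note: the Convolution and Pooling layers above are both built on im2col/col2im. A quick sanity check of the forward shapes they produce is sketched below; the input size, filter count, and stride/pad values are arbitrary choices for illustration, and the script is assumed to be run from one of the chapter directories so that the common package is importable.

# Shape check for Convolution and Pooling on random data.
import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import Convolution, Pooling

x = np.random.randn(2, 3, 28, 28)           # (N, C, H, W)
W = 0.01 * np.random.randn(5, 3, 3, 3)      # 5 filters, each 3 channels x 3 x 3
b = np.zeros(5)

conv = Convolution(W, b, stride=1, pad=1)   # pad=1 keeps the 28x28 spatial size
out = conv.forward(x)
print(out.shape)                            # -> (2, 5, 28, 28)

pool = Pooling(pool_h=2, pool_w=2, stride=2)
print(pool.forward(out).shape)              # -> (2, 5, 14, 14)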