├── src ├── __init__.py ├── convnet │ ├── __init__.py │ ├── hyper_conv_mnist.py │ └── conv_mnist.py ├── neural │ ├── __init__.py │ ├── digit_nn.py │ ├── nn_overfit.py │ └── full_connect.py ├── rnn │ ├── __init__.py │ ├── word2vec.py │ ├── cbow.py │ ├── seq2seq.py │ ├── bigram_lstm.py │ ├── singlew_lstm.py │ ├── lstm.py │ └── embed_bigram_lstm.py ├── not_mnist │ ├── __init__.py │ ├── schedule.py │ ├── extract.py │ ├── logistic_train.py │ ├── load_data.py │ ├── img_pickle.py │ ├── merge_prune.py │ └── clean_overlap.py ├── num_stable.py └── soft_max.py ├── res ├── rnn.png ├── sgd.png ├── ipynb.png ├── relu.png ├── cbow_res.png ├── cmpcos.png ├── cnn_rnn.png ├── dropout.png ├── logistic.png ├── mem_cell.png ├── min_num.png ├── moment2.jpg ├── softmax.png ├── stride.png ├── word2vec.png ├── wxmoney.jpg ├── LSTM3-gate.png ├── RNN-rolled.png ├── SDG_param.png ├── add_layer.png ├── analogies.png ├── chain_rule.png ├── conv_lingo.png ├── gauss_init.png ├── inception.png ├── logistic2.png ├── logistic3.png ├── lstm_cell.png ├── lstm_gate.png ├── momentum1.jpg ├── num_stable.png ├── rnn_model.png ├── train_loss.png ├── vecanalogy.png ├── LSTM3-C-line.png ├── LSTM3-chain.png ├── LSTM3-focus-C.png ├── LSTM3-focus-f.png ├── LSTM3-focus-i.png ├── LSTM3-focus-o.png ├── LSTM3-var-GRU.png ├── RELU2Neural.png ├── RNN-unrolled.png ├── beam_search.png ├── conv_concept.png ├── conv_output.png ├── cross-entropy.png ├── gradient_clip.png ├── init_for_sdg.png ├── ipython_start.png ├── math_reason.png ├── normal_target.png ├── predictword.png ├── rnn_gradient.png ├── stable_linear.png ├── weight_loss.png ├── word2vec_res.png ├── 2_layer_neural.png ├── LSTM2-notation.png ├── LSTM3-SimpleRNN.png ├── LSTM3-focus-o-1.png ├── LSTM3-var-tied.png ├── avg_train_loss.png ├── deep_neural_abs.png ├── normal_optimize.png ├── train_loss_init.png ├── back_propagation.png ├── constant_derivate.png ├── early_termination.png ├── l2_regularization.png ├── linear_are_linear.png ├── linear_complexity.png ├── one_hot_encoding.png ├── LSTM3-var-peepholes.png ├── hard_scale_gradient.png ├── load_notminist_shot.png ├── RNN-shorttermdepdencies.png └── RNN-longtermdependencies.png ├── .gitignore ├── note ├── sklearn │ └── README.md ├── matplotlib │ └── README.md ├── lesson-2 │ ├── README.md │ ├── limit_linear.md │ ├── neural_network.md │ ├── dig_classifier.md │ ├── deep_network.md │ ├── neural_practical.md │ └── deep_network_practice.md ├── other.md ├── numpy │ └── README.md ├── lesson-1 │ ├── README.md │ ├── Stochastic_Optimization.md │ ├── logistic_classify.md │ └── practical.md ├── tensorflow │ └── README.md ├── hw │ └── gpu.md ├── lesson-3 │ ├── README.md │ └── practice.md └── lesson-4 │ ├── README.md │ ├── unstand_lstm.md │ └── rnn_practice.md └── README.md /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/convnet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/neural/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/not_mnist/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /res/rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/rnn.png -------------------------------------------------------------------------------- /res/sgd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/sgd.png -------------------------------------------------------------------------------- /res/ipynb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/ipynb.png -------------------------------------------------------------------------------- /res/relu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/relu.png -------------------------------------------------------------------------------- /res/cbow_res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/cbow_res.png -------------------------------------------------------------------------------- /res/cmpcos.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/cmpcos.png -------------------------------------------------------------------------------- /res/cnn_rnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/cnn_rnn.png -------------------------------------------------------------------------------- /res/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/dropout.png -------------------------------------------------------------------------------- /res/logistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/logistic.png -------------------------------------------------------------------------------- /res/mem_cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/mem_cell.png -------------------------------------------------------------------------------- /res/min_num.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/min_num.png -------------------------------------------------------------------------------- /res/moment2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/moment2.jpg -------------------------------------------------------------------------------- /res/softmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/softmax.png -------------------------------------------------------------------------------- /res/stride.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lite/GDLnotes/master/res/stride.png -------------------------------------------------------------------------------- /res/word2vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/word2vec.png -------------------------------------------------------------------------------- /res/wxmoney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/wxmoney.jpg -------------------------------------------------------------------------------- /res/LSTM3-gate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-gate.png -------------------------------------------------------------------------------- /res/RNN-rolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/RNN-rolled.png -------------------------------------------------------------------------------- /res/SDG_param.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/SDG_param.png -------------------------------------------------------------------------------- /res/add_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/add_layer.png -------------------------------------------------------------------------------- /res/analogies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/analogies.png -------------------------------------------------------------------------------- /res/chain_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/chain_rule.png -------------------------------------------------------------------------------- /res/conv_lingo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/conv_lingo.png -------------------------------------------------------------------------------- /res/gauss_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/gauss_init.png -------------------------------------------------------------------------------- /res/inception.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/inception.png -------------------------------------------------------------------------------- /res/logistic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/logistic2.png -------------------------------------------------------------------------------- /res/logistic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/logistic3.png -------------------------------------------------------------------------------- /res/lstm_cell.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/lstm_cell.png -------------------------------------------------------------------------------- /res/lstm_gate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/lstm_gate.png -------------------------------------------------------------------------------- /res/momentum1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/momentum1.jpg -------------------------------------------------------------------------------- /res/num_stable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/num_stable.png -------------------------------------------------------------------------------- /res/rnn_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/rnn_model.png -------------------------------------------------------------------------------- /res/train_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/train_loss.png -------------------------------------------------------------------------------- /res/vecanalogy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/vecanalogy.png -------------------------------------------------------------------------------- /res/LSTM3-C-line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-C-line.png -------------------------------------------------------------------------------- /res/LSTM3-chain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-chain.png -------------------------------------------------------------------------------- /res/LSTM3-focus-C.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-focus-C.png -------------------------------------------------------------------------------- /res/LSTM3-focus-f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-focus-f.png -------------------------------------------------------------------------------- /res/LSTM3-focus-i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-focus-i.png -------------------------------------------------------------------------------- /res/LSTM3-focus-o.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-focus-o.png -------------------------------------------------------------------------------- /res/LSTM3-var-GRU.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-var-GRU.png 
-------------------------------------------------------------------------------- /res/RELU2Neural.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/RELU2Neural.png -------------------------------------------------------------------------------- /res/RNN-unrolled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/RNN-unrolled.png -------------------------------------------------------------------------------- /res/beam_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/beam_search.png -------------------------------------------------------------------------------- /res/conv_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/conv_concept.png -------------------------------------------------------------------------------- /res/conv_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/conv_output.png -------------------------------------------------------------------------------- /res/cross-entropy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/cross-entropy.png -------------------------------------------------------------------------------- /res/gradient_clip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/gradient_clip.png -------------------------------------------------------------------------------- /res/init_for_sdg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/init_for_sdg.png -------------------------------------------------------------------------------- /res/ipython_start.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/ipython_start.png -------------------------------------------------------------------------------- /res/math_reason.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/math_reason.png -------------------------------------------------------------------------------- /res/normal_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/normal_target.png -------------------------------------------------------------------------------- /res/predictword.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/predictword.png -------------------------------------------------------------------------------- /res/rnn_gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/rnn_gradient.png -------------------------------------------------------------------------------- /res/stable_linear.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/stable_linear.png -------------------------------------------------------------------------------- /res/weight_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/weight_loss.png -------------------------------------------------------------------------------- /res/word2vec_res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/word2vec_res.png -------------------------------------------------------------------------------- /res/2_layer_neural.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/2_layer_neural.png -------------------------------------------------------------------------------- /res/LSTM2-notation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM2-notation.png -------------------------------------------------------------------------------- /res/LSTM3-SimpleRNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-SimpleRNN.png -------------------------------------------------------------------------------- /res/LSTM3-focus-o-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-focus-o-1.png -------------------------------------------------------------------------------- /res/LSTM3-var-tied.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-var-tied.png -------------------------------------------------------------------------------- /res/avg_train_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/avg_train_loss.png -------------------------------------------------------------------------------- /res/deep_neural_abs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/deep_neural_abs.png -------------------------------------------------------------------------------- /res/normal_optimize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/normal_optimize.png -------------------------------------------------------------------------------- /res/train_loss_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/train_loss_init.png -------------------------------------------------------------------------------- /res/back_propagation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/back_propagation.png -------------------------------------------------------------------------------- /res/constant_derivate.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lite/GDLnotes/master/res/constant_derivate.png -------------------------------------------------------------------------------- /res/early_termination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/early_termination.png -------------------------------------------------------------------------------- /res/l2_regularization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/l2_regularization.png -------------------------------------------------------------------------------- /res/linear_are_linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/linear_are_linear.png -------------------------------------------------------------------------------- /res/linear_complexity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/linear_complexity.png -------------------------------------------------------------------------------- /res/one_hot_encoding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/one_hot_encoding.png -------------------------------------------------------------------------------- /res/LSTM3-var-peepholes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/LSTM3-var-peepholes.png -------------------------------------------------------------------------------- /res/hard_scale_gradient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/hard_scale_gradient.png -------------------------------------------------------------------------------- /res/load_notminist_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/load_notminist_shot.png -------------------------------------------------------------------------------- /res/RNN-shorttermdepdencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/RNN-shorttermdepdencies.png -------------------------------------------------------------------------------- /res/RNN-longtermdependencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lite/GDLnotes/master/res/RNN-longtermdependencies.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | src/not_mnist/notMNIST_large/ 2 | src/not_mnist/notMNIST_small/ 3 | src/not_mnist/notMNIST_large.tar.gz 4 | src/not_mnist/notMNIST_small.tar.gz 5 | .idea/ 6 | *.pickle 7 | *.zip -------------------------------------------------------------------------------- /note/sklearn/README.md: -------------------------------------------------------------------------------- 1 | # sklearn笔记 2 | 3 | > 高效方便的机器学习库 4 | 5 | - 官方文档:http://scikit-learn.org/stable/documentation.html 6 | - Example:http://scikit-learn.org/stable/auto_examples/ 7 | 8 | 
简述sklearn里我用过的一些类和函数 9 | -------------------------------------------------------------------------------- /src/num_stable.py: -------------------------------------------------------------------------------- 1 | # Numerical stability 2 | 3 | a = 1000000000 4 | for i in xrange(1000000): 5 | a += 1e-6 6 | print(a - 1000000000) 7 | 8 | a = 1 9 | for i in xrange(1000000): 10 | a += 1e-6 11 | print a - 1 -------------------------------------------------------------------------------- /note/matplotlib/README.md: -------------------------------------------------------------------------------- 1 | # matplotlib笔记 2 | > 机器学习常常需要进行数据可视化,matplotlib是python可视化最著名的库。 3 | 4 | - matplotlib [API文档](http://matplotlib.org/api/index.html) 5 | 6 | 7 | 常用方法: 8 | 9 | > pylot模块 10 | - hist 11 | - plot 12 | - show 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /note/lesson-2/README.md: -------------------------------------------------------------------------------- 1 | # Deep Neural Network 2 | 3 | - [Limit of Linear Model](limit_linear.md) 4 | - [Neural network](neural_network.md) 5 | - [神经网络实践](neural_practical.md) 6 | - 优化神经网络:[Deep Network](deep_network.md) 7 | - 防止深度神经网络过拟合 8 | - Regularization 9 | - Dropout 10 | - [深度神经网络实践](deep_network_practice.md) -------------------------------------------------------------------------------- /note/other.md: -------------------------------------------------------------------------------- 1 | # More about TensorFlow 2 | ## skflow 3 | 用tensorflow来处理训练sklearn的数据集 4 | 5 | ```python 6 | import skflow 7 | from sklearn import datasets, metrics 8 | 9 | iris = datasets.load_iris() 10 | classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3) 11 | classifier.fit(iris.data, iris.target) 12 | score = metrics.accuracy_score(iris.target, classifier.predict(iris.data)) 13 | print("Accuracy: %f" % score) 14 | ``` 15 | -------------------------------------------------------------------------------- /src/soft_max.py: -------------------------------------------------------------------------------- 1 | """Softmax.""" 2 | 3 | scores = [3.0, 1.0, 0.2] 4 | 5 | import numpy as np 6 | 7 | 8 | def softmax(x): 9 | return np.exp(x) / np.sum(np.exp(x), axis=0) 10 | 11 | 12 | print(softmax(scores)) 13 | 14 | # Plot softmax curves 15 | import matplotlib.pyplot as plt 16 | 17 | x = np.arange(-2.0, 6.0, 0.1) 18 | scores = np.vstack([x, np.ones_like(x), 0.2 * np.ones_like(x)]) 19 | 20 | plt.plot(x, softmax(scores).T, linewidth=2) 21 | plt.show() 22 | -------------------------------------------------------------------------------- /note/lesson-2/limit_linear.md: -------------------------------------------------------------------------------- 1 | # Limit of Linear Model 2 | - 实际要调整的参数很多 3 | ![](../../res/linear_complexity.png) 4 | 5 | > 如果有N个Class,K个Label,需要调整的参数就有(N+1)K个 6 | 7 | - Linear Model不能应对非线性的问题 8 | ![](../../res/linear_are_linear.png) 9 | - Linear Model的好处 10 | - GPU就是设计用于大矩阵相乘的,因此它们用来计算Linear Model非常高效 11 | - Stable:input的微小改变不会很大地影响output 12 | ![](../../res/stable_linear.png) 13 | - 求导方便:线性求导是常数 14 | ![](../../res/constant_derivate.png) 15 | - 我们想要参数函数是线性的,但整个model是非线性的 16 | - 所以需要对各个线性模型做非线性组合 17 | - 最简单的非线性组合:分段线性函数(RELU) 18 | ![](../../res/relu.png) -------------------------------------------------------------------------------- /note/lesson-2/neural_network.md: -------------------------------------------------------------------------------- 1 | # Neural network 2 | - 用一个RELU作为中介,一个Linear 
Model的输出作为其输入,其输出作为另一个Linear Model的输入,使其能够解决非线性问题 3 | 4 | ![](../../res/RELU2Neural.png) 5 | 6 | - 神经网络并不一定要完全像神经元那样工作 7 | - Chain Rule:复合函数求导规律 8 | 9 | ![](../../res/chain_rule.png) 10 | 11 | - Lots of data reuse and easy to implement(a simple data pipeline) 12 | - Back propagation 13 | 14 | ![](../../res/back_propagation.png) 15 | 16 | - 计算train_loss时,数据正向流入,计算梯度时,逆向计算 17 | - 计算梯度需要的内存和计算时间是计算train_loss的两倍 18 | 19 | - 利用上面的知识,结合lesson1中的SGD,训练一个全连接神经网络:[神经网络实践](neural_practical.md) -------------------------------------------------------------------------------- /note/numpy/README.md: -------------------------------------------------------------------------------- 1 | # numpy笔记 2 | > 机器学习常常需要fake数据,或者进行数据预处理,numpy是python科学计算的一把利器。 3 | 4 | - numpy [官方手册](http://docs.scipy.org/doc/numpy-1.10.1/genindex.html),支持字母检索 5 | 6 | 常用方法: 7 | 8 | - 生成数据: 9 | - arange: 生成一定范围内的数据 10 | - ones_like:生成与参数维度相同的数据 11 | - random模块:随机相关 12 | - np.random.shuffle:给一个ndarray做洗牌 13 | 14 | - 数学计算: 15 | - exp:自然指数 16 | - sum:求和 17 | - [numpy.linalg.norm](http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.norm.html):求模 18 | 19 | - 数据修改: 20 | - delete:从一个列表中删除 21 | - 数据格式化: 22 | - vstack:转为纵向向量 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /note/lesson-2/dig_classifier.md: -------------------------------------------------------------------------------- 1 | # 神经网络做数据分类 2 | ## 问题描述 3 | 给定两个范围在[-1, 1]之间的数字[x1, x2],求落在[-0.5, 0.5]之间的数字个数 4 | 5 | ## 思路 6 | - 构建一个神经网络,包含若干层,将一系列训练数据输入训练参数进行预测 7 | - 神经网络的每层用一个Relu(Wx+b)实现 8 | 9 | ## 问题分解 10 | - 构建数据集 11 | - 实际上,我们需要先判断x是否落在目标区间,是,则为1,不是则为0,将结果相加 12 | - 代码见train_data函数 13 | - 神经网络搭建 14 | - 思路同之前的[深度神经网络训练手写文字识别](deep_network_pratice.md) 15 | - 暂时不做优化 16 | - 仅搭建两层神经网络 17 | - 暂时不做validate,因为数据充分,每次训练都是新数据,新数据都相当于validate 18 | 19 | > 代码:[digit_nn.py](../../src/neural/digit_nn.py) 20 | 21 | ## 训练结果 22 | 仅截取最后一次结果为例 23 | ``` 24 | current first data [0.206416, 0.101028] 25 | current first predict: [0.000000, 0.000177, 0.999823] 26 | Minibatch loss at step 9980: 0.036539 27 | Minibatch accuracy: 100.0% 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /note/lesson-1/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning to Deep Learning 2 | 3 | 深度学习 4 | - 我们可以在Personal Computer上完成庞大的任务 5 | - 深度学习是一种适应于各类问题的万能药 6 | 7 | 神经网络 8 | - 神经网络出现于80年代,但当时计算机运行慢,数据集很小,神经网络不适用 9 | - 现在神经网络回来了,因为能够进行GPU计算,可用使用的数据集也变大 10 | 11 | 分类 12 | 13 | > 分类的一些讨论可以在[这个项目](https://github.com/ahangchen/GoogleML/blob/master/note/lesson-2-viz/README.md)里看到 14 | 15 | - Machine Learning不仅是Classification!但分类是机器学习的核心。 16 | - 学会分类也就学会了Detect和Rank 17 | - Detect:从复杂场景中识别某类物品 18 | - Rank:从各种链接中找到与某个关键词相关的一类链接 19 | 20 | 21 | - [Logistic Classification](logistic_classify.md) 22 | - [Logistic Classification实践](practical.md) 23 | - [Stochastic Optimization](Stochastic_Optimization.md) 24 | 25 | 26 | > general data practices to train models 27 | 28 | > 觉得得我的文章对您有帮助的话,就给个[star](https://github.com/ahangchen/GDLnotes)吧~ -------------------------------------------------------------------------------- /src/not_mnist/schedule.py: -------------------------------------------------------------------------------- 1 | from not_mnist.clean_overlap import clean 2 | from not_mnist.extract import maybe_extract 3 | from not_mnist.img_pickle import maybe_pickle, save_obj 4 | from not_mnist.load_data import maybe_download 5 | from not_mnist.logistic_train import 
load_train 6 | from not_mnist.merge_prune import merge_datasets, randomize, merge_prune 7 | 8 | train_filename = maybe_download('notMNIST_large.tar.gz', 247336696) 9 | test_filename = maybe_download('notMNIST_small.tar.gz', 8458043) 10 | 11 | train_folders = maybe_extract(train_filename) 12 | test_folders = maybe_extract(test_filename) 13 | 14 | train_datasets = maybe_pickle(train_folders, 45000) 15 | test_datasets = maybe_pickle(test_folders, 1800) 16 | 17 | train_size = 200000 18 | valid_size = 10000 19 | test_size = 10000 20 | 21 | valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets( 22 | train_datasets, train_size, valid_size) 23 | _, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size) 24 | 25 | merge_prune(train_folders, test_folders) 26 | 27 | print('Training:', train_dataset.shape, train_labels.shape) 28 | print('Validation:', valid_dataset.shape, valid_labels.shape) 29 | print('Testing:', test_dataset.shape, test_labels.shape) 30 | clean() 31 | load_train() 32 | -------------------------------------------------------------------------------- /note/lesson-2/deep_network.md: -------------------------------------------------------------------------------- 1 | # Deep Neural Network 2 | ## Current two layer neural network: 3 | 4 | ![](../../res/2_layer_neural.png) 5 | 6 | ## 优化: 7 | - 优化RELU(隐藏层), wider 8 | - 增加linear层,layer deeper 9 | ![](../../res/add_layer.png) 10 | - Performance: few parameters by deeper 11 | - 随层级变高,获得的信息越综合,越符合目标 12 | ![](../../res/deep_neural_abs.png) 13 | ## About t-model 14 | - t-model只有在有大量数据时有效 15 | - 今天我们才有高效的大数据训练方法:Better Regularization 16 | - 难以决定适应问题的神经网络的规模,因此通常选择更大的规模,并防止过拟合 17 | 18 | ## Avoid Overfit 19 | ### Early Termination 20 | - 当训练结果与验证集符合度下降时,就停止训练 21 | ![](../../res/early_termination.png) 22 | 23 | ### Regularization 24 | - 给神经网络里加一些常量,做一些限制,减少自由的参数 25 | - L2 regularization 26 | 27 | ![](../../res/l2_regularization.png) 28 | 29 | 在计算train loss时,增加一个l2 norm作为新的损失,这里需要乘一个β(Hyper parameter),调整这个新的项的值 30 | 31 | > Hyper parameter:拍脑袋参数→_→ 32 | 33 | - l2模的导数容易计算,即W本身 34 | 35 | ### DropOut 36 | 最近才出现,效果极其好 37 | - 从一个layer到另一个layer的value被称为activation 38 | - 将一个layer到另一个layer的value的中,随机地取一半的数据变为0,这其实是将一半的数据直接丢掉 39 | - 由于数据缺失,所以就强迫了神经网络学习redundant的知识,以作为损失部分的补充 40 | - 由于神经网络中总有其他部分作为损失部分的补充,所以最后的结果还是OK的 41 | - More robust and prevent overfit 42 | - 如果这种方法不能生效,那可能就要使用更大的神经网络了 43 | 44 | - 评估神经网络时,就不需要DropOut,因为需要确切的结果 45 | - 可以将所有Activation做平均,作为评估的依据 46 | - 因为我们在训练时去掉了一半的随机数据,如果要让得到Activation正确量级的平均值,就需要将没去掉的数据翻倍 47 | ![](../../res/dropout.png) -------------------------------------------------------------------------------- /src/not_mnist/extract.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import sys 5 | import tarfile 6 | 7 | import numpy as np 8 | 9 | # from six.moves.urllib.request import urlretrieve 10 | # from six.moves import cPickle as pickle 11 | 12 | num_classes = 10 13 | np.random.seed(133) 14 | 15 | 16 | def maybe_extract(filename, force=False): 17 | root = os.path.splitext(os.path.splitext(filename)[0])[0] # remove .tar.gz 18 | if os.path.isdir(root) and not force: 19 | # You may override by setting force=True. 20 | print('%s already present - Skipping extraction of %s.' % (root, filename)) 21 | else: 22 | print('Extracting data for %s. This may take a while. Please wait.' 
% root) 23 | tar = tarfile.open(filename) 24 | sys.stdout.flush() 25 | tar.extractall() 26 | tar.close() 27 | data_folders = [ 28 | os.path.join(root, d) for d in sorted(os.listdir(root)) 29 | if os.path.isdir(os.path.join(root, d))] 30 | if len(data_folders) != num_classes: 31 | raise Exception( 32 | 'Expected %d folders, one per class. Found %d instead.' % ( 33 | num_classes, len(data_folders))) 34 | print(data_folders) 35 | return data_folders 36 | 37 | if __name__ == '__main__': 38 | train_folders = maybe_extract('notMNIST_large.tar.gz') 39 | test_folders = maybe_extract('notMNIST_small.tar.gz') 40 | -------------------------------------------------------------------------------- /src/not_mnist/logistic_train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | 5 | from sklearn.linear_model import LogisticRegression 6 | 7 | from not_mnist.img_pickle import load_pickle, save_obj 8 | 9 | 10 | def load_train(): 11 | datasets = load_pickle('notMNIST_clean.pickle') 12 | train_dataset = datasets['train_dataset'] 13 | train_labels = datasets['train_labels'] 14 | valid_dataset = datasets['valid_dataset'] 15 | valid_labels = datasets['valid_labels'] 16 | 17 | classifier_name = 'classifier.pickle' 18 | 19 | if os.path.exists(classifier_name): 20 | classifier = load_pickle(classifier_name) 21 | else: 22 | classifier = LogisticRegression() 23 | classifier.fit(train_dataset.reshape(train_dataset.shape[0], -1), train_labels) 24 | save_obj(classifier_name, classifier) 25 | 26 | # simple valid 27 | valid_idx_s = 3000 28 | valid_idx_e = 3014 29 | x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1)[valid_idx_s: valid_idx_e]) 30 | print(x) 31 | print(valid_labels[valid_idx_s:valid_idx_e]) 32 | 33 | # whole valid 34 | x = classifier.predict(valid_dataset.reshape(valid_dataset.shape[0], -1)) 35 | fail_cnt = 0 36 | for i, pred in enumerate(x): 37 | if pred != valid_labels[i]: 38 | fail_cnt += 1 39 | print("success rate:" + str((1 - float(fail_cnt) / len(x)) * 100) + "%") 40 | 41 | if __name__ == '__main__': 42 | load_train() 43 | -------------------------------------------------------------------------------- /note/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow 安装教程 2 | ## Install TensorFlow 3 | 4 | 安装教程就在TensorFlow的github页上>>>[点击查看](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/get_started/os_setup.md) 5 | 6 | 按照官方的流程装就好了,这里讲一下几种方式的特点: 7 | 8 | 1. pip: 安装在全局的python解释器中,简单 9 | 10 | 2. Third party: Virtualenv, Anaconda and Docker:都能创建tensorflow独立的编译环境,但就是多了一份包 11 | 12 | 3. 
Source: 能够适应不同的python版本(比如编译一个3.5版的),但源码编译可能有许多坑 13 | 14 | - ubuntu安装时,需要注意自己的python - pip - tensorflow版本是否对应(比如是否都是2.7), 15 | - 使用sudo命令时,注意自己的环境变量是否变化(会导致pip或python命令对应的版本变化) 16 | - 具体讲一下ubuntu安装tensorflow流程: 17 | - 安装anaconda2 18 | - 确定自己终端的pip和python版本: 19 | ``` 20 | $ pip -V && python -V 21 | ``` 22 | 确认使用的是否都来自anaconda,如果不是,则应该使用类似这样的命令运行对应的pip: 23 | ``` 24 | $ /home/cwh/.conda/envs/tensorflow/bin/pip -V 25 | ``` 26 | 27 | 即最好安装到tensorflow自己的python环境里,不跟anaconda原来的环境混淆 28 | 29 | 使用sudo命令时最好也看一下版本 30 | 31 | - 使用anaconda创建一个tensorflow虚拟环境: 32 | ``` 33 | $ conda create -n tensorflow python=2.7 34 | ``` 35 | - 切换到tensorflow环境下(实际上是更换了环境变量里的pip和python),下载安装tensorflow,需要sudo权限 36 | ``` 37 | $ source activate tensorflow 38 | (tensorflow)$ sudo pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.8.0rc0-cp27-none-linux_x86_64.whl 39 | $ source deactivate 40 | ``` 41 | 注意如果安装的是gpu版本,还需要按照官网说明安装cuda和cudaCNN,具体教程看这个[视频](https://www.youtube.com/watch?v=cVWVRA8XXxs),不能科学上网的访问这个[地址](http://www.tudou.com/programs/view/MEnGrbSTui8/?bid=03&pid=02&resourceId=391713117_03_0_02) 42 | 43 | - 安装成功后就可以在tensorflow的python环境下,执行import tensorflow看看了。 44 | -------------------------------------------------------------------------------- /note/lesson-1/Stochastic_Optimization.md: -------------------------------------------------------------------------------- 1 | # Stochastic Optimization 2 | 3 | > Github工程地址:https://github.com/ahangchen/GDLnotes 4 | 5 | > 欢迎star,有问题可以到[Issue区](https://github.com/ahangchen/GDLnotes/issues)讨论 6 | 7 | > 官方教程[地址](https://classroom.udacity.com/courses/ud730/lessons/6370362152/concepts/63798118170923) 8 | 9 | > [视频](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Videos.zip)/[字幕](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Subtitles.zip)下载 10 | 11 | 12 | - 实践中大量机器学习都是通过梯度算子来求优化的 13 | - 但有一些问题,最大的问题就是,梯度很难计算 14 | - 我们要计算train loss,这需要基于整个数据集的数据做一个计算 15 | - 而计算使 train loss 下降最快的调整方向需要的时间是计算train loss本身的三倍 16 | 17 | ![](../../res/hard_scale_gradient.png) 18 | 19 | - 因此有了SGD:Stochastic Gradient Descent 20 | - 计算train loss时,只随机取一小部分数据集做为输入 21 | - 调整W和b时,调整的大小step需要比较小,因为数据集小,我们找到的不一定是对的方向 22 | - 这样也就增加了调整的次数 23 | - 但可观地减小了计算量 24 | 25 | ![](../../res/sgd.png) 26 | 27 | ## SGD的优化 28 | 29 | > 实际上SGD会使得每次寻找的方向都不是很准,因此有了这些优化 30 | 31 | - 随机的初始值 32 | 33 | ![](../../res/init_for_sdg.png) 34 | - Momentum 35 | 36 | > 考虑以前的平均调整方向来决定每一步的调整方向 37 | 38 | ![](../../res/momentum1.jpg) 39 | ![](../../res/momentum2.jpg) 40 | 41 | - Learning Rate Decay 42 | - 训练越靠近目标,步长应该越小 43 | 44 | - Parameter Hyperspace 45 | - Learning Rate(即调整的step)不是越大越好,可能有瓶颈 46 | - SGD有许多参数可以调整,所以被称为黑魔法 47 | 48 | ![](../../res/SDG_param.png) 49 | - AdaGurad 50 | - 自动执行momentum和learning rate decay 51 | - 使得SGD对参数不像原来那样敏感 52 | - 自动调整效果不如原来的好,但仍然是一个option 53 | 54 | > 觉得得我的文章对您有帮助的话,就给个[star](https://github.com/ahangchen/GDLnotes)吧~ -------------------------------------------------------------------------------- /note/hw/gpu.md: -------------------------------------------------------------------------------- 1 | # 浪潮 NF5280M3 GPU选型 2 | 3 | ## 必备条件 4 | - 支持Tensorflow(即算力超过3的NVIDIA显卡) 5 | 6 | > NVIDIA显卡算力:[link](https://developer.nvidia.com/cuda-gpus) 7 | 8 | - 可虚拟化:因为我们需要在浪潮服务器上虚拟化多台ubuntu做分布式运算,所以需要虚拟化 9 | 10 | NVIDIA Grid系列,支持GPU虚拟化的[成果](http://www.nvidia.cn/object/grid-boards-cn.html) 11 | 12 | > 兼容 VMware vSphere 
Hypervisor,甚至有专门的[驱动](http://www.nvidia.com/object/vmware-trygrid.html)和[教程](https://blogs.vmware.com/euc/2015/12/horizon-6-view-esri-arcgis-nvidia-dell-desktop-virtualization-appliance.html) 13 | 14 | - **初步锁定GRID K1或GRID K2** 15 | 16 | ## NF5280M3是否支持NVIDIA显卡? 17 | - 支持:[13年一次比赛](http://scc.ustc.edu.cn/yjdt/201305/t20130506_150923.html)就是用的这两家的产品 18 | - 使用的是NVIDIA的Tesla K20GPU加速卡 19 | - Tesla系列显卡算力可以在上面的显卡算力中查到, 20 | - NF5280M3的GPU能力: 21 | 22 | > http://www.jxkenuo.com/Info/View.Asp?id=309 23 | 24 | > https://citrixready.citrix.com/inspur-cn/nf5280m3-cn.html 25 | 26 | - NF5280M3标配参数[详情](http://www.inspur.com/lcjtww/443012/444624/447247/450192/450233/458384/index.html) 27 | 28 | ## Question 29 | - Grid显卡是否支持tensorflow/cuda? 30 | 31 | > 支持:[CUDA FAQ](https://developer.nvidia.com/cuda-faq)提及:CUDA is a standard feature in all NVIDIA GeForce, Quadro, and Tesla GPUs as well as NVIDIA GRID solutions. 32 | 33 | - Grid显卡和Tesla显卡是不同系列的吧,那么Grid显卡是否能够安装到NF5280M3上? 34 | 35 | - [Grid显卡介绍附图](http://www.nvidia.cn/object/grid-boards-cn.html) 36 | - [Tesla显卡介绍附图](http://www.nvidia.com/object/tesla-servers.html) 37 | - 观察图可以看到引脚是一致的 38 | - 可以咨询浪潮方面厂商 39 | 40 | - Grid显卡在算力表中没有,是否达到算力3?分布式虚拟化之后是否达到算力3? 41 | 42 | > 这篇[文章](http://www.brianmadden.com/opinion/Clearing-up-the-confusion-around-VMware-Nvidias-vGPU-vDGA-DaaS-announcement)做出了一些解释,用vGPU的方式可以实现高算力和多机共享GPU的效果 43 | -------------------------------------------------------------------------------- /note/lesson-3/README.md: -------------------------------------------------------------------------------- 1 | # Convolutional Networks 2 | 3 | > deep dive into images and convolutional models 4 | 5 | ## Convnet 6 | 7 | ### BackGround 8 | - 人眼在识别图像时,往往从局部到全局 9 | - 局部与局部之间联系往往不太紧密 10 | - 我们不需要神经网络中的每个结点都掌握全局的知识,因此可以从这里减少需要学习的参数数量 11 | 12 | ### Weight share 13 | - 但这样参数其实还是挺多的,所以有了另一种方法:权值共享 14 | 15 | > Share Parameters across space 16 | 17 | - 取图片的一小块,在上面做神经网络分析,会得到一些预测 18 | - 将切片做好的神经网络作用于图片的每个区域,得到一系列输出 19 | 20 | - 可以增加切片个数提取更多特征 21 | - 在这个过程中,梯度的计算跟之前是一样的 22 | 23 | ### Concept 24 | ![](../../res/conv_concept.png) 25 | - Patch/Kernel:一个局部切片 26 | - Depth: 数据的深度,图像数据是三维的,长宽和RGB,神经网络的预测输出也属于一维 27 | - Feature Map:每层Conv网络,因为它们将前一层的feature映射到后一层(Output map) 28 | ![](../../res/conv_lingo.png) 29 | - Stride: 移动切片的步长,影响取样的数量 30 | - 在边缘上的取样影响Conv层的面积,由于移动步长不一定能整除整张图的像素宽度,不越过边缘取样会得到Valid Padding, 越过边缘取样会得到Same Padding 31 | - Example 32 | ![](../../res/stride.png) 33 | 34 | - 用一个3x3的网格在一个28x28的图像上做切片并移动 35 | - 移动到边缘上的时候,如果不超出边缘,3x3的中心就到不了边界 36 | - 因此得到的内容就会缺乏边界的一圈像素点,只能得到26x26的结果 37 | - 而可以越过边界的情况下,就可以让3x3的中心到达边界的像素点 38 | - 超出部分的矩阵补零就行 39 | 40 | ## Deep Convnet 41 | 在Convnet上套Convnet,就可以一层一层综合局部得到的信息 42 | 43 | ## OutPut 44 | 将一个deep and narrow的feature层作为输入,传给一个Regular神经网络 45 | ![](../../res/conv_output.png) 46 | 47 | ## Optimization 48 | ### Pooling 49 | 将不同Stride的卷积用某种方式合并起来,节省卷积层的空间复杂度。 50 | 51 | - Max Pooling 52 | 在一个卷积层的输出层上取一个切片,取其中最大值代表这个切片 53 | - 优点 54 | - 不增加需要调整的参数 55 | - 通常比其他方法准确 56 | - 缺点:更多Hyper Parameter,包括要取最值的切片大小,以及去切片的步长 57 | 58 | > LENET-5, ALEXNET 59 | 60 | - Average Pooling 61 | 在卷积层输出中,取切片,取平均值代表这个切片 62 | 63 | ### 1x1 Convolutions 64 | 在一个卷积层的输出层上,加一个1x1的卷积层,这样就形成了一个小型的神经网络。 65 | - cheap for deeper model 66 | - 结合Average Pooling食用效果更加 67 | ### Inception 68 | 对同一个卷积层输出,执行各种二次计算,将各种结果堆叠到新输出的depth方向上 69 | ![](../../res/inception.png) 70 | 71 | ## [卷积神经网络实践](practice.md) 72 | 73 | ## 参考链接 74 | - 张雨石 [Conv神经网络](http://blog.csdn.net/stdcoutzyx/article/details/41596663) 75 | - Bill Xia 
[卷积神经网络(CNN)](http://ibillxia.github.io/blog/2013/04/06/Convolutional-Neural-Networks/) -------------------------------------------------------------------------------- /src/not_mnist/load_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | import sys 5 | 6 | # from six.moves.urllib.request import urlretrieve 7 | # from six.moves import cPickle as pickle 8 | from urllib import urlretrieve 9 | 10 | # %matplotlib inline 11 | 12 | # url = 'http://commondatastorage.googleapis.com/books1000/' 13 | # if the url above can't work, use this: 14 | 15 | last_percent_reported = None 16 | 17 | # First, we'll download the dataset to our local machine. 18 | # The data consists of characters rendered in a variety of fonts on a 28x28 image. 19 | # The labels are limited to 'A' through 'J' (10 classes). 20 | # The training set has about 500k and the testset 19000 labelled examples. 21 | # Given these sizes, it should be possible to train models quickly on any machine. 22 | 23 | 24 | def download_progress_hook(count, blockSize, totalSize): 25 | """A hook to report the progress of a download. This is mostly intended for users with 26 | slow internet connections. Reports every 1% change in download progress. 27 | """ 28 | global last_percent_reported 29 | percent = int(count * blockSize * 100 / totalSize) 30 | 31 | if last_percent_reported != percent: 32 | if percent % 5 == 0: 33 | sys.stdout.write("%s%%" % percent) 34 | sys.stdout.flush() 35 | else: 36 | sys.stdout.write(".") 37 | sys.stdout.flush() 38 | 39 | last_percent_reported = percent 40 | 41 | 42 | def maybe_download(filename, expected_bytes, url='http://yaroslavvb.com/upload/notMNIST/', force=False, ): 43 | """Download a file if not present, and make sure it's the right size.""" 44 | if force or not os.path.exists(filename): 45 | print('Attempting to download:', filename) 46 | filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook) 47 | print('\nDownload Complete!') 48 | statinfo = os.stat(filename) 49 | if statinfo.st_size == expected_bytes: 50 | print('Found and verified', filename) 51 | else: 52 | raise Exception( 53 | 'Failed to verify ' + filename + '. 
Can you get to it with a browser?') 54 | return filename 55 | 56 | 57 | if __name__ == '__main__': 58 | train_filename = maybe_download('notMNIST_large.tar.gz', 247336696) 59 | test_filename = maybe_download('notMNIST_small.tar.gz', 8458043) 60 | 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Deep Learning Notes 2 | 3 | > Google 深度学习笔记 4 | 5 | > Github工程地址:https://github.com/ahangchen/GDLnotes 6 | 7 | > 欢迎star,有问题可以到[Issue区](https://github.com/ahangchen/GDLnotes/issues)讨论 8 | 9 | > 官方教程[地址](https://classroom.udacity.com/courses/ud730/lessons/6370362152/concepts/63798118170923) 10 | 11 | > [视频](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Videos.zip)/[字幕](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Subtitles.zip)下载 12 | 13 | > 最近tensorflow团队出了一个[model项目](https://github.com/tensorflow/models),和这个课程无关,但是可以参考 14 | 15 | 16 | 框架: TensorFlow ([安装教程](https://github.com/ahangchen/GDLnotes/tree/master/note/tensorflow)) 17 | > 谷歌出品的基于Python的深度学习工具集 18 | 19 | 工具:Ipython, Pycharm 20 | 21 | 笔记列表 22 | - Lesson 1 [Machine Learning to Deep Learning](note/lesson-1/README.md) 23 | 24 | > train your first simple model entirely end to end 25 | 26 | - [Logistic Classification](note/lesson-1/logistic_classify.md) 27 | - [Logistic Classification实践](note/lesson-1/practical.md) 28 | - [Stochastic Optimization](note/lesson-1/Stochastic_Optimization.md) 29 | 30 | > general data practices to train models 31 | 32 | - Lesson 2 [Deep Neural Network](note/lesson-2/README.md) 33 | - [Limit of Linear Model](note/lesson-2/limit_linear.md) 34 | - [Neural network](note/lesson-2/neural_network.md) 35 | - [神经网络实践](note/lesson-2/neural_practical.md) 36 | - 优化神经网络:[Deep Network](note/lesson-2/deep_network.md) 37 | - 防止深度神经网络过拟合 38 | - Regularization 39 | - Dropout 40 | - [深度神经网络实践](note/lesson-2/deep_network_practice.md) 41 | 42 | > train your first deep network; Train even bigger models; 43 | 44 | - Lesson 3 [Convolutional Networks](note/lesson-3/README.md) 45 | 46 | > deep dive into images and convolutional models 47 | 48 | - [卷积神经网络实践](note/lesson-3/practice.md) 49 | 50 | - Lessson 4 [Deep Models for Text and Sequence](note/lesson-4/README.md) 51 | 52 | > Deep Models for Text and Sequence 53 | 54 | - Challenge 55 | - Model 56 | - Sequence 57 | - [循环神经网络实践](note/lesson-4/rnn_practice.md) 58 | 59 | 60 | - [More about TensorFlow](note/other.md) 61 | 62 | 附录: 63 | - [NumPy笔记](note/numpy/README.md)(待完善) 64 | - [matplotlib笔记](note/matplotlib/README.md)(待完善) 65 | - [sklearn笔记](note/sklearn/README.md)(待完善) 66 | 67 | > 觉得我的文章对您有帮助的话,就给个[star](https://github.com/ahangchen/GDLnotes)吧~ 68 | 69 | -------------------------------------------------------------------------------- /src/neural/digit_nn.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | def div(xt): 7 | label1 = int(abs(xt[0]) < 0.5) 8 | label2 = int(abs(xt[1]) < 0.5) 9 | return label1 + label2 10 | 11 | 12 | def train_data(): 13 | inputs = [[random.uniform(-1, 1), random.uniform(-1, 1)] for i in range(100000)] 14 | labels = np.asarray([div(x_t) for x_t in inputs]) 15 | labels = (np.arange(3) == labels[:, None]).astype(np.float32) 16 | 17 | print(inputs[0]) 18 | print(div(inputs[0])) 19 | print(labels[0]) 20 | return 
inputs, labels 21 | 22 | 23 | def accuracy(predictions, train_labels): 24 | return 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(train_labels, 1)) / predictions.shape[0] 25 | 26 | 27 | def dig_nn(dataset, train_labels, batch_size, data_count, label_count): 28 | graph = tf.Graph() 29 | with graph.as_default(): 30 | tf_train_dataset = tf.placeholder(tf.float32, 31 | shape=(batch_size, data_count)) 32 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, label_count)) 33 | hidden_node_count = [10, 10] 34 | wi = tf.Variable(tf.truncated_normal([data_count, hidden_node_count[0]])) 35 | bi = tf.Variable(tf.zeros([hidden_node_count[0]])) 36 | 37 | y1 = tf.matmul(tf_train_dataset, wi) + bi 38 | h1 = tf.nn.relu(y1) 39 | 40 | w0 = tf.Variable(tf.truncated_normal([hidden_node_count[0], hidden_node_count[1]])) 41 | b0 = tf.Variable(tf.zeros([hidden_node_count[1]])) 42 | 43 | y2 = tf.matmul(h1, w0) + b0 44 | h2 = tf.nn.relu(y2) 45 | 46 | wo = tf.Variable(tf.truncated_normal([hidden_node_count[1], label_count])) 47 | bo = tf.Variable(tf.zeros([label_count])) 48 | 49 | logits = tf.matmul(h2, wo) + bo 50 | train_prediction = tf.nn.softmax(logits) 51 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 52 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 53 | 54 | num_steps = 1000 55 | 56 | with tf.Session(graph=graph) as session: 57 | tf.initialize_all_variables().run() 58 | print("Initialized") 59 | for step in range(num_steps): 60 | batch_data = dataset[step * batch_size: (step + 1) * batch_size] 61 | batch_labels = train_labels[step * batch_size: (step + 1) * batch_size] 62 | 63 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 64 | _, l, predictions = session.run( 65 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 66 | if step % 10 == 0: 67 | print('=' * 80) 68 | cur_first_data = dataset[step * batch_size: (step + 1) * batch_size][0] 69 | print('current first data [%f, %f]' % (cur_first_data[0], cur_first_data[1])) 70 | print('current first predict: [%f, %f, %f]' % (predictions[0][0], predictions[0][1], predictions[0][2])) 71 | print("Minibatch loss at step %d: %f" % (step, l)) 72 | print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)) 73 | 74 | if __name__ == '__main__': 75 | inputs, labels = train_data() 76 | dig_nn(inputs, labels, 100, 2, 3) 77 | -------------------------------------------------------------------------------- /note/lesson-3/practice.md: -------------------------------------------------------------------------------- 1 | # 卷积神经网络实践 2 | ## 数据处理 3 | - dataset处理成四维的,label仍然作为one-hot encoding 4 | ```python 5 | def reformat(dataset, labels, image_size, num_labels, num_channels): 6 | dataset = dataset.reshape( 7 | (-1, image_size, image_size, num_channels)).astype(np.float32) 8 | labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32) 9 | return dataset, labels 10 | ``` 11 | - 将lesson2的dnn转为cnn很简单,只要把WX+b改为conv2d(X)+b即可 12 | - 关键在于conv2d 13 | - - - 14 | 15 | ### `tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu=None, data_format=None, name=None)` {#conv2d} 16 | 17 | 给定四维的`input`和`filter` tensor,计算一个二维卷积 18 | 19 | ##### Args: 20 | 21 | 22 | * `input`: A `Tensor`. type必须是以下几种类型之一: `half`, `float32`, `float64`. 23 | * `filter`: A `Tensor`. type和`input`必须相同 24 | * `strides`: A list of `ints`.一维,长度4, 在`input`上切片采样时,每个方向上的滑窗步长,必须和format指定的维度同阶 25 | * `padding`: A `string` from: `"SAME", "VALID"`. 
padding 算法的类型 26 | * `use_cudnn_on_gpu`: An optional `bool`. Defaults to `True`. 27 | * `data_format`: An optional `string` from: `"NHWC", "NCHW"`, 默认为`"NHWC"`。 28 | 指定输入输出数据格式,默认格式为"NHWC", 数据按这样的顺序存储: 29 | `[batch, in_height, in_width, in_channels]` 30 | 也可以用这种方式:"NCHW", 数据按这样的顺序存储: 31 | `[batch, in_channels, in_height, in_width]` 32 | * `name`: 操作名,可选. 33 | 34 | ##### Returns: 35 | 36 | A `Tensor`. type与`input`相同 37 | 38 | Given an input tensor of shape `[batch, in_height, in_width, in_channels]` 39 | and a filter / kernel tensor of shape 40 | `[filter_height, filter_width, in_channels, out_channels]` 41 | 42 | conv2d实际上执行了以下操作: 43 | 44 | 1. 将filter转为二维矩阵,shape为 45 | `[filter_height * filter_width * in_channels, output_channels]`. 46 | 2. 从input tensor中提取image patches,每个patch是一个*virtual* tensor,shape`[batch, out_height, out_width, 47 | filter_height * filter_width * in_channels]`. 48 | 3. 将每个filter矩阵和image patch向量相乘 49 | 50 | 具体来讲,当data_format为NHWC时: 51 | 52 | output[b, i, j, k] = 53 | sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] * 54 | filter[di, dj, q, k] 55 | 56 | input 中的每个patch都作用于filter,每个patch都能获得其他patch对filter的训练 57 | 需要满足`strides[0] = strides[3] = 1`. 大多数水平步长和垂直步长相同的情况下:`strides = [1, stride, stride, 1]`. 58 | - - - 59 | 60 | - 然后再接一个WX+b连Relu连WX+b的全连接神经网络即可 61 | 62 | ## Max Pooling 63 | 在tf.nn.conv2d后面接tf.nn.max_pool,将卷积层输出减小,从而减少要调整的参数 64 | 65 | ### `tf.nn.max_pool(value, ksize, strides, padding, data_format='NHWC', name=None)` {#max_pool} 66 | 67 | Performs the max pooling on the input. 68 | 69 | ##### Args: 70 | 71 | 72 | * `value`: A 4-D `Tensor` with shape `[batch, height, width, channels]` and 73 | type `tf.float32`. 74 | * `ksize`: A list of ints that has length >= 4. 要执行取最值的切片在各个维度上的尺寸 75 | * `strides`: A list of ints that has length >= 4. 取切片的步长 76 | * `padding`: A string, either `'VALID'` or `'SAME'`. padding算法 77 | * `data_format`: A string. 'NHWC' and 'NCHW' are supported. 78 | * `name`: 操作名,可选 79 | 80 | ##### Returns: 81 | 82 | A `Tensor` with type `tf.float32`. The max pooled output tensor. 
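下面给出一个把 `conv2d`、`relu` 和 `max_pool` 串起来的最小示例(这不是本工程 `src/convnet` 里的实现,只是一个演示性质的草图;其中 batch 大小、28x28 的输入尺寸、5x5x16 的卷积核都是随意假设的演示值),可以直观看到上面各个参数的配合方式:

```python
import numpy as np
import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    # 假设输入是 batch=16 的 28x28 单通道图像,data_format 为默认的 NHWC
    data = tf.placeholder(tf.float32, shape=(16, 28, 28, 1))
    # filter 的 shape 为 [filter_height, filter_width, in_channels, out_channels]
    conv_filter = tf.Variable(tf.truncated_normal([5, 5, 1, 16], stddev=0.1))
    conv_bias = tf.Variable(tf.zeros([16]))

    # 水平、垂直步长都为 1,SAME padding:输出空间尺寸不变,为 16x28x28x16
    conv = tf.nn.conv2d(data, conv_filter, strides=[1, 1, 1, 1], padding='SAME')
    hidden = tf.nn.relu(conv + conv_bias)

    # 2x2 的切片、步长 2:空间尺寸减半,输出为 16x14x14x16
    # ksize 和 strides 的第 0、3 维取 1,表示不在 batch 和 channel 方向上做池化
    pooled = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1], padding='SAME')

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    out = session.run(pooled, feed_dict={data: np.random.randn(16, 28, 28, 1)})
    print(out.shape)  # (16, 14, 14, 16)
```

接在 pooled 后面的,就是正文里说的 WX+b 接 Relu 再接 WX+b 的全连接部分。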
83 | 84 | - - - 85 | 86 | ## 优化 87 | 仿照lesson2,添加learning rate decay 和 drop out,可以将准确率提高到90.6% 88 | 89 | ## 补充 90 | - 最近在用GPU版本的TensorFlow,发现,如果import tensorflow放在代码第一行,运行会报段错误(pycharm debug模式下不会),因此最好在import tensorflow前import numpy或者其他的module 91 | 92 | ## 参考链接 93 | - [Tensorflow 中 conv2d 都干了啥](http://stackoverflow.com/questions/34619177/what-does-tf-nn-conv2d-do-in-tensorflow) 94 | - [TensorFlow Example](https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/convolutional_network.py) -------------------------------------------------------------------------------- /note/lesson-4/README.md: -------------------------------------------------------------------------------- 1 | # Deep Models for Text and Sequence 2 | 3 | ## Rare Event 4 | 与其他机器学习不同,在文本分析里,陌生的东西(rare event)往往是最重要的,而最常见的东西往往是最不重要的。 5 | 6 | ## 语法多义性 7 | - 一个东西可能有多个名字,对这种related文本能够做参数共享是最好的 8 | - 需要识别单词,还要识别其关系,就需要过量label数据 9 | 10 | ## 无监督学习 11 | - 不用label进行训练,训练文本是非常多的,关键是要找到训练的内容 12 | - 遵循这样一个思想:相似的词汇出现在相似的场景中 13 | - 不需要知道一个词真实的含义,词的含义由它所处的历史环境决定 14 | 15 | ## Embeddings 16 | - 将单词映射到一个向量(Word2Vec),越相似的单词的向量会越接近 17 | - 新的词可以由语境得到共享参数 18 | 19 | ## Word2Vec 20 | ![](../../res/word2vec.png) 21 | 22 | - 将每个词映射到一个Vector列表(就是一个Embeddings)里,一开始随机,用这个Embedding进行预测 23 | - Context即Vector列表里的邻居 24 | - 目标是让Window里相近的词放在相邻的位置,即预测一个词的邻居 25 | - 用来预测这些相邻位置单词的模型只是一个Logistics Regression, just a simple Linear model 26 | ### Comparing embeddings 27 | - 比较两个vector之间的夹角大小来判断接近程度,用cos值而非L2计算,因为vector的长度和分类是不相关的: 28 | 29 | ![](../../res/cmpcos.png) 30 | 31 | - 最好将要计算的vector都归一化 32 | 33 | ### Predict Words 34 | 35 | ![](../../res/predictword.png) 36 | 37 | - 单词经过embedding变成一个vector 38 | - 然后输入一个WX+b,做一个线性模型 39 | - 输出的label概率为输入文本中的词汇 40 | - 问题在于WX+b输出时,label太多了,计算这种softmax很低效 41 | - 解决方法是,筛掉不可能是目标的label,只计算某个label在某个局部的概率,sample softmax 42 | 43 | ## t-SNE 44 | - 查看某个词在embedding里的最近邻居可以看到单词间的语义接近关系 45 | - 将vector构成的空间降维,可以更高效地查找最近单词,但降维过程中要保持邻居关系(原来接近的降维后还要接近) 46 | - t-SNE就是这样一种有效的方法 47 | 48 | ## 类比 49 | - 实际上我们能得到的不仅是单词的邻接关系,由于将单词向量化,可以对单词进行计算 50 | - 可以通过计算进行语义加减,语法加减 51 | 52 | ![](../../res/analogies.png) 53 | 54 | ![](../../res/vecanalogy.png) 55 | 56 | ## Sequence 57 | 文本(Text)是单词(word)的序列,一个关键特点是长度可变,就不能直接变为vector 58 | 59 | ### CNN and RNN 60 | CNN 在空间上共享参数,RNN在时间上(顺序上)共享参数 61 | ![](../../res/cnn_rnn.png) 62 | 63 | - 在每轮训练中,需要判断至今为之发生了什么,过去输入的所有数据都对当下的分类造成影响 64 | - 一种思路是记忆之前的分类器的状态,在这个基础上训练新的分类器,从而结合历史影响 65 | - 这样需要大量历史分类器 66 | - 重用分类器,只用一个分类器总结状态,其他分类器接受对应时间的训练,然后传递状态 67 | 68 | ![](../../res/rnn.png) 69 | 70 | ### RNN Derivatives 71 | - BackPropagation Through time 72 | - 对同一个weight参数,会有许多求导操作同时更新之 73 | - 对SGD不友好,因为SGD是用许多不相关的求导更新参数,以保证训练的稳定性 74 | - 由于梯度之间的相关性,导致梯度爆炸或者梯度消失 75 | 76 | ![](../../res/rnn_gradient.png) 77 | 78 | - 使得训练时找不到优化方向,训练失败 79 | 80 | #### Clip Gradient 81 | 计算到梯度爆炸的时候,使用一个比值来代替△W(梯度是回流计算的,横坐标从右往左看) 82 | 83 | ![](../../res/gradient_clip.png) 84 | 85 | - Hack but cheap and effective 86 | 87 | #### LSTM(Long Short-Term Memory) 88 | 梯度消失会导致分类器只对最近的消息的变化有反应,淡化以前训练的参数,也不能用比值的方法来解决 89 | - 一个RNN的model包含两个输入,一个是过去状态,一个是新的数据,两个输出,一个是预测,一个是将来状态 90 | 91 | ![](../../res/rnn_model.png) 92 | 93 | - 中间是一个简单的神经网络 94 | - 将中间的部分换成LSTM-cell就能解决梯度消失问题 95 | - 我们的目的是提高RNN的记忆能力 96 | - Memory Cell 97 | 98 | ![](../../res/mem_cell.png) 99 | 100 | 三个门,决定是否写/读/遗忘/写回 101 | 102 | - 在每个门上,不单纯做yes/no的判断,而是使用一个权重,决定对输入的接收程度 103 | - 这个权重是一个连续的函数,可以求导,也就可以进行训练,这是LSTM的核心 104 | 105 | ![](../../res/lstm_gate.png) 106 | - 用一个逻辑回归训练这些门,在输出进行归一化 107 | 108 | ![](../../res/lstm_cell.png) 109 | 110 | - 这样的模型能让整个cell更好地记忆与遗忘 111 | - 
由于整个模型都是线性的,所以可以方便地求导和训练 112 | 113 | - 关于lstm有这样一篇博客讲的很好:[地址](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) 114 | - 稍微翻了一个[中文版](unstand_lstm.md) 115 | 116 | #### LSTM Regularization 117 | - L2, works 118 | - Dropout on the input or output of data, works 119 | 120 | ### Beam Search 121 | 有了上面的模型之后,我们可以根据上文来推测下文,甚至创造下文,预测,筛选最大概率的词,喂回,继续预测…… 122 | 123 | ![](../../res/beam_search.png) 124 | 125 | - 我们可以每次只预测一个字母,but this is greedy,每次都挑最好的那个 126 | - 也可以每次多预测几步,然后挑整体概率较高的那个,以减少偶然因素的影响 127 | - 但这样需要生成的sequence会指数增长 128 | - 因此我们在多预测几步的时候,只为概率比较高的几个候选项做预测,that's beam search. 129 | 130 | ## 翻译与识图 131 | - RNN将variable length sequence问题变成了fixed length vector问题,同时因为实际上我们能利用vector进行预测,我们也可以将vector变成sequence 132 | 133 | - 我们可以利用这一点,输入一个序列,到一个RNN里,将输出输入到另一个逆RNN序列,形成另一种序列,比如,语言翻译 134 | - 如果我们将CNN的输出接到一个RNN,就可以做一种识图系统 135 | 136 | ## [循环神经网络实践](rnn_practice.md) -------------------------------------------------------------------------------- /note/lesson-2/neural_practical.md: -------------------------------------------------------------------------------- 1 | # 全连接神经网络 2 | 辅助阅读:[TensorFlow中文社区教程](http://www.tensorfly.cn/tfdoc/tutorials/mnist_tf.html) - [英文官方教程](https://www.tensorflow.org/versions/r0.8/tutorials/mnist/pros/index.html#train-the-model) 3 | 4 | > 代码见:[full_connect.py](../../src/sgd/full_connect.py) 5 | 6 | ## Linear Model 7 | - 加载lesson 1中的数据集 8 | - 将Data降维成一维,将label映射为one-hot encoding 9 | ```python 10 | def reformat(dataset, labels): 11 | dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32) 12 | # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...] 13 | labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32) 14 | return dataset, labels 15 | ``` 16 | ### TensorFlow Graph 17 | - 使用梯度计算train_loss,用tf.Graph()创建一个计算单元 18 | - 用tf.constant将dataset和label转为tensorflow可用的训练格式(训练中不可修改) 19 | - 用tf.truncated_normal生成正太分布的数据,作为W的初始值,初始化b为可变的0矩阵 20 | - 用tf.variable将上面的矩阵转为tensorflow可用的训练格式(训练中可以修改) 21 | - 用tf.matmul实现矩阵相乘,计算WX+b,这里实际上logit只是一个变量,而非结果 22 | - 用tf.nn.softmax_cross_entropy_with_logits计算WX+b的结果相较于原来的label的train_loss,并求均值 23 | - 使用梯度找到最小train_loss 24 | ```python 25 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 26 | ``` 27 | - 计算相对valid_dataset和test_dataset对应的label的train_loss 28 | 29 | > 上面这些变量都是一种Tensor的概念,它们是一个个的计算单元,我们在Graph中设置了这些计算单元,规定了它们的组合方式,就好像把一个个门电路串起来那样 30 | 31 | ### TensorFLow Session 32 | Session用来执行Graph里规定的计算,就好像给一个个门电路通上电,我们在Session里,给计算单元冲上数据,That’s Flow. 
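在看具体步骤之前,先用一个极简的例子感受一下 Graph 和 Session 的分工(数据是随机造的,样本数、维度、类别数都是假设值,只为示意,与 notMNIST 无关):

```python
import numpy as np
import tensorflow as tf

graph = tf.Graph()
with graph.as_default():
    # 随便造 100 个 4 维样本和对应的 3 类 one-hot label
    x = tf.constant(np.random.randn(100, 4).astype(np.float32))
    y = tf.constant(np.eye(3)[np.random.randint(0, 3, 100)].astype(np.float32))
    w = tf.Variable(tf.truncated_normal([4, 3]))
    b = tf.Variable(tf.zeros([3]))
    logits = tf.matmul(x, w) + b
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, y))
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    for step in range(100):
        _, l = session.run([optimizer, loss])  # 每 run 一次,w 和 b 就被修正一次
```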
33 | - 重复计算单元反复训练800次,提高其准确度 34 | - 为了快速查看训练效果,每轮训练只给10000个训练数据(subset),恩,每次都是相同的训练数据 35 | - 将计算单元graph传给session 36 | - 初始化参数 37 | - 传给session优化器 - train_loss的梯度optimizer,训练损失 - train_loss,每次的预测结果,循环执行训练 38 | ```python 39 | with tf.Session(graph=graph) as session: 40 | tf.initialize_all_variables().run() 41 | for step in range(num_steps): 42 | _, l, predictions = session.run([optimizer, loss, train_prediction]) 43 | ``` 44 | - 在循环过程中,W和b会保留,并不断得到修正 45 | - 在每100次循环后,会用验证集进行验证一次,验证也同时修正了一部分参数 46 | ```python 47 | valid_prediction.eval() 48 | ``` 49 | - 最后用测试集进行测试 50 | - 注意如果lesson 1中没有对数据进行乱序化,可能训练集预测准确度很高,验证集和测试集准确度会很低 51 | 52 | > 这样训练的准确度为83.2% 53 | 54 | ## SGD 55 | - 每次只取一小部分数据做训练,计算loss时,也只取一小部分数据计算loss 56 | - 对应到程序中,即修改计算单元中的训练数据, 57 | - 每次输入的训练数据只有128个,随机取起点,取连续128个数据: 58 | ```python 59 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 60 | batch_data = train_dataset[offset:(offset + batch_size), :] 61 | batch_labels = train_labels[offset:(offset + batch_size), :] 62 | ``` 63 | - 由于这里的数据是会变化的,因此用tf.placeholder来存放这块空间 64 | ```python 65 | tf_train_dataset = tf.placeholder(tf.float32, 66 | shape=(batch_size, image_size * image_size)) 67 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 68 | ``` 69 | - 计算3000次,训练总数据量为384000,比之前8000000少 70 | 71 | > 准确率提高到86.5%,而且准确率随训练次数增加而提高的速度变快了 72 | 73 | ## 神经网络 74 | - 上面SGD的模型只有一层WX+b,现在使用一个RELU作为中间的隐藏层,连接两个WX+b 75 | - 仍然只需要修改Graph计算单元为 76 | ```python 77 | Y = W2 * RELU(W1*X + b1) + b2 78 | ``` 79 | - 为了在数学上满足矩阵运算,我们需要这样的矩阵运算: 80 | ``` 81 | [n * 10] = RELU([n * 784] · [784 * N] + [n * N]) · [N * 10] + [n * 10] 82 | ``` 83 | - 这里N取1024,即1024个隐藏结点 84 | - 于是四个参数被修改 85 | ```python 86 | weights1 = tf.Variable( 87 | tf.truncated_normal([image_size * image_size, hidden_node_count])) 88 | biases1 = tf.Variable(tf.zeros([hidden_node_count])) 89 | weights2 = tf.Variable( 90 | tf.truncated_normal([hidden_node_count, num_labels])) 91 | biases2 = tf.Variable(tf.zeros([num_labels])) 92 | ``` 93 | - 预测值计算方法改为 94 | ```python 95 | ys = tf.matmul(tf_train_dataset, weights1) + biases1 96 | hidden = tf.nn.relu(ys) 97 | logits = tf.matmul(hidden, weights2) + biases2 98 | ``` 99 | - 计算3000次,可以发现准确率一开始提高得很快,后面提高速度变缓,最终测试准确率提高到88.8% -------------------------------------------------------------------------------- /note/lesson-1/logistic_classify.md: -------------------------------------------------------------------------------- 1 | # Logistic Classification 2 | 3 | > Github工程地址:https://github.com/ahangchen/GDLnotes 4 | 5 | > 欢迎star,有问题可以到[Issue区](https://github.com/ahangchen/GDLnotes/issues)讨论 6 | 7 | > 官方教程[地址](https://classroom.udacity.com/courses/ud730/lessons/6370362152/concepts/63798118170923) 8 | 9 | > [视频](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Videos.zip)/[字幕](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Subtitles.zip)下载 10 | 11 | ## About 12 | 13 | > simple but important classifier 14 | 15 | - Train your first simple model entirely end to end 16 | - 下载、预处理一些图片以分类 17 | - Run an actual logistic classifier on images data 18 | - Connect bit of math and code 19 | 20 | ## Detail 21 | ### Linear Classifier 22 | 23 | ![](../../res/logistic.png) 24 | 25 | > 之所以这样建模,是因为线性公式是最简单的数学模型,仅此而已。 26 | 27 | - Input: X (e.g. 
the pixels in an image) 28 | - Apply a linear function to X 29 | - Giant matrix multiply 30 | - Take inputs as a big vector 31 | - Multiply input vector with a matrix, W means weights 32 | - b means biased term 33 | - Machine learning adjust weights and bias for the best prediction 34 | - Output: Y, predictions for per output class 35 | - Y is a vector, represents the probability of each label 36 | - 好的预测中,正确的label的概率应当更接近1 37 | - 往往得到的Y一开始不是概率,而是一些具体值(scores/logits),所以需要转换,by: 38 | 39 | > Softmax回归模型:[Wikipedia](http://ufldl.stanford.edu/wiki/index.php/Softmax%E5%9B%9E%E5%BD%92) 40 | 41 | ![](../../res/softmax.png) 42 | ### Softmax 43 | - 代码 [soft_max.py](../../src/soft_max.py):Softmax实现与应用 44 | - input的score差异越大(可以全部乘10试试),则输出的各项label概率差异越大,反之差异越小 45 | - Softmax只关心几个label之间的概率,不关心具体值 46 | - 机器学习是一个让预测成功率升高的事情,因此是一个让score之间差异增大的过程 47 | 48 | ### One hot encoding 49 | ![](../../res/one_hot_encoding.png) 50 | 51 | > 正确预测结果应当是只有一个label成立,其他label不成立。这种情况下,预测概率最大的则是最可能的结果。 52 | 53 | > Example: take this [test](https://classroom.udacity.com/courses/ud730/lessons/6370362152/concepts/63713510510923) 54 | 55 | - one hot encoding在label很多的情况下not work well,因为output vector到处都是0,很稀疏,因此效率低 56 | - solved by [embeddings](../lesson-4/README.md) 57 | - 好处:可以measure我们与理想情况之间的距离(compare two vectors) 58 | 59 | > 分类器输出:[0.7 0.2 0.1] \<=\> 与label对应的真实情况:[1 0 0] 60 | 61 | - Compare two vectors: cross-entropy 62 | ![](../../res/cross-entropy.png) 63 | 64 | - D(S, L) != D(L, S) 65 | 66 | > Remember: Label don't log, for label zero 67 | 68 | ### 小结 69 | ![](../../res/logistic2.png) 70 | 71 | ![](../../res/logistic3.png) 72 | 73 | 找到合适的W和b,使得S和L的距离D的平均值,在整个数据集n中最小。 74 | 75 | ### 最小化cross-entropy 76 | 77 | ![](../../res/avg_train_loss.png) 78 | 79 | D的平均值即是Training loss,求和和矩阵相乘是个大数据的活。 80 | 81 | ![](../../res/weight_loss.png) 82 | 83 | 两个参数的误差导致一个呈圆形的loss,所以我们要做的就是找到尽量靠近圆心的weight 84 | > 机器学习问题变成了一个数值优化 85 | - 解决方法之一:Gradient descent,求导 86 | 87 | ![](../../res/min_num.png) 88 | 89 | > 修改参数,检查误差是否变大,往变小的方向修改,直到抵达bottom。 90 | 91 | > 图中weight是二维的,但事实上可能有极多的weight 92 | 93 | ### Numerical Stability 94 | 95 | > 量级相差太多的数运算会导致许多错误 96 | 97 | Example:[num_stable.py](../../src/num_stable.py) 98 | 99 | - 你可能以为输出是1, 但结果是一个接近0.95的数。 100 | - 但将1billion换成1,结果就很接近1。 101 | - 因此需要让前面提到的Train loss函数中的数据不要too big or too small 102 | 103 | ### Normalized Inputs and Initial Wights 104 | > 归一化输入和初始参数 105 | 106 | - 理想目标 107 | - 均值为0 108 | - 方差处处相等 109 | ![](../../res/normal_target.png) 110 | - Math Reason 111 | 112 | > Easier for the optimizer to find a good solution 113 | ![](../../res/math_reason.png) 114 | 115 | - Example: Images Normalization 116 | ```python 117 | R = (R - 128) / 128 118 | G = (G - 128) / 128 119 | B = (B - 128) / 128 120 | ``` 121 | - Weight Initialization 122 | 找到好的weight和bias for the gradient descent to proceed 123 | 124 | > A simple, general method 125 | 126 | ![](../../res/gauss_init.png) 127 | - 用均值为0,标准偏差的高斯分布产生随机的数据填充W矩阵 128 | ![](../../res/train_loss_init.png) 129 | - 高斯分布模型也决定了初始输出(softmax输出)的概率分布 130 | - 高斯分布的sigma越小,说明预测越不确定,sigma的取值很主观 131 | - 我们的工作即是,选一个较小的sigma,让sigma变小到合适的值,使得预测更确定。 132 | 133 | - 优化 134 | ![](../../res/normal_optimize.png) 135 | 调整W和b,使得Train loss最小 136 | 137 | [下一节](practical.md)实践 138 | 139 | > 觉得得我的文章对您有帮助的话,就给个[star](https://github.com/ahangchen/GDLnotes)吧~ -------------------------------------------------------------------------------- /src/not_mnist/img_pickle.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 
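# Purpose of this module: read the notMNIST letter folders into normalized ndarrays,
# pickle them per class, and provide helpers for showing images and loading/saving pickles.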
| 3 | import os 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from scipy import ndimage 8 | # from six.moves.urllib.request import urlretrieve 9 | # from six.moves import cPickle as pickle 10 | import cPickle as pickle 11 | 12 | image_size = 28 # Pixel width and height. 13 | pixel_depth = 255.0 # Number of levels per pixel. 14 | 15 | 16 | def load_letter(folder, min_num_images): 17 | """Load the data for a single letter label.""" 18 | image_files = os.listdir(folder) 19 | dataset = np.ndarray(shape=(len(image_files), image_size, image_size), 20 | dtype=np.float32) 21 | print(folder) 22 | for image_index, image in enumerate(image_files): 23 | image_file = os.path.join(folder, image) 24 | try: 25 | image_data = (ndimage.imread(image_file).astype(float) - 26 | pixel_depth / 2) / pixel_depth 27 | if image_data.shape != (image_size, image_size): 28 | raise Exception('Unexpected image shape: %s' % str(image_data.shape)) 29 | dataset[image_index, :, :] = image_data 30 | except IOError as e: 31 | print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.') 32 | 33 | num_images = image_index + 1 34 | dataset = dataset[0:num_images, :, :] 35 | if num_images < min_num_images: 36 | raise Exception('Many fewer images than expected: %d < %d' % 37 | (num_images, min_num_images)) 38 | 39 | print('Full dataset tensor:', dataset.shape) 40 | print('Mean:', np.mean(dataset)) 41 | print('Standard deviation:', np.std(dataset)) 42 | return dataset 43 | 44 | 45 | def maybe_pickle(data_folders, min_num_images_per_class, force=False): 46 | dataset_names = [] 47 | for folder in data_folders: 48 | set_filename = folder + '.pickle' 49 | dataset_names.append(set_filename) 50 | if os.path.exists(set_filename) and not force: 51 | # You may override by setting force=True. 52 | print('%s already present - Skipping pickling.' % set_filename) 53 | else: 54 | print('Pickling %s.' 
% set_filename) 55 | dataset = load_letter(folder, min_num_images_per_class) 56 | try: 57 | with open(set_filename, 'wb') as f: 58 | pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL) 59 | 60 | except Exception as e: 61 | print('Unable to save data to', set_filename, ':', e) 62 | 63 | return dataset_names 64 | 65 | 66 | def show_imgs(imgs, show_max=-1): 67 | show_cnt = show_max 68 | if show_max == -1: 69 | show_cnt = len(imgs) 70 | 71 | for image_index in xrange(show_cnt): 72 | # they are binary images, if RGBs, don't add cmap="Graeys" 73 | plt.imshow(imgs[image_index], cmap="Greys") 74 | plt.show() 75 | 76 | 77 | def load_pickle(pickle_name): 78 | # load a pickle file to memory 79 | if os.path.exists(pickle_name): 80 | return pickle.load(open(pickle_name, "r")) 81 | return None 82 | 83 | 84 | def save_obj(pickle_file, obj): 85 | try: 86 | f = open(pickle_file, 'wb') 87 | pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) 88 | f.close() 89 | except Exception as e: 90 | print('Unable to save data to', pickle_file, ':', e) 91 | raise 92 | statinfo = os.stat(pickle_file) 93 | print('Compressed pickle size:', statinfo.st_size) 94 | 95 | if __name__ == '__main__': 96 | train_folders = ['notMNIST_large/A', 'notMNIST_large/B', 'notMNIST_large/C', 'notMNIST_large/D', 'notMNIST_large/E', 97 | 'notMNIST_large/F', 'notMNIST_large/G', 'notMNIST_large/H', 'notMNIST_large/I', 'notMNIST_large/J'] 98 | test_folders = ['notMNIST_small/A', 'notMNIST_small/B', 'notMNIST_small/C', 'notMNIST_small/D', 'notMNIST_small/E', 99 | 'notMNIST_small/F', 'notMNIST_small/G', 'notMNIST_small/H', 'notMNIST_small/I', 'notMNIST_small/J'] 100 | train_datasets = maybe_pickle(train_folders, 45000) 101 | test_datasets = maybe_pickle(test_folders, 1800) 102 | 103 | for i in range(1): # only load a.pickle 104 | imgs = load_pickle(train_datasets[i]) 105 | show_imgs(imgs, 3) 106 | 107 | -------------------------------------------------------------------------------- /note/lesson-2/deep_network_practice.md: -------------------------------------------------------------------------------- 1 | # 深度神经网络实践 2 | 代码见[nn_overfit.py](../../src/neural/nn_overfit.py) 3 | ## 优化 4 | ### Regularization 5 | 在前面实现的[RELU连接的两层神经网络](../../src/neural/full_connect.py)中,加Regularization进行约束,采用加l2 norm的方法,进行负反馈: 6 | 7 | ![](../../res/l2_regularization.png) 8 | 9 | 代码实现上,只需要对tf_sgd_relu_nn中train_loss做修改即可: 10 | - 可以用tf.nn.l2_loss(t)对一个Tensor对象求l2 norm 11 | - 需要对我们使用的各个W都做这样的计算(参考tensorflow官方[example](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/image/mnist/convolutional.py)) 12 | ```python 13 | l2_loss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) 14 | ``` 15 | - 添加到train_loss上 16 | - 这里还有一个重要的点,Hyper Parameter: β 17 | - 我觉得这是一个拍脑袋参数,取什么值都行,但效果会不同,我这里解释一下我取β=0.001的理由 18 | - 如果直接将l2_loss加到train_loss上,每次的train_loss都特别大,几乎只取决于l2_loss 19 | - 为了让原本的train_loss与l2_loss都能较好地对参数调整方向起作用,它们应当至少在同一个量级 20 | - 观察不加l2_loss,step 0 时,train_loss在300左右 21 | - 加l2_loss后, step 0 时,train_loss在300000左右 22 | - 因此给l2_loss乘0.0001使之降到同一个量级 23 | ```python 24 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + 0.001 * l2_loss 25 | ``` 26 | - 所有其他参数不变,训练3000次,准确率提高到92.7% 27 | - 黑魔法之所以为黑魔法就在于,这个参数可以很容易地影响准确率,如果β = 0.002,准确率提高到93.5% 28 | 29 | ### OverFit问题 30 | 在训练数据很少的时候,会出现训练结果准确率高,但测试结果准确率低的情况 31 | - 缩小训练数据范围:将把batch数据的起点offset的可选范围变小(只能选择0-1128之间的数据): 32 | ```python 33 | offset_range = 1000 34 | offset = (step * batch_size) % offset_range 35 | ``` 36 | - 可以看到,在step500后,训练集就一直是100%,验证集一直是77.6%,准确度无法随训练次数上升,最后的测试准确度是85.4% 37 
| 38 | ### DropOut 39 | 采取Dropout方式强迫神经网络学习更多知识 40 | 41 | > 参考[aymericdamien/TensorFlow-Examples](https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3%20-%20Neural%20Networks/alexnet.py)中dropout的使用 42 | 43 | - 我们需要丢掉RELU出来的部分结果 44 | - 调用tf.nn.dropout达到我们的目的: 45 | ```python 46 | keep_prob = tf.placeholder(tf.float32) 47 | if drop_out: 48 | hidden_drop = tf.nn.dropout(hidden, keep_prob) 49 | h_fc = hidden_drop 50 | ``` 51 | - 这里的keep_prob是保留概率,即我们要保留的RELU的结果所占比例,tensorflow建议的[语法](https://www.tensorflow.org/versions/r0.8/tutorials/mnist/pros/index.html)是,让它作为一个placeholder,在run时传入 52 | - 当然我们也可以不用placeholder,直接传一个0.5: 53 | ```python 54 | if drop_out: 55 | hidden_drop = tf.nn.dropout(hidden, 0.5) 56 | h_fc = hidden_drop 57 | ``` 58 | - 这种训练的结果就是,虽然在step 500对训练集预测没能达到100%(起步慢),但训练集预测率达到100%后,验证集的预测正确率仍然在上升 59 | - 这就是Dropout的好处,每次丢掉随机的数据,让神经网络每次都学习到更多,但也需要知道,这种方式只在我们有的训练数据比较少时很有效 60 | - 最后预测准确率为88.0% 61 | 62 | ### Learning Rate Decay 63 | 随着训练次数增加,自动调整步长 64 | - 在之前单纯两层神经网络基础上,添加Learning Rate Decay算法 65 | - 使用tf.train.exponential_decay方法,指数下降调整步长,具体使用方法[官方文档](https://www.tensorflow.org/versions/r0.8/api_docs/python/train.html#exponential_decay)说的特别清楚 66 | - 注意这里面的cur_step传给优化器,优化器在训练中对其做自增计数 67 | - 与之前单纯两层神经网络对比,准确率直接提高到90.6% 68 | 69 | ## Deep Network 70 | 增加神经网络层数,增加训练次数到20000 71 | - 为了避免修改网络层数需要重写代码,用循环实现中间层 72 | ```python 73 | # middle layer 74 | for i in range(layer_cnt - 2): 75 | y1 = tf.matmul(hidden_drop, weights[i]) + biases[i] 76 | hidden_drop = tf.nn.relu(y1) 77 | if drop_out: 78 | keep_prob += 0.5 * i / (layer_cnt + 1) 79 | hidden_drop = tf.nn.dropout(hidden_drop, keep_prob) 80 | ``` 81 | - 初始化weight在迭代中使用 82 | ```python 83 | for i in range(layer_cnt - 2): 84 | if hidden_cur_cnt > 2: 85 | hidden_next_cnt = int(hidden_cur_cnt / 2) 86 | else: 87 | hidden_next_cnt = 2 88 | hidden_stddev = np.sqrt(2.0 / hidden_cur_cnt) 89 | weights.append(tf.Variable(tf.truncated_normal([hidden_cur_cnt, hidden_next_cnt], stddev=hidden_stddev))) 90 | biases.append(tf.Variable(tf.zeros([hidden_next_cnt]))) 91 | hidden_cur_cnt = hidden_next_cnt 92 | ``` 93 | - 第一次测试时,用正太分布设置所有W的数值,将标准差设置为1,由于网络增加了一层,寻找step调整方向时具有更大的不确定性,很容易导致loss变得很大 94 | - 因此需要用stddev调整其标准差到一个较小的范围(怎么调整有许多研究,这里直接找了一个来用) 95 | 96 | ```python 97 | stddev = np.sqrt(2.0 / n) 98 | ``` 99 | 100 | - 启用regular时,也要适当调一下β,不要让它对原本的loss造成过大的影响 101 | - DropOut时,因为后面的layer得到的信息越重要,需要动态调整丢弃的比例,到后面的layer,丢弃的比例要减小 102 | ```python 103 | keep_prob += 0.5 * i / (layer_cnt + 1) 104 | ``` 105 | - 训练时,调节参数,你可能遇到[消失的梯度问题](http://wiki.jikexueyuan.com/project/neural-networks-and-deep-learning-zh-cn/chapter5.html), 106 | 对于一个幅度为1的信号,在BP反向传播梯度时,每隔一层下降0.25,指数下降使得后面的层级根本接收不到有效的训练信号 107 | - 官方教程表示最好的训练结果是,准确率97.5%, 108 | - 我的[nn_overfit.py](../../src/neural/nn_overfit.py)开启六层神经网络, 109 | 启用Regularization、DropOut、Learning Rate Decay, 110 | 训练次数20000(应该还有再训练的希望,在这里虽然loss下降很慢了,但仍然在下降),训练结果是,准确率95.2% 111 | 112 | -------------------------------------------------------------------------------- /src/not_mnist/merge_prune.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import os 4 | 5 | import numpy as np 6 | # from six.moves.urllib.request import urlretrieve 7 | # from six.moves import cPickle as pickle 8 | import cPickle as pickle 9 | 10 | from not_mnist.img_pickle import maybe_pickle, save_obj 11 | 12 | image_size = 28 # Pixel width and height. 
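# This module merges the per-class pickles into train/valid/test sets,
# shuffles them while keeping data and labels aligned, and saves notMNIST.pickle.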
13 | 14 | 15 | def make_arrays(nb_rows, img_size): 16 | if nb_rows: 17 | dataset = np.ndarray((nb_rows, img_size, img_size), dtype=np.float32) 18 | labels = np.ndarray(nb_rows, dtype=np.int32) 19 | else: 20 | dataset, labels = None, None 21 | return dataset, labels 22 | 23 | 24 | def merge_datasets(pickle_files, train_size, valid_size=0): 25 | num_classes = len(pickle_files) 26 | valid_dataset, valid_labels = make_arrays(valid_size, image_size) 27 | train_dataset, train_labels = make_arrays(train_size, image_size) 28 | vsize_per_class = valid_size // num_classes 29 | tsize_per_class = train_size // num_classes 30 | 31 | start_v, start_t = 0, 0 32 | end_v, end_t = vsize_per_class, tsize_per_class 33 | end_l = vsize_per_class + tsize_per_class 34 | for label, pickle_file in enumerate(pickle_files): 35 | try: 36 | with open(pickle_file, 'rb') as f: 37 | letter_set = pickle.load(f) 38 | # let's shuffle the letters to have random validation and training set 39 | np.random.shuffle(letter_set) 40 | if valid_dataset is not None: # None for test dataSet 41 | valid_letter = letter_set[:vsize_per_class, :, :] 42 | valid_dataset[start_v:end_v, :, :] = valid_letter 43 | valid_labels[start_v:end_v] = label 44 | start_v += vsize_per_class 45 | end_v += vsize_per_class 46 | 47 | train_letter = letter_set[vsize_per_class:end_l, :, :] 48 | train_dataset[start_t:end_t, :, :] = train_letter 49 | train_labels[start_t:end_t] = label 50 | start_t += tsize_per_class 51 | end_t += tsize_per_class 52 | except Exception as e: 53 | print('Unable to process data from', pickle_file, ':', e) 54 | raise 55 | 56 | return valid_dataset, valid_labels, train_dataset, train_labels 57 | 58 | 59 | def randomize(dataset, labels): 60 | permutation = np.random.permutation(labels.shape[0]) 61 | shuffled_dataset = dataset[permutation, :, :] 62 | shuffled_labels = labels[permutation] 63 | return shuffled_dataset, shuffled_labels 64 | 65 | 66 | def merge_prune(train_floders, test_folders): 67 | train_datasets = maybe_pickle(train_folders, 45000) 68 | test_datasets = maybe_pickle(test_folders, 1800) 69 | 70 | train_size = 200000 71 | valid_size = 10000 72 | test_size = 10000 73 | 74 | valid_dataset, valid_labels, train_dataset, train_labels = merge_datasets( 75 | train_datasets, train_size, valid_size) 76 | _, _, test_dataset, test_labels = merge_datasets(test_datasets, test_size) 77 | 78 | print('Training:', train_dataset.shape, train_labels.shape) 79 | print('Validation:', valid_dataset.shape, valid_labels.shape) 80 | print('Testing:', test_dataset.shape, test_labels.shape) 81 | 82 | train_dataset, train_labels = randomize(train_dataset, train_labels) 83 | test_dataset, test_labels = randomize(test_dataset, test_labels) 84 | valid_dataset, valid_labels = randomize(valid_dataset, valid_labels) 85 | 86 | pickle_file = 'notMNIST.pickle' 87 | save = { 88 | 'train_dataset': train_dataset, 89 | 'train_labels': train_labels, 90 | 'valid_dataset': valid_dataset, 91 | 'valid_labels': valid_labels, 92 | 'test_dataset': test_dataset, 93 | 'test_labels': test_labels, 94 | } 95 | save_obj(pickle_file, save) 96 | 97 | 98 | if __name__ == "__main__": 99 | train_folders = ['notMNIST_large/A', 'notMNIST_large/B', 'notMNIST_large/C', 'notMNIST_large/D', 'notMNIST_large/E', 100 | 'notMNIST_large/F', 'notMNIST_large/G', 'notMNIST_large/H', 'notMNIST_large/I', 'notMNIST_large/J'] 101 | test_folders = ['notMNIST_small/A', 'notMNIST_small/B', 'notMNIST_small/C', 'notMNIST_small/D', 'notMNIST_small/E', 102 | 'notMNIST_small/F', 'notMNIST_small/G', 
'notMNIST_small/H', 'notMNIST_small/I', 'notMNIST_small/J'] 103 | merge_prune(train_folders, test_folders) 104 | -------------------------------------------------------------------------------- /src/not_mnist/clean_overlap.py: -------------------------------------------------------------------------------- 1 | import cPickle as pickle 2 | import os 3 | import numpy as np 4 | 5 | from not_mnist.img_pickle import load_pickle, save_obj 6 | 7 | image_size = 28 # Pixel width and height. 8 | 9 | 10 | def img_diff(pix_s1, pix_s2): # by pixels 11 | dif_cnt = 0 12 | height = image_size 13 | width = image_size 14 | total = width * height 15 | for x in range(height): 16 | for y in range(width): 17 | if pix_s1[x][y] != pix_s2[x][y]: 18 | dif_cnt += 1 19 | return float(dif_cnt) / float(total) 20 | 21 | 22 | def test_img_diff(): 23 | img1 = [[x for x in range(20)] for y in range(28)] 24 | img2 = [[x for x in range(20)] for y in range(28)] 25 | print(img_diff(img1, img2)) 26 | 27 | 28 | def img_in(img, imgs): 29 | for i, img2 in enumerate(imgs): 30 | if img_diff(img, img2) < 0.1: 31 | return True 32 | return False 33 | 34 | 35 | def BKDRHash(string): 36 | seed = 131 37 | hash = 0 38 | for ch in string: 39 | hash = hash * seed + ord(ch) 40 | return hash & 0x7FFFFFFF 41 | 42 | 43 | def img_hash(pix_s): 44 | seed = 131 45 | v_hash = 0 46 | for row in pix_s: 47 | for p in row: 48 | v_hash = v_hash * seed + int(p * 255) 49 | return v_hash & 0x7FFFFFFF 50 | 51 | 52 | def imgs_except(left, right): 53 | return filter(lambda img: not img_in(img, right), left) 54 | 55 | 56 | def test_imgs_diff(): 57 | img1 = [[x for x in range(20)] for y in range(28)] 58 | img2 = [[x for x in range(20)] for y in range(28)] 59 | img3 = [[x for x in range(20)] for y in range(28)] 60 | 61 | print(len(imgs_except([img2, img3], [img1]))) 62 | 63 | 64 | def imgs_idx_except(left, right): 65 | except_idxs = [] 66 | imgs = [] 67 | for i in range(len(left)): 68 | print('compare left[%d] to right' % i) 69 | # about 2-3 seconds for one compare between left[i] and all right 70 | if img_in(left[i], right): 71 | except_idxs.append(i) 72 | imgs.append(left[i]) 73 | return except_idxs, imgs 74 | 75 | 76 | def imgs_idx_hash_except(left, right): 77 | except_idxs = [] 78 | right_hashes = [img_hash(img) for img in right] 79 | print len(right_hashes) 80 | for i in range(len(left)): 81 | if img_hash(left[i]) in right_hashes: 82 | print('compare left[%d] to right found the same' % i) 83 | except_idxs.append(i) 84 | res = np.delete(left, except_idxs, axis=0) 85 | return except_idxs, res 86 | 87 | 88 | def list_except(objs, idxs): 89 | new_objs = [] 90 | for i in range(len(objs)): 91 | if i not in idxs: 92 | new_objs.append(objs[i]) 93 | return new_objs 94 | 95 | 96 | def clean(): 97 | datasets = load_pickle('notMNIST.pickle') 98 | test_dataset = datasets['test_dataset'] 99 | test_labels = datasets['test_labels'] 100 | print('test_dataset:%d' % len(test_dataset)) 101 | print('test_labels:%d' % len(test_labels)) 102 | 103 | except_valid_idx, valid_dataset = imgs_idx_hash_except(datasets['valid_dataset'], test_dataset) 104 | valid_labels = np.delete(datasets['valid_labels'], except_valid_idx) 105 | print('valid_dataset:%d' % len(valid_dataset)) 106 | print('valid_labels:%d' % len(valid_labels)) 107 | 108 | # except with valid_dataset 109 | except_train_idx, train_dataset = imgs_idx_hash_except(datasets['train_dataset'], valid_dataset) 110 | train_labels = np.delete(datasets['train_labels'], except_train_idx) 111 | # except with test_dataset 112 | 
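    # (train was already deduped against valid above; dedup it against test as well,
    #  so that overlap cannot inflate the measured test accuracy)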
except_train_idx, train_dataset = imgs_idx_hash_except(train_dataset, test_dataset) 113 | train_labels = np.delete(train_labels, except_train_idx) 114 | 115 | print('train_dataset:%d' % len(train_dataset)) 116 | print('train_labels:%d' % len(train_labels)) 117 | print('valid_dataset:%d' % len(valid_dataset)) 118 | print('valid_labels:%d' % len(valid_labels)) 119 | print('test_dataset:%d' % len(test_dataset)) 120 | print('test_labels:%d' % len(test_labels)) 121 | 122 | pickle_file = 'notMNIST_clean.pickle' 123 | save = { 124 | 'train_dataset': train_dataset, 125 | 'train_labels': train_labels, 126 | 'valid_dataset': valid_dataset, 127 | 'valid_labels': valid_labels, 128 | 'test_dataset': test_dataset, 129 | 'test_labels': test_labels, 130 | } 131 | save_obj(pickle_file, save) 132 | 133 | 134 | if __name__ == '__main__': 135 | clean() 136 | -------------------------------------------------------------------------------- /note/lesson-1/practical.md: -------------------------------------------------------------------------------- 1 | # Practical Aspects of Learning 2 | 3 | > Github工程地址:https://github.com/ahangchen/GDLnotes 4 | 5 | > 欢迎star,有问题可以到[Issue区](https://github.com/ahangchen/GDLnotes/issues)讨论 6 | 7 | > 官方教程[地址](https://classroom.udacity.com/courses/ud730/lessons/6370362152/concepts/63798118170923) 8 | 9 | > [视频](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Videos.zip)/[字幕](http://d2uz2655q5g6b2.cloudfront.net/6370362152/L1%20Machine%20Learning%20to%20Deep%20Learning%20Subtitles.zip)下载 10 | 11 | 12 | > 课程目标:学习简单的数据展示,熟悉以后要使用的数据 13 | 14 | ## Install Ipython NoteBook 15 | 16 | 可以参考这个[教程](http://opentechschool.github.io/python-data-intro/core/notebook.html) 17 | 18 | - 可以直接安装[anaconda](https://www.continuum.io/downloads),里面包含了各种库,也包含了ipython; 19 | - 推荐使用python2的版本,因为很多lib只支持python2,而且python3在升级中,支持3.4还是3.5是个很纠结的问题。 20 | - 安装anaconda后直接在终端输入 ipython notebook,则会运行一个ipython的server端,同时在你的浏览器中打开基于你终端目录的一个页面: 21 | ![](../../res/ipython_start.png) 22 | - 点开ipynb文件即可进入文件编辑页面 23 | ![](../../res/ipynb.png) 24 | 25 | 上图即为practical部分的教程,可以在github[下载](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/udacity) 26 | 27 | 官方推荐使用docker来进行这部分教程,但简单起见我们先用ipython notebook 28 | 29 | ## [安装tensorflow](https://github.com/ahangchen/GDLnotes/tree/master/note/tensorflow) 30 | 31 | ## notMNIST 32 | 33 | 修改的[MNIST](http://yann.lecun.com/exdb/mnist/),不够干净,更接近真实数据,比MNIST任务更困难。 34 | 35 | ## Todo 36 | 我将官方教程的一个文件拆成了多个(以文件持久化为边界),然后在[schedule.py](../../src/assign_1/schedule.py)里统一调用,在各个文件里可以执行各个部分的功能测试。 37 | 38 | - 下载 39 | - 使用urlretrieve来获取数据集notMNIST_large.tar.gz和notMNIST_small.tar.gz 40 | 41 | > 代码示例:[load_data.py](../../src/assign_1/load_data.py) 42 | 43 | - 解压 44 | - 使用tarfile模块来解压刚刚下载的压缩包 45 | 46 | > 代码示例:[extract.py](../../src/assign_1/extract.py) 47 | 48 | - 读图 - 展示 - 序列化 49 | - 用ndimage读取一部分图片,用pickle将读取到的对象(ndarray对象的list)序列化存储到磁盘 50 | - 用matplotlib.plot.imshow实现图片显示,可以展示任意的numpy.ndarray,详见show_imgs(dataset) 51 | - 这里展示的是二值化图片,可以设置显示为灰度图 52 | - 将每个class对应的图像数据集序列化到磁盘 53 | 54 | > 代码示例:[img_pickle.py](../../src/assign_1/img_pickle.py) 55 | 56 | - 整理数据集 57 | - 用pickle读取pickle文件, 58 | - 从train_folder中为10个class分别获取10000个valid_dataset和20000个train_dataset, 59 | - 其中对每个class读取到的数据,用random.shuffle将数据乱序化 60 | - 将各个class及其对应的label序列化到磁盘,分别为训练器和校验集 61 | - 从test_folder中为10个class分别获取10000个test_dataset, 62 | - 其中对每个class读取到的数据,用random.shuffle将数据乱序化 63 | - 将各个class及其对应的label序列化到磁盘,作为测试集 64 | 65 | > 代码示例[merge_prune.py](../../src/assign_1/merge_prune.py) 
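这里乱序化时要注意,data 和 label 必须用同一个排列一起打乱,否则就对不上号了。merge_prune.py 里 randomize 的思路大致如下(简化示意):

```python
import numpy as np

def randomize(dataset, labels):
    # 用同一个随机排列同时打乱数据和标签,保持一一对应
    permutation = np.random.permutation(labels.shape[0])
    return dataset[permutation, :, :], labels[permutation]
```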
66 | 67 | - 去除重复数据 68 | - load_pickle,加载dataset 69 | - 先将valid_dataset中与test_dataset重复部分剔除,再将train_dataset中与valid_dataset重复部分剔除 70 | - 每个dataset都是一个二维浮点数组的list,也可以理解为三维浮点数组, 71 | - 比较list中的每个图,也就是将list1中每个二维浮点数组与list2中每个二维浮点数组比较 72 | - 示例代码即为[clean_overlap.py](../../src/assign_1/clean_overlap.py)中的imgs_idx_except 73 | 74 | - 我们在拿list1中的一个元素跟list2中的一个元素比较时,总共需要比较len(list1) * len(list2) * image_size * image_size次,速度极慢 75 | - 实际上这是有重复的计算的,就在于,list2中的每个元素,都被遍历了len(list1)次 76 | - 因此有这样的一个优化,我们遍历每个图,用图中的灰度值,仿照BKDRHash,得到每个图都不同的hash值,比较hash值来比较图像 77 | - 示例代码即为[clean_overlap.py](../../src/assign_1/clean_overlap.py)中的imgs_idx_hash_except 78 | 79 | - 这样每个图都只需要访问一次,计算hash的时间变为(len(list1) + len(list2)) * image_size * image_size 80 | - 比较的次数是len(list1) * len(list2) 81 | - 由于我们的数据中,list1和list2的长度是大数,所以节省的时间是相当可观的 82 | - 在我的机器上,比较完valid_dataset和test_dataset需要的时间分别是25000秒(10000次比较,每次2-3秒)和60秒 83 | 84 | - 然后再将清理后的数据序列化到磁盘即可 85 | 86 | > 代码示例: [clean_overlap.py](../../src/assign_1/clean_overlap.py) 87 | 88 | - 训练一个logistics 模型 89 | - 将train_dataset作为输入,用valid_dataset进行验证(预测成功率81.9%) 90 | - 为了重复利用训练后的分类器,将其序列化到磁盘 91 | 92 | > 代码示例: [logistic_train.py](../../src/assign_1/logistic_train.py) 93 | 94 | - Measure Performance 95 | - 分类器会尝试去记住训练集 96 | - 遇到训练集中没有的数据时,分类器可能就没辙了 97 | - 所以我们应该measure的是,分类器如何产生新数据(生成能力(推导能力)越大,说明它应对新数据能力越强) 98 | - 仅measure分类器记忆数据集的能力并不能应对新数据(没有学到规律),所以不应该拿旧数据去measure 99 | - 因此measure的方式应该是拿新数据去看分类器的预测准确度(never see, can't memorize) 100 | 101 | - 但是在measure的过程中,我们会根据测试数据去重新调整分类器,使其对所有测试数据都生效 102 | - 也就是说测试数据变成了训练集的一部分,因此这部分数据我们只能作为valid_dataset,而不能用于衡量最后的performance 103 | 104 | - 解决方法之一即,最终进行performance measure的数据集,必须是调整分类器的过程中没有使用过的 105 | 106 | - 即坚持一个原则,测试数据不用于训练 107 | 108 | > 在机器学习比赛Kaggle中,有public data,validate data,并有用于测试(选手未知)的private data,只有在训练时自己的分类器时,预先取一部分数据作为test data, 109 | 才能不会在train和valid的过程中被已有数据所蒙蔽 110 | 111 | - Validation dataset 112 | - 验证集越大,验证的可信度越大 113 | - 统计学上,调整分类器后,当30个以上预测结果的正确性发生变化的话,这种变化是可信的,值得注意的,小于30是噪音 114 | - 因此Validation dataset通常数据要大于30000个,在准确率变化高于0.1%时,认为分类器的performance变化 115 | - 但这样需要的数据往往偏多,所以可以尝试交叉验证(cross validation),交叉验证有个缺点是速度慢 116 | - 验证时,使用tensor.eval(input),相当于tf.get_default_session().run(tensor) 117 | 118 | > 觉得得我的文章对您有帮助的话,就给个[star](https://github.com/ahangchen/GDLnotes)吧~ 119 | -------------------------------------------------------------------------------- /src/convnet/hyper_conv_mnist.py: -------------------------------------------------------------------------------- 1 | from convnet.conv_mnist import maxpool2d, load_reformat_not_mnist 2 | from neural.full_connect import accuracy 3 | 4 | import tensorflow as tf 5 | 6 | 7 | def up_div(y, x): 8 | if y % x > 0: 9 | return y / x + 1 10 | else: 11 | return y / x 12 | 13 | 14 | def size_by_conv(stride_ps, data_size, total_layer_cnt): 15 | param1 = data_size[1] 16 | param2 = data_size[2] 17 | for i in range(total_layer_cnt): 18 | param1 = up_div(param1, stride_ps[i][1]) 19 | param1 = up_div(param1, 2) 20 | param2 = up_div(param2, stride_ps[i][2]) 21 | param2 = up_div(param2, 2) 22 | return param1 * param2 * data_size[0] 23 | 24 | 25 | def conv_train(basic_hps, stride_ps, layer_cnt=3, drop=False, lrd=False): 26 | batch_size = basic_hps['batch_size'] 27 | patch_size = basic_hps['patch_size'] 28 | depth = basic_hps['depth'] 29 | num_hidden = basic_hps['num_hidden'] 30 | num_channels = basic_hps['num_channels'] 31 | 32 | graph = tf.Graph() 33 | with graph.as_default(): 34 | # Input data. 
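        # Training data is fed one mini-batch per step via the placeholders below (NHWC layout);
        # the validation/test sets are fixed, so they are embedded in the graph as constants.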
35 | tf_train_dataset = tf.placeholder( 36 | tf.float32, shape=(batch_size, image_size, image_size, num_channels)) 37 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 38 | tf_valid_dataset = tf.constant(valid_dataset) 39 | tf_test_dataset = tf.constant(test_dataset) 40 | 41 | # Variables. 42 | input_weights = tf.Variable(tf.truncated_normal( 43 | [patch_size, patch_size, num_channels, depth], stddev=0.1)) 44 | input_biases = tf.Variable(tf.zeros([depth])) 45 | 46 | mid_layer_cnt = layer_cnt - 1 47 | layer_weights = [tf.Variable(tf.truncated_normal( 48 | [patch_size, patch_size, depth, depth], stddev=0.1)) for _ in range(mid_layer_cnt)] 49 | layer_biases = [tf.Variable(tf.constant(1.0, shape=[depth])) for _ in range(mid_layer_cnt)] 50 | 51 | output_size = size_by_conv(stride_ps, [batch_size, image_size, image_size, num_channels], layer_cnt) 52 | output_weights = tf.Variable(tf.truncated_normal([output_size, num_hidden], stddev=0.1)) 53 | output_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden])) 54 | final_weights = tf.Variable(tf.truncated_normal( 55 | [num_hidden, num_labels], stddev=0.1)) 56 | final_biases = tf.Variable(tf.constant(1.0, shape=[num_labels])) 57 | 58 | # Model. 59 | def model(data): 60 | conv = tf.nn.conv2d(data, input_weights, stride_ps[0], use_cudnn_on_gpu=True, padding='SAME') 61 | conv = maxpool2d(conv) 62 | hidden = tf.nn.relu(conv + input_biases) 63 | if drop: 64 | hidden = tf.nn.dropout(hidden, 0.5) 65 | for i in range(mid_layer_cnt): 66 | print i 67 | conv = tf.nn.conv2d(hidden, layer_weights[i], stride_ps[i + 1], use_cudnn_on_gpu=True, padding='SAME') 68 | conv = maxpool2d(conv) 69 | hidden = tf.nn.relu(conv + layer_biases[i]) 70 | if drop: 71 | hidden = tf.nn.dropout(hidden, 0.7) 72 | 73 | shape = hidden.get_shape().as_list() 74 | reshape = tf.reshape(hidden, [shape[0], output_size]) 75 | 76 | hidden = tf.nn.relu(tf.matmul(reshape, output_weights) + output_biases) 77 | if drop: 78 | hidden = tf.nn.dropout(hidden, 0.8) 79 | return tf.matmul(hidden, final_weights) + final_biases 80 | 81 | # Training computation. 82 | logits = model(tf_train_dataset) 83 | loss = tf.reduce_mean( 84 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 85 | # Optimizer. 86 | if lrd: 87 | cur_step = tf.Variable(0) # count the number of steps taken. 88 | starter_learning_rate = 0.1 89 | learning_rate = tf.train.exponential_decay(starter_learning_rate, cur_step, 10000, 0.96, staircase=True) 90 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=cur_step) 91 | else: 92 | optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss) 93 | 94 | # Predictions for the training, validation, and test data. 
95 | train_prediction = tf.nn.softmax(logits) 96 | valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) 97 | test_prediction = tf.nn.softmax(model(tf_test_dataset)) 98 | num_steps = 5001 99 | 100 | with tf.Session(graph=graph) as session: 101 | tf.initialize_all_variables().run() 102 | print('Initialized') 103 | for step in range(num_steps): 104 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 105 | batch_data = train_dataset[offset:(offset + batch_size), :, :, :] 106 | batch_labels = train_labels[offset:(offset + batch_size), :] 107 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 108 | _, l, predictions = session.run( 109 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 110 | if step % 50 == 0: 111 | print('Minibatch loss at step %d: %f' % (step, l)) 112 | print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels)) 113 | print('Validation accuracy: %.1f%%' % accuracy( 114 | valid_prediction.eval(), valid_labels)) 115 | print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels)) 116 | 117 | 118 | if __name__ == '__main__': 119 | image_size = 28 120 | num_labels = 10 121 | train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = \ 122 | load_reformat_not_mnist(image_size, num_labels, 1) 123 | pick_size = 2048 124 | # piece of valid dataset to avoid OOM 125 | valid_dataset = valid_dataset[0: pick_size, :, :, :] 126 | valid_labels = valid_labels[0: pick_size, :] 127 | # piece of test dataset to avoid OOM 128 | test_dataset = test_dataset[0: pick_size, :, :, :] 129 | test_labels = test_labels[0: pick_size, :] 130 | # conv_max_pool_train() 131 | # conv_train() 132 | basic_hypers = { 133 | 'batch_size': 16, 134 | 'patch_size': 5, 135 | 'depth': 16, 136 | 'num_hidden': 64, 137 | 'num_channels': 1, 138 | } 139 | layer_sum = 3 140 | stride_params = [[1, 1, 1, 1] for _ in range(layer_sum - 1)] 141 | stride_params.append([1, 2, 2, 1]) 142 | conv_train(basic_hypers, stride_params, layer_cnt=layer_sum, lrd=True) 143 | -------------------------------------------------------------------------------- /note/lesson-4/unstand_lstm.md: -------------------------------------------------------------------------------- 1 | # 理解LSTM 网络 2 | - Posted on August 27, 2015 3 | 4 | ## 循环神经网络 5 | 6 | 人不会每时每刻都从抓取信息这一步开始思考。你在读这篇文章的时候,你对每个次的理解是基于你对以前的词汇的理解的。你不会把所有东西都释放出来然后再从抓取信息开始重新思考,你的思维是有持续性的。 7 | 8 | 传统的神经网络不能做到这一点, 而且好像这是传统神经网络的一个主要缺点。例如,想象你想要区分一个电影里的每个时刻正在发生的事情。一个传统的神经网络将会如何利用它对过去电影中事件的推理,来预测后续的事件,这个过程是不清晰的。 9 | 10 | 循环神经网络解决了这个问题。在循环神经网络里,有循环,允许信息持续产生作用。 11 | 12 | 图片名称 13 | 14 | 15 | 在上面的图中,一大块神经网络,A,观察一些输入xt,输出一个值ht。循环允许信息从网络的一步传到下一步。 16 | 17 | 这些循环使得循环神经网络似乎有点神秘。然而,如果你想多一点,其实它们跟一个正常的神经网络没有神秘区别。一个循环神经网络可以被认为是同一个网络的多重副本,每个部分会向继任者传递一个信息。想一想,如果我们展开了循环会发生什么: 18 | 19 | ![](../../res/RNN-unrolled.png) 20 | 21 | 22 | 这个链式本质揭示了,循环神经网络跟序列和列表是紧密相关的。它们是神经网络为这类数据而生的自然架构。 23 | 24 | 并且它们真的起了作用!在过去的几年里,应用RNN到许多问题中都取得了难以置信的成功:语音识别,语言建模,翻译,图像截取,等等。我会留一个话题,讨论学习Andrej Karpathy的博客能够取得多么令人惊艳的成绩: 25 | 26 | [The Unreasonable Effectiveness of Recurrent Neural Networks](http://karpathy.github.io/2015/05/21/rnn-effectiveness/)。但它们真的相当惊艳。 27 | 28 | 与这些成功紧密相关的是对LSTM的使用,一个非常特殊的循环神经网络的类型。它在许多任务上都能比标准的RNN工作的好得多。几乎所有基于RNN的神经网络取得的激动人心的成果都由LSTM获得。这篇文章将要探索的就是这些LSTM。 29 | 30 | ## Long-Term依赖问题 31 | 32 | RNN吸引人的一个地方是它们能够链接先前的信息与当前的任务,比如使用先前的视频帧可能预测对于当前帧的理解。如果RNN能够做到这种事情,它们会变得极度有用。但真的可以吗?不好说。 33 | 34 | 有时候,我们只需要查看最近的信息来执行现在的任务,例如,考虑一个语言模型试图基于先前的词预测下一个词。如果我们要预测“the clouds are in the sky”,我们不需要其他更遥远的上下文 —— 
非常明显,下一个词就应该是sky。在这样的例子中,相关信息和目的地之间的距离是很小的。RNN可以学着区使用过去的信息。 35 | 36 | ![](../../res/RNN-shorttermdepdencies.png) 37 | 38 | 但也有一些情况是我们需要更多上下文的。考虑预测这个句子中最后一个词:“I grew up in France... I speak fluent French.” 最近的信息表明下一个词可能是一种语言的名字,但如果我们想要找出是哪种语言,我们需要从更久远的地方获取France的上下文。相关信息和目标之间的距离完全可能是非常巨大的。 39 | 40 | 不幸的是,随着距离的增大,RNN变得不能够连接信息。 41 | 42 | ![](../../res/RNN-longtermdependencies.png) 43 | 44 | 长期依赖导致的神经网络困境 45 | 46 | 理论上,RNN是绝对能够处理这样的“长期依赖的”。人类可以仔细地从这些词中找到参数然后解决这种形式的一些雏形问题。然而,实践中,RNN似乎不能够学习到这些。 Hochreiter (1991) [German] 和 Bengio, et al. 1994年曾探索过这个问题,他们发现了一些非常根本的导致RNN难以生效的原因。 47 | 48 | 万幸的是,LSTM没有这个问题! 49 | 50 | 51 | ## LSTM 网络 52 | 53 | 长短期记忆网络 - 通常简称为“LSTMs”,是一种特殊的RNN,适用于学习长期依赖。 54 | 他们由[Hochreiter 和 Schmidhuber(1997)](http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf)介绍引入, 55 | 由许多其他的人们在后续的工作中重新定义和丰富。 56 | 他们在各种各样的问题中都工作的特别好,并且现在已经被广泛使用。 57 | 58 | LSTMs 是为了避免长期依赖问题而特殊设计的。为长期时间记忆信息实际上是他们默认的行为, 59 | 而非他们需要学习的东西! 60 | 61 | 所有RNN都有重复神经网络模型的链式形式。在标准的RNN中,这种重复模型会有一种非常简单的结构,比如简单的tanh层。 62 | 63 | ![](../../res/LSTM3-SimpleRNN.png) 64 | 65 | The repeating module in a standard RNN contains a single layer. 66 | 67 | LSTM也有这种链式结构,但重复单元有着一种不同的结构。里面不再是只有单一的神经网络层,里面有四个,以非常简单的方式起作用。 68 | 69 | ![](../../res/LSTM3-chain.png) 70 | 71 | The repeating module in an LSTM contains four interacting layers. 72 | 73 | 不要担心内部的细节。我们稍后会一步一步遍历LSTM图。现在,我们要熟悉我们将要使用的定义: 74 | 75 | ![](../../res/LSTM2-notation.png) 76 | 77 | 在上面的图中,每行都有一个箭头,从一个结点的输出到另外的结点的输入。粉色的圆代表结点操作,比如向量相加,而黄色的长方形是学习的神经网络层。 78 | 线的合并代表denote的链接,而箭头的分叉代表内容复制后流向不同的位置。 79 | 80 | ## LSTM背后的核心思想 81 | 82 | LSTM的关键在于cell的状态,也就是图中贯穿顶部的那条水平线。 83 | 84 | cell的状态像是一条传送带,它贯穿整条链,其中只发生一些小的线性作用。信息流过这条线而不改变是非常容易的。 85 | 86 | ![](../../res/LSTM3-C-line.png) 87 | 88 | 89 | LSTM确实有能力移除或增加信息到cell状态中,由被称为门的结构精细控制。 90 | 91 | 门是一种让信息可选地通过的方法。它们由一个sigmoid神经网络层和一个点乘操作组成。 92 | 93 | 图片名称 94 | 95 | sigmod层输出[0, 1]区间内的数,描述了每个部分中应该通过的比例。输出0意味着“什么都不能通过”,而输出1意味着“让所有东西通过!”。 96 | 97 | 一个LSTM有四个这样的门,以保护和控制cell的状态。 98 | 99 | ## 深入浅出LSTM 100 | 101 | 我们的LSTM的第一步是决定我们需要从cell状态中扔掉什么样的信息。这个决策由一个称为“遗忘门”的sigmoid层做出。它观察ht-1和xt,位cell状态Ct-1中每个number输出一个0和1之间的数。1代表“完全保留这个值”,而0代表“完全扔掉这个值”。 102 | 103 | 让我们回到我们那个基于上文预测最后一个词的语言模型。在这样一个问题中,cell的状态可能包含当前主题的种类,这样才能使用正确的名词。当我们看到一个新的主题的时候,我们会想要遗忘旧的主题的种类。 104 | 105 | ![](../../res/LSTM3-focus-f.png) 106 | 107 | 下一步是决定我们需要在cell state里存储什么样的信息。这个问题有两个部分。第一,一个sigmoid层调用“输入门”以决定哪些数据是需要更新的。然后,一个tanh层为新的候选值创建一个向量C~t,这些值能够加入state中。下一步,我们要将这两个部分合并以创建对state的更新。 108 | 109 | 在我们的语言模型的例子中,我们想要把主题的种类加入到cell state中,以替代我们要遗忘的旧的种类。 110 | 111 | ![](../../res/LSTM3-focus-i.png) 112 | 113 | 现在是时候更新旧的cell stateCt-1到新的cell stateCt。前一步已经决定了我们需要做的事情,我们只需要实现它。 114 | 115 | 我们把旧的state与ft相乘,遗忘我们先前决定遗忘的东西,然后我们加上it \* C~t。这是新的候选值,受我们对每个状态值的更新度约束而缩放。 116 | 117 | 在语言模型的例子中,这就是我们真正扔掉旧主题种类,并增加新的信息的地方,正如我们之前所决定的。 118 | 119 | ![](../../res/LSTM3-focus-C.png) 120 | 121 | 最后,我们需要决定要输出的东西。这个输出基于我们的cell state,但会是一个过滤版本。首先,我们运行一个sigmoid层,以决定cell state中的那个部分是我们将要输出的。然后我们把cell state放进tanh(将数值压到-1和1之间),最后将它与sigmoid门的输出相乘,这样我们就只输出了我们想要的部分了。 122 | 123 | ![](../../res/LSTM3-focus-o.png) 124 | 125 | 语言模型的例子中,由于它仅关注一个主题,它可能会输出与一个动词相关的信息,以防后面还有其他的词。比如,它可能输出这个主题是单数还是复数,让我们知道如果后面还有东西,动词才会对应出现。 126 | 127 | ![](../../res/LSTM3-focus-o-1.png) 128 | 129 | 130 | ## LSTM变种 131 | 132 | 到目前为止我所描述的是一种非常普通的LSTM,但不是所有的LSTM都和上面描述的这种一样。事实上,几乎所有涉及LSTM的文章用的版本都稍有不同,差别微小,但值得一谈。 133 | 134 | 一种由[Gers & Schmidhuber (2000)](ftp://ftp.idsia.ch/pub/juergen/TimeCount-IJCNN2000.pdf)介绍的广受欢迎的LSTM变种,添加了“门镜连接”。这意味着我们可以让门观察cell状态。 135 | 136 | ![](../../res/LSTM3-var-peepholes.png) 137 | 138 | 上面的图为每个门都添加了门镜,但许多文章只会给一部分门镜。 139 | 140 | 
另一种变种是使用多个遗忘门和输入门。我们不再分别判断该遗忘和添加的东西,我们同时做出决策。我们只在填充某个位置的时候遗忘原来的东西,我们值在遗忘某些东西的时候输入新的数据。 141 | 142 | ![](../../res/LSTM3-var-tied.png) 143 | 144 | 一个稍微更奇特的变种是循环门单元(Gated Recurrent Unit,GRU),由 [Cho, et al. (2014)](http://arxiv.org/pdf/1406.1078v3.pdf)提出。它组合了遗忘门和输入门到一个单独的“更新门”中。它也合并了cell state和hidden state,并且做了一些其他的改变。结果模型比标准LSTM模型更简单,并且正越来越受欢迎。 145 | 146 | ![](../../res/LSTM3-var-GRU.png) 147 | 148 | A gated recurrent unit neural network. 149 | 150 | 这些只是一些最值得一提的LSTM变种。还有许多其他种类,像[Yao, et al. (2015)](http://arxiv.org/pdf/1508.03790v2.pdf)提出的Depth Gate RNN。也有许多复杂的不同方法来处理长期依赖,像 [Koutnik, et al. (2014)](http://arxiv.org/pdf/1402.3511v1.pdf)提出的Clockwork RNN。 151 | 152 | 哪种变种是最好的?这些区别重要吗? [Greff, et al. (2015)](http://arxiv.org/pdf/1503.04069.pdf)对流行的变种做了一个很好的比较,发现它们都是一样的。[Jozefowicz, et al. (2015)](http://jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)测试了超过一万中RNN结构,发现某些任务情形下,有些比LSTM工作得更好。 153 | 154 | ## 结论 155 | 156 | 首先,我讲述了人们用RNN获得的巨大成果。而这些成果都用到了LSTM,它们在大多数任务中都工作得好得多! 157 | 158 | 列出方程的话,LSTM看起来很吓人。幸好,在这篇文章里一步步看下来让它们变得相对可以接受了些。 159 | 160 | LSTM是我们在RNN上取得的一大步。我们自然会想:还有另一个突破口吗?研究人员中的一个通常的观点是“有!是注意力!”思路是让一个RNN收集信息的每一步都关注更大的一个信息。例如,如果你用一个RNN抽取图片信息来描述它,RNN可能可以为每个输出的词都从图片拿一部分进行分析。事实上,[Xu, et al. (2015)](http://arxiv.org/pdf/1502.03044v2.pdf)就是这样做的 - 这可能是一个有趣的出发点,如果你想要探索注意力这个话题的话。已经有许多令人惊艳的成果了,并且似乎还有更多不为人知的研究。 161 | 162 | 注意力不是RNN研究中唯一刺激的线。例如,网格LSTM([Kalchbrenner, et al. (2015)](http://arxiv.org/pdf/1507.01526v1.pdf)),生产模型中使用RNN( [Gregor, et al. (2015)](http://arxiv.org/pdf/1502.04623.pdf), [Chung, et al. (2015)](http://arxiv.org/pdf/1506.02216v3.pdf), or [Bayer & Osendorfer (2015)](http://arxiv.org/pdf/1411.7610v3.pdf)),都很有趣。最近几年是RNN的黄金时代,下一年更是如此。 163 | 164 | ## 致谢 165 | 略 -------------------------------------------------------------------------------- /src/rnn/word2vec.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import math 6 | import collections 7 | 8 | from matplotlib import pylab 9 | from sklearn.manifold import TSNE 10 | 11 | from not_mnist.img_pickle import save_obj 12 | from not_mnist.load_data import maybe_download 13 | 14 | 15 | def read_data(filename): 16 | """Extract the first file enclosed in a zip file as a list of words""" 17 | with zipfile.ZipFile(filename) as f: 18 | data = tf.compat.as_str(f.read(f.namelist()[0])).split() 19 | return data 20 | 21 | 22 | def build_dataset(words): 23 | count = [['UNK', -1]] 24 | count.extend(collections.Counter(words).most_common(vocabulary_size - 1)) 25 | dictionary = dict() 26 | for word, _ in count: 27 | dictionary[word] = len(dictionary) 28 | data = list() 29 | unk_count = 0 30 | for word in words: 31 | if word in dictionary: 32 | index = dictionary[word] 33 | else: 34 | index = 0 # dictionary['UNK'] 35 | unk_count = unk_count + 1 36 | data.append(index) 37 | count[0][1] = unk_count 38 | reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 39 | return data, count, dictionary, reverse_dictionary 40 | 41 | 42 | def generate_batch(batch_size, num_skips, skip_window): 43 | global data_index 44 | assert batch_size % num_skips == 0 45 | assert num_skips <= 2 * skip_window 46 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 47 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 48 | span = 2 * skip_window + 1 # [ skip_window target skip_window ] 49 | buffer = collections.deque(maxlen=span) 50 | for _ in range(span): 51 | buffer.append(data[data_index]) 52 | data_index = (data_index + 1) % 
len(data) 53 | for i in range(batch_size // num_skips): 54 | target = skip_window # target label at the center of the buffer 55 | targets_to_avoid = [skip_window] 56 | for j in range(num_skips): 57 | while target in targets_to_avoid: 58 | target = random.randint(0, span - 1) 59 | targets_to_avoid.append(target) 60 | batch[i * num_skips + j] = buffer[skip_window] 61 | labels[i * num_skips + j, 0] = buffer[target] 62 | buffer.append(data[data_index]) 63 | data_index = (data_index + 1) % len(data) 64 | return batch, labels 65 | 66 | 67 | # load data 68 | url = 'http://mattmahoney.net/dc/' 69 | filename = maybe_download('text8.zip', 31344016, url=url) 70 | 71 | # read data 72 | words = read_data(filename) 73 | print('Data size %d' % len(words)) 74 | 75 | vocabulary_size = 50000 76 | context_size = 1 77 | 78 | data, count, dictionary, reverse_dictionary = build_dataset(words) 79 | print('Most common words (+UNK)', count[:5]) 80 | print('Sample data', data[:10]) 81 | del words # Hint to reduce memory. 82 | 83 | # split data 84 | data_index = 0 85 | 86 | print('data:', [reverse_dictionary[di] for di in data[:8]]) 87 | 88 | for num_skips, skip_window in [(2, 1), (4, 2)]: 89 | batch, labels = generate_batch(batch_size=8, num_skips=num_skips, skip_window=skip_window) 90 | print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window)) 91 | print(' batch:', [reverse_dictionary[bi] for bi in batch]) 92 | print(' labels:', [reverse_dictionary[li] for li in labels.reshape(8)]) 93 | 94 | batch_size = 128 95 | embedding_size = 128 # Dimension of the embedding vector. 96 | skip_window = 1 # How many words to consider left and right. 97 | num_skips = 2 # How many times to reuse an input to generate a label. 98 | # We pick a random validation set to sample nearest neighbors. here we limit the 99 | # validation samples to the words that have a low numeric ID, which by 100 | # construction are also the most frequent. 101 | valid_size = 16 # Random set of words to evaluate similarity on. 102 | valid_window = 100 # Only pick dev samples in the head of the distribution. 103 | valid_examples = np.array(random.sample(range(valid_window), valid_size)) 104 | num_sampled = 64 # Number of negative examples to sample. 105 | 106 | # tensor: Train a skip-gram model, word2vec 107 | graph = tf.Graph() 108 | 109 | with graph.as_default(), tf.device('/cpu:0'): 110 | # Input data. 111 | train_dataset = tf.placeholder(tf.int32, shape=[batch_size]) 112 | train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) 113 | valid_dataset = tf.constant(valid_examples, dtype=tf.int32) 114 | 115 | # Variables. 116 | embeddings = tf.Variable( 117 | tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) 118 | softmax_weights = tf.Variable( 119 | tf.truncated_normal([vocabulary_size, embedding_size], 120 | stddev=1.0 / math.sqrt(embedding_size))) 121 | softmax_biases = tf.Variable(tf.zeros([vocabulary_size])) 122 | 123 | # Model. 124 | # Look up embeddings for inputs. 125 | embed = tf.nn.embedding_lookup(embeddings, train_dataset) 126 | # Compute the softmax loss, using a sample of the negative labels each time. 127 | loss = tf.reduce_mean( 128 | tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed, 129 | train_labels, num_sampled, vocabulary_size)) 130 | 131 | # Optimizer. 132 | optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) 133 | 134 | # Compute the similarity between minibatch examples and all embeddings. 
135 | # We use the cosine distance: 136 | norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) 137 | normalized_embeddings = embeddings / norm 138 | valid_embeddings = tf.nn.embedding_lookup( 139 | normalized_embeddings, valid_dataset) 140 | similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings)) 141 | 142 | # flow 143 | num_steps = 100001 144 | 145 | with tf.Session(graph=graph) as session: 146 | tf.initialize_all_variables().run() 147 | print('Initialized') 148 | average_loss = 0 149 | for step in range(num_steps): 150 | batch_data, batch_labels = generate_batch( 151 | batch_size, num_skips, skip_window) 152 | feed_dict = {train_dataset: batch_data, train_labels: batch_labels} 153 | _, l = session.run([optimizer, loss], feed_dict=feed_dict) 154 | average_loss += l 155 | if step % 2000 == 0: 156 | if step > 0: 157 | average_loss /= 2000 158 | # The average loss is an estimate of the loss over the last 2000 batches. 159 | print('Average loss at step %d: %f' % (step, average_loss)) 160 | average_loss = 0 161 | # note that this is expensive (~20% slowdown if computed every 500 steps) 162 | if step % 10000 == 0: 163 | sim = similarity.eval() 164 | for i in range(valid_size): 165 | valid_word = reverse_dictionary[valid_examples[i]] 166 | top_k = 8 # number of nearest neighbors 167 | nearest = (-sim[i, :]).argsort()[1:top_k + 1] 168 | log = 'Nearest to %s:' % valid_word 169 | for k in range(top_k): 170 | close_word = reverse_dictionary[nearest[k]] 171 | log = '%s %s,' % (log, close_word) 172 | print(log) 173 | final_embeddings = normalized_embeddings.eval() 174 | save_obj('text8_embed.pickle', final_embeddings) 175 | 176 | num_points = 400 177 | 178 | tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000) 179 | two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points + 1, :]) 180 | 181 | 182 | def plot(embeddings, labels): 183 | assert embeddings.shape[0] >= len(labels), 'More labels than embeddings' 184 | pylab.figure(figsize=(15, 15)) # in inches 185 | for i, label in enumerate(labels): 186 | x, y = embeddings[i, :] 187 | pylab.scatter(x, y) 188 | pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', 189 | ha='right', va='bottom') 190 | pylab.show() 191 | 192 | 193 | words = [reverse_dictionary[i] for i in range(1, num_points + 1)] 194 | plot(two_d_embeddings, words) 195 | -------------------------------------------------------------------------------- /note/lesson-4/rnn_practice.md: -------------------------------------------------------------------------------- 1 | # 循环神经网络实践 2 | ## 加载数据 3 | - 使用[text8](http://mattmahoney.net/dc/textdata)作为训练的文本数据集 4 | 5 | text8中只包含27种字符:小写的从a到z,以及空格符。如果把它打出来,读起来就像是去掉了所有标点的wikipedia。 6 | 7 | - 直接调用lesson1中maybe_download下载text8.zip 8 | - 用zipfile读取zip内容为字符串,并拆分成单词list 9 | - 用connections模块统计单词数量并找出最常见的单词 10 | 11 | 12 | 达成随机取数据的目标 13 | 14 | ## 构造计算单元 15 | 16 | ```python 17 | embeddings = tf.Variable( 18 | tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) 19 | ``` 20 | 21 | - 构造一个vocabulary_size x embedding_size的矩阵,作为embeddings容器, 22 | - 有vocabulary_size个容量为embedding_size的向量,每个向量代表一个vocabulary, 23 | - 每个向量的中的分量的值都在-1到1之间随机分布 24 | 25 | ```python 26 | embed = tf.nn.embedding_lookup(embeddings, train_dataset) 27 | ``` 28 | 29 | - 调用tf.nn.embedding_lookup,索引与train_dataset对应的向量,相当于用train_dataset作为一个id,去检索矩阵中与这个id对应的embedding 30 | 31 | ```python 32 | loss = tf.reduce_mean( 33 | tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, embed, 34 | train_labels, 
num_sampled, vocabulary_size)) 35 | ``` 36 | 37 | - 采样计算训练损失 38 | 39 | ```python 40 | optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) 41 | ``` 42 | 43 | - 自适应梯度调节器,调节embedding列表的数据,使得偏差最小 44 | 45 | - 预测,并用cos值计算预测向量与实际数据的夹角作为预测准确度(相似度)指标 46 | 47 | ## 传入数据进行训练 48 | - 切割数据用于训练,其中: 49 | 50 | ```python 51 | data_index = (data_index + 1) % len(data) 52 | ``` 53 | 54 | - 依旧是每次取一部分随机数据传入 55 | - 等距离截取一小段文本 56 | - 构造训练集:每个截取窗口的中间位置作为一个train_data 57 | - 构造标签:每个截取窗口中,除了train_data之外的部分,随机取几个成为一个list,作为label(这里只随机取了一个) 58 | - 这样就形成了根据目标词汇预测上下文的机制,即Skip-gram 59 | - 训练100001次,每2000次输出这两千次的平均损失 60 | - 每10000次计算相似度,并输出与验证集中的词最接近的词汇列表 61 | - 用tSNE降维呈现词汇接近程度 62 | - 用matplotlib绘制结果 63 | 64 | ![](../../res/word2vec_res.png) 65 | 66 | 代码见:[word2vec.py](../../src/rnn/word2vec.py) 67 | 68 | ## CBOW 69 | 上面训练的是Skip-gram模型,是根据目标词汇预测上下文,而word2vec还有一种方式,CBOW,根据上下文预测目标词汇。 70 | 71 | 实际上就是将Skip-gram中的输入输出反过来。 72 | 73 | - 修改截取数据的方式 74 | - 构造标签:每个截取窗口的中间位置作为一个train_label 75 | - 构造训练集:每个截取窗口中,除了train_label之外的部分,作为train_data(这里只随机取了一个) 76 | - 这样就形成了根据上下文预测目标词汇的机制,即CBOW 77 | 78 | - 分别从embeding里找到train_data里每个word对应的vector,用tf.reduce_sum将其相加,将相加结果与train_label比较 79 | 80 | ```python 81 | # Look up embeddings for inputs. 82 | embed = tf.nn.embedding_lookup(embeddings, train_dataset) 83 | # sum up vectors on first dimensions, as context vectors 84 | embed_sum = tf.reduce_sum(embed, 0) 85 | ``` 86 | 87 | - 训练中依旧是调节embeding的参数来优化loss 88 | - 训练结果如下图,可以看到不同单词的接近程度 89 | 90 | ![](../../res/cbow_res.png) 91 | 92 | 代码见:[cbow.py](../../src/rnn/cbow.py) 93 | 94 | ## RNN 造句 95 | 整体思路是,以一个文本中的一个词作为train data,后续的所有词作为train label,从而能够根据一个给定词,预测后续的片段。 96 | 97 | ### 训练数据 98 | - BatchGenerator 99 | - text: 全部的文本数据 100 | - text_size:全部文本的字符串长度 101 | - batch_size:每段训练数据的大小 102 | - num_unrollings:要生成的训练数据段的数目 103 | - segment:整个训练数据集可以分成几个训练数据片段 104 | - cursor:重要, 105 | - 一开始记录每个训练数据片段的起始位置坐标,即这个片段位于text的哪个index 106 | - 执行next_batch生成一个训练数据的时候,游标会从初始位置自增,直到取够batch_size个数据 107 | - last_batch:上一个训练数据片段 108 | - 每调用一次next,生成一个num_unrollings长的array,以last_batch开头,跟着num_unrollings个batch 109 | - 每个batch的作为train_input,每个batch后面的一个batch作为train_label,每个step训练num_unrolling个batch 110 | 111 | 112 | ### lstm-cell 113 | - 为了解决消失的梯度问题,引入lstm-cell,增强model的记忆能力 114 | - 根据这篇论文设计lstm-cell: http://arxiv.org/pdf/1402.1128v1.pdf 115 | - 分别有三个门:输入门,遗忘门,输出门,构成一个cell 116 | - 输入数据是num_nodes个词,可能有vocabulary_size种词 117 | - 输入门: 118 | 119 | ```python 120 | input_gate = sigmoid(i * ix + o * im + ib) 121 | ``` 122 | 123 | - 给输入乘一个vocabulary_size * num_nodes大小的矩阵,给输出乘一个num_nodes * num_nodes大小的矩阵; 124 | - 用这两个矩阵调节对输入数据的取舍程度 125 | - 用sigmoid这个非线性函数进行激活 126 | 127 | - 遗忘门: 128 | 129 | ```python 130 | forget_gate = sigmoid(i * fx + o * fm + fb) 131 | ``` 132 | 133 | 思路同输入门,用以对历史数据做取舍 134 | 135 | - 输出门: 136 | 137 | ```python 138 | output_gate = sigmoid(i * ox + o * om + ob) 139 | ``` 140 | 141 | 思路同输入门,用以对输出状态做取舍 142 | 143 | - 组合: 144 | 145 | ```python 146 | update = i * cx + o * cm + cb 147 | state = forget_gate * state + input_gate * tanh(update) 148 | lstm_cell = output_gate * tanh(state) 149 | ``` 150 | 151 | - 用同样的方式构造新状态update 152 | - 用遗忘门处理历史状态state 153 | - 用tanh激活新状态update 154 | - 用输入门处理新状态update 155 | - 整合新旧状态,再用tanh激活状态state 156 | - 用输出门处理state 157 | 158 | ### lstm优化 159 | 上面的cell中,update,output_gate,forget_gate,input_gate计算方法都是一样的, 160 | 可以把四组参数分别合并,一次计算,再分别取出: 161 | 162 | ```python 163 | values = tf.split(1, gate_count, tf.matmul(i, input_weights) + tf.matmul(o, output_weights) + bias) 164 | input_gate = tf.sigmoid(values[0]) 165 | forget_gate = tf.sigmoid(values[1]) 
166 | update = values[2] 167 | ``` 168 | 169 | 再将lstm-cell的输出扔到一个WX+b中调整作为输出 170 | 171 | 代码见:[singlew_lstm.py](../../src/rnn/singlew_lstm.py) 172 | 173 | ### Optimizer 174 | - 采用one-hot encoding作为label预测 175 | - 采用交叉熵计算损失 176 | - 引入learning rate decay 177 | 178 | ### Flow 179 | - 填入训练数据到placeholder中 180 | - 验证集的准确性用logprob来计算,即对可能性取对数 181 | - 每10次训练随机挑取5个字母作为起始词,进行造句测试 182 | - 你可能注意到输出的sentence是由sample得到的词组成的,而非选择概率最高的词,这是因为,如果一直取概率最高的词,最后会一直重复这个概率最高的词 183 | 184 | ## Beam Search 185 | 上面的流程里,每次都是以一个字符作为单位,可以使用多一点的字符做预测,取最高概率的那个,防止特殊情况导致的误判 186 | 187 | 在这里我们增加字符为2个,形成bigram,代码见:[bigram_lstm.py](../../src/rnn/bigram_lstm.py) 188 | 189 | 主要通过BigramBatchGenerator类实现 190 | 191 | ## Embedding look up 192 | 193 | 由于bigram情况下,vocabulary_size变为 27\*27个,使用one-hot encoding 做predict的话会产生非常稀疏的矩阵,浪费算力,计算速度慢 194 | 195 | 因此引入embedding_lookup,代码见[embed_bigram_lstm.py](../../src/rnn/embed_bigram_lstm.py) 196 | 197 | - 数据输入:BatchGenerator不再生成one-hot-encoding的向量作为输入,而是直接生成bigram对应的index列表 198 | - embedding look up调整embedding,使bigram与vector对应起来 199 | - 将embedding look up的结果喂给lstm cell即可 200 | - 输出时,需要将label和output都转为One-hot-encoding,才能用交叉熵和softmax计算损失 201 | - 在tensor里做data到one-hot-encoding转换时,主要依赖tf.gather函数 202 | - 在对valid数据做转换时,主要依赖one_hot_voc函数 203 | 204 | ## Drop out 205 | - 在lstm cell中对input和output做drop out 206 | - Refer to this [article](http://arxiv.org/abs/1409.2329) 207 | 208 | ## Seq2Seq 209 | - 最后一个问题是,将一个句子中每个词转为它的逆序字符串,也就是一个seq到seq的转换 210 | - 正经的实现思路是,word 2 vector 2 lstm 2 vector 2 word 211 | - 不过tensorflow已经有了这样一个模型来做这件事情:Seq2SeqModel,关于这个模型可以看[这个分析](http://www.cnblogs.com/edwardbi/p/5559338.html) 212 | 以及tensorflow的[example](https://github.com/tensorflow/tensorflow/blob/63409bd23facad471973b110df998782c0e19c06/tensorflow/models/rnn/translate/translate.py#L132) 213 | - 只需要从batch中,根据字符串逆序的规律生成target sequence,放到seq2seqmodel里即可,主要依赖rev_id函数 214 | - 实现见seq2seq.py 215 | - 注意,用Seq2SeqModel的时候,size和num_layer会在学习到正确的规律前就收敛,我把它调大了一点 216 | 217 | ```python 218 | def create_model(sess, forward_only): 219 | model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_size, 220 | target_vocab_size=vocabulary_size, 221 | buckets=[(20, 21)], 222 | size=256, 223 | num_layers=4, 224 | max_gradient_norm=5.0, 225 | batch_size=batch_size, 226 | learning_rate=1.0, 227 | learning_rate_decay_factor=0.9, 228 | use_lstm=True, 229 | forward_only=forward_only) 230 | return model 231 | ``` 232 | - 参数含义 233 | - source_vocab_size: size of the source vocabulary. 234 | - target_vocab_size: size of the target vocabulary. 235 | - buckets: a list of pairs (I, O), where I specifies maximum input length 236 | that will be processed in that bucket, and O specifies maximum output 237 | length. Training instances that have inputs longer than I or outputs 238 | longer than O will be pushed to the next bucket and padded accordingly. 239 | We assume that the list is sorted, e.g., [(2, 4), (8, 16)]. 240 | - size: number of units in each layer of the model. 241 | - num_layers: number of layers in the model. 242 | - max_gradient_norm: gradients will be clipped to maximally this norm. 243 | - batch_size: the size of the batches used during training; 244 | the model construction is independent of batch_size, so it can be 245 | changed after initialization if this is convenient, e.g., for decoding. 246 | - learning_rate: learning rate to start with. 247 | - learning_rate_decay_factor: decay learning rate by this much when needed. 248 | - use_lstm: if true, we use LSTM cells instead of GRU cells. 
249 | - num_samples: number of samples for sampled softmax. 250 | - forward_only: if set, we do not construct the backward pass in the model. 251 | 252 | ## 参考链接 253 | - [林洲汉-知乎](https://www.zhihu.com/question/28473843/answer/68797210) 254 | - [词向量](http://www.jeyzhang.com/tensorflow-learning-notes-3.html) 255 | - [rudolfix - udacity_deeplearn](https://github.com/rudolfix/udacity_deeplearn/) 256 | - [Edwardbi - 解析Tensorflow官方English-Franch翻译器demo](http://www.cnblogs.com/edwardbi/p/5559338.html) 257 | 258 | -------------------------------------------------------------------------------- /src/rnn/cbow.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import tensorflow as tf 3 | import numpy as np 4 | import random 5 | import math 6 | import collections 7 | 8 | from matplotlib import pylab 9 | from sklearn.manifold import TSNE 10 | 11 | from not_mnist.img_pickle import save_obj, load_pickle 12 | from not_mnist.load_data import maybe_download 13 | 14 | 15 | def read_data(filename): 16 | """Extract the first file enclosed in a zip file as a list of words""" 17 | with zipfile.ZipFile(filename) as f: 18 | data = tf.compat.as_str(f.read(f.namelist()[0])).split() 19 | return data 20 | 21 | 22 | def build_dataset(words): 23 | count = [['UNK', -1]] 24 | count.extend(collections.Counter(words).most_common(vocabulary_size - 1)) 25 | dictionary = dict() 26 | for word, _ in count: 27 | dictionary[word] = len(dictionary) 28 | data = list() 29 | unk_count = 0 30 | for word in words: 31 | if word in dictionary: 32 | index = dictionary[word] 33 | else: 34 | index = 0 # dictionary['UNK'] 35 | unk_count = unk_count + 1 36 | data.append(index) 37 | count[0][1] = unk_count 38 | reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 39 | return data, count, dictionary, reverse_dictionary 40 | 41 | 42 | def generate_batch(batch_size, num_skips, skip_window): 43 | global data_index 44 | assert batch_size % num_skips == 0 45 | assert num_skips <= 2 * skip_window 46 | context_size = 2 * skip_window 47 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 48 | batchs = np.ndarray(shape=(context_size, batch_size), dtype=np.int32) 49 | span = 2 * skip_window + 1 # [ skip_window target skip_window ] 50 | buffer = collections.deque(maxlen=span) 51 | for _ in range(span): 52 | buffer.append(data[data_index]) 53 | data_index = (data_index + 1) % len(data) 54 | 55 | # use data of batch_size to create train_data-label set of batch_size // num_skips * num_skips 56 | for i in range(batch_size // num_skips): 57 | target = skip_window # target label at the center of the buffer 58 | for j in range(num_skips): 59 | labels[i * num_skips + j, 0] = buffer[target] 60 | met_target = False 61 | for bj in range(context_size): 62 | if bj == target: 63 | met_target = True 64 | if met_target: 65 | batchs[bj, i * num_skips + j] = buffer[bj + 1] 66 | else: 67 | batchs[bj, i * num_skips + j] = buffer[bj] 68 | 69 | buffer.append(data[data_index]) 70 | data_index = (data_index + 1) % len(data) 71 | # print('generate batch') 72 | # print(batchs) 73 | return batchs, labels 74 | 75 | 76 | data_set = load_pickle('text8_data.pickle') 77 | if data_set is None: 78 | # load data 79 | url = 'http://mattmahoney.net/dc/' 80 | filename = maybe_download('text8.zip', 31344016, url=url) 81 | 82 | # read data 83 | words = read_data(filename) 84 | print('Data size %d' % len(words)) 85 | data, count, dictionary, reverse_dictionary = build_dataset(words) 86 | print('Most common words 
(+UNK)', count[:5]) 87 | print('Sample data', data[:10]) 88 | del words # Hint to reduce memory. 89 | data_set = { 90 | 'data': data, 'count': count, 'dictionary': dictionary, 'reverse_dictionary': reverse_dictionary, 91 | } 92 | save_obj('text8_data.pickle', data_set) 93 | else: 94 | data = data_set['data'] 95 | count = data_set['count'] 96 | dictionary = data_set['dictionary'] 97 | reverse_dictionary = data_set['reverse_dictionary'] 98 | 99 | vocabulary_size = 50000 100 | # split data 101 | data_index = 0 102 | 103 | print('data:', [reverse_dictionary[di] for di in data[:8]]) 104 | 105 | for num_skips, skip_window in [(2, 1), (4, 2)]: 106 | test_size = 8 107 | batch, labels = generate_batch(batch_size=test_size, num_skips=num_skips, skip_window=skip_window) 108 | print('\nwith num_skips = %d and skip_window = %d:' % (num_skips, skip_window)) 109 | print(' batch:', [reverse_dictionary[bi] for bi in batch.reshape(-1)]) 110 | print(' labels:', [reverse_dictionary[li] for li in labels.reshape(-1)]) 111 | 112 | batch_size = 128 113 | embedding_size = 128 # Dimension of the embedding vector. 114 | skip_window = 1 # How many words to consider left and right. 115 | num_skips = 2 # How many times to reuse an input to generate a label. 116 | # We pick a random validation set to sample nearest neighbors. here we limit the 117 | # validation samples to the words that have a low numeric ID, which by 118 | # construction are also the most frequent. 119 | valid_size = 16 # Random set of words to evaluate similarity on. 120 | valid_window = 100 # Only pick dev samples in the head of the distribution. 121 | valid_examples = np.array(random.sample(range(valid_window), valid_size)) 122 | num_sampled = 64 # Number of negative examples to sample. 123 | 124 | # tensor: Train a skip-gram model, word2vec 125 | graph = tf.Graph() 126 | 127 | with graph.as_default(), tf.device('/cpu:0'): 128 | # Input data. 129 | train_dataset = tf.placeholder(tf.int32, shape=[2 * skip_window, batch_size]) 130 | train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) 131 | valid_dataset = tf.constant(valid_examples, shape=[2 * skip_window, batch_size], dtype=tf.int32) 132 | 133 | # Variables. 134 | embeddings = tf.Variable( 135 | tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) 136 | softmax_weights = tf.Variable( 137 | tf.truncated_normal([vocabulary_size, embedding_size], 138 | stddev=1.0 / math.sqrt(embedding_size))) 139 | softmax_biases = tf.Variable(tf.zeros([vocabulary_size])) 140 | 141 | # Model. 142 | # Look up embeddings for inputs. 143 | embed = tf.nn.embedding_lookup(embeddings, train_dataset) 144 | # sum up vectors on first dimensions, as context vectors 145 | embed_sum = tf.reduce_sum(embed, 0) 146 | 147 | # Compute the softmax loss, using a sample of the negative labels each time. 148 | loss = tf.reduce_mean( 149 | tf.nn.sampled_softmax_loss(softmax_weights, softmax_biases, 150 | embed_sum, 151 | train_labels, num_sampled, vocabulary_size)) 152 | 153 | # Optimizer. 154 | optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss) 155 | 156 | # Compute the similarity between minibatch examples and all embeddings. 
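    # The embeddings are normalized to unit length first, so the dot products computed by
    # the matmul below behave as cosine similarities between words (larger = more similar).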
157 | # We use the cosine distance: 158 | norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True)) 159 | normalized_embeddings = embeddings / norm 160 | valid_embeddings = tf.nn.embedding_lookup( 161 | normalized_embeddings, valid_dataset) 162 | # sum up vectors 163 | valid_embeddings_sum = tf.reduce_sum(valid_embeddings, 0) 164 | similarity = tf.matmul(valid_embeddings_sum, tf.transpose(normalized_embeddings)) 165 | 166 | # flow 167 | num_steps = 100001 168 | 169 | with tf.Session(graph=graph) as session: 170 | tf.initialize_all_variables().run() 171 | print('Initialized') 172 | average_loss = 0 173 | for step in range(num_steps): 174 | batch_data, batch_labels = generate_batch( 175 | batch_size, num_skips, skip_window) 176 | # print(batch_data.shape) 177 | # print(batch_labels.shape) 178 | feed_dict = {train_dataset: batch_data, train_labels: batch_labels} 179 | _, l = session.run([optimizer, loss], feed_dict=feed_dict) 180 | average_loss += l 181 | if step % 2000 == 0: 182 | if step > 0: 183 | average_loss /= 2000 184 | # The average loss is an estimate of the loss over the last 2000 batches. 185 | print('Average loss at step %d: %f' % (step, average_loss)) 186 | average_loss = 0 187 | # note that this is expensive (~20% slowdown if computed every 500 steps) 188 | if step % 10000 == 0: 189 | sim = similarity.eval() 190 | for i in range(valid_size): 191 | valid_word = reverse_dictionary[valid_examples[i]] 192 | top_k = 8 # number of nearest neighbors 193 | nearest = (-sim[i, :]).argsort()[1:top_k + 1] 194 | log = 'Nearest to %s:' % valid_word 195 | for k in range(top_k): 196 | close_word = reverse_dictionary[nearest[k]] 197 | log = '%s %s,' % (log, close_word) 198 | print(log) 199 | final_embeddings = normalized_embeddings.eval() 200 | save_obj('text8_embed.pickle', final_embeddings) 201 | 202 | num_points = 400 203 | 204 | tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000) 205 | two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points + 1, :]) 206 | 207 | 208 | def plot(embeddings, labels): 209 | assert embeddings.shape[0] >= len(labels), 'More labels than embeddings' 210 | pylab.figure(figsize=(15, 15)) # in inches 211 | for i, label in enumerate(labels): 212 | x, y = embeddings[i, :] 213 | pylab.scatter(x, y) 214 | pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', 215 | ha='right', va='bottom') 216 | pylab.show() 217 | 218 | 219 | words = [reverse_dictionary[i] for i in range(1, num_points + 1)] 220 | plot(two_d_embeddings, words) 221 | -------------------------------------------------------------------------------- /src/rnn/seq2seq.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import math 3 | import string 4 | import zipfile 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | from tensorflow.models.rnn.translate import seq2seq_model 9 | 10 | from not_mnist.img_pickle import save_obj, load_pickle 11 | from not_mnist.load_data import maybe_download 12 | 13 | 14 | def read_data(filename): 15 | f = zipfile.ZipFile(filename) 16 | for name in f.namelist(): 17 | return tf.compat.as_str(f.read(name)) 18 | f.close() 19 | 20 | 21 | data_set = load_pickle('text8_text.pickle') 22 | if data_set is None: 23 | # load data 24 | url = 'http://mattmahoney.net/dc/' 25 | filename = maybe_download('text8.zip', 31344016, url=url) 26 | 27 | # read data 28 | text = read_data(filename) 29 | print('Data size %d' % len(text)) 30 | save_obj('text8_text.pickle', text) 31 | 
else: 32 | text = data_set 33 | 34 | # Create a small validation set. 35 | valid_size = 100 36 | valid_text = text[:valid_size] 37 | train_text = text[valid_size:] 38 | train_size = len(train_text) 39 | print(train_size, train_text[:64]) 40 | print(valid_size, valid_text[:64]) 41 | 42 | vocabulary_size = 35 # len(string.ascii_lowercase) + 2 # [a-z] + ' ' 43 | first_letter = ord(string.ascii_lowercase[0]) 44 | 45 | 46 | def char2id(char): 47 | if char in string.ascii_lowercase: 48 | return ord(char) - first_letter + 5 49 | elif char == ' ': 50 | return 4 51 | elif char == '!': 52 | return 31 53 | else: 54 | print('Unexpected character: %s' % char) 55 | return 0 56 | 57 | 58 | def id2char(dictid): 59 | if dictid == 31: 60 | return '!' 61 | elif dictid > 4: 62 | return chr(dictid + first_letter - 5) 63 | elif dictid == 4: 64 | return ' ' 65 | else: 66 | return '@' 67 | 68 | 69 | print(char2id('a'), char2id('z'), char2id(' '), char2id('!')) 70 | print(id2char(5), id2char(30), id2char(4), id2char(31)) 71 | batch_size = 64 72 | num_unrollings = 19 73 | 74 | 75 | class BatchGenerator(object): 76 | def __init__(self, text, batch_size, num_unrollings): 77 | self._text = text 78 | self._text_size = len(text) 79 | self._batch_size = batch_size 80 | self._num_unrollings = num_unrollings 81 | segment = self._text_size // num_unrollings 82 | self._cursor = [offset * segment for offset in range(batch_size)] 83 | self._last_batch = self._next_batch(0) 84 | 85 | def _next_batch(self, step): 86 | """Generate a single batch from the current cursor position in the data.""" 87 | batch = '' 88 | # print('text size', self._text_size) 89 | for b in range(self._num_unrollings): 90 | # print(self._cursor[step]) 91 | self._cursor[step] %= self._text_size 92 | batch += self._text[self._cursor[step]] 93 | self._cursor[step] += 1 94 | return batch 95 | 96 | def next(self): 97 | """Generate the next array of batches from the data. The array consists of 98 | the last batch of the previous array, followed by num_unrollings new ones. 
99 | """ 100 | batches = [self._last_batch] 101 | for step in range(self._batch_size): 102 | batches.append(self._next_batch(step)) 103 | self._last_batch = batches[-1] 104 | return batches 105 | 106 | 107 | def characters(probabilities): 108 | """Turn a 1-hot encoding or a probability distribution over the possible 109 | characters back into its (most likely) character representation.""" 110 | return [id2char(c) for c in np.argmax(probabilities, 1)] 111 | 112 | 113 | def ids(probabilities): 114 | """Turn a 1-hot encoding or a probability distribution over the possible 115 | characters back into its (most likely) character representation.""" 116 | return [str(c) for c in np.argmax(probabilities, 1)] 117 | 118 | 119 | def batches2id(batches): 120 | """Convert a sequence of batches back into their (most likely) string 121 | representation.""" 122 | s = [''] * batches[0].shape[0] 123 | for b in batches: 124 | s = [''.join(x) for x in zip(s, ids(b))] 125 | return s 126 | 127 | train_batches = BatchGenerator(train_text, batch_size, num_unrollings) 128 | valid_batches = BatchGenerator(valid_text, 1, num_unrollings) 129 | 130 | 131 | def rev_id(forward): 132 | temp = forward.split(' ') 133 | backward = [] 134 | for i in range(len(temp)): 135 | backward += temp[i][::-1] + ' ' 136 | return map(lambda x: char2id(x), backward[:-1] + ['!']) 137 | 138 | 139 | def create_model(sess, forward_only): 140 | model = seq2seq_model.Seq2SeqModel(source_vocab_size=vocabulary_size, 141 | target_vocab_size=vocabulary_size, 142 | buckets=[(20, 21)], 143 | size=256, 144 | num_layers=4, 145 | max_gradient_norm=5.0, 146 | batch_size=batch_size, 147 | learning_rate=1.0, 148 | learning_rate_decay_factor=0.9, 149 | use_lstm=True, 150 | forward_only=forward_only) 151 | return model 152 | 153 | 154 | with tf.Session() as sess: 155 | model = create_model(sess, False) 156 | sess.run(tf.initialize_all_variables()) 157 | num_steps = 30001 158 | 159 | # This is the training loop. 160 | step_time, loss = 0.0, 0.0 161 | current_step = 0 162 | previous_losses = [] 163 | step_ckpt = 100 164 | valid_ckpt = 500 165 | 166 | for step in range(1, num_steps): 167 | model.batch_size = batch_size 168 | train_batches_next = train_batches.next() 169 | batches = train_batches_next 170 | train_sets = [] 171 | batch_encs = map(lambda x: map(lambda y: char2id(y), list(x)), batches) 172 | batch_decs = map(lambda x: rev_id(x), batches) 173 | for i in range(len(batch_encs)): 174 | train_sets.append((batch_encs[i], batch_decs[i])) 175 | 176 | # Get a batch and make a step. 177 | encoder_inputs, decoder_inputs, target_weights = model.get_batch([train_sets], 0) 178 | _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, False) 179 | 180 | loss += step_loss / step_ckpt 181 | 182 | # Once in a while, we save checkpoint, print statistics, and run evals. 183 | if step % step_ckpt == 0: 184 | # Print statistics for the previous epoch. 185 | perplexity = math.exp(loss) if loss < 300 else float('inf') 186 | print ("global step %d learning rate %.4f perplexity " 187 | "%.2f" % (model.global_step.eval(), model.learning_rate.eval(), perplexity)) 188 | # Decrease learning rate if no improvement was seen over last 3 times. 
189 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 190 | sess.run(model.learning_rate_decay_op) 191 | previous_losses.append(loss) 192 | 193 | loss = 0.0 194 | 195 | if step % valid_ckpt == 0: 196 | v_loss = 0.0 197 | 198 | model.batch_size = 1 199 | batches = ['the quick brown fox'] 200 | test_sets = [] 201 | batch_encs = map(lambda x: map(lambda y: char2id(y), list(x)), batches) 202 | # batch_decs = map(lambda x: rev_id(x), batches) 203 | test_sets.append((batch_encs[0], [])) 204 | # Get a 1-element batch to feed the sentence to the model. 205 | encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0) 206 | # Get output logits for the sentence. 207 | _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True) 208 | 209 | # This is a greedy decoder - outputs are just argmaxes of output_logits. 210 | outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] 211 | # If there is an EOS symbol in outputs, cut them at that point. 212 | 213 | if char2id('!') in outputs: 214 | outputs = outputs[:outputs.index(char2id('!'))] 215 | 216 | print('>>>>>>>>> ', batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs))) 217 | 218 | for _ in range(valid_size): 219 | model.batch_size = 1 220 | v_batches = valid_batches.next() 221 | valid_sets = [] 222 | v_batch_encs = map(lambda x: map(lambda y: char2id(y), list(x)), v_batches) 223 | v_batch_decs = map(lambda x: rev_id(x), v_batches) 224 | for i in range(len(v_batch_encs)): 225 | valid_sets.append((v_batch_encs[i], v_batch_decs[i])) 226 | encoder_inputs, decoder_inputs, target_weights = model.get_batch([valid_sets], 0) 227 | _, eval_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True) 228 | v_loss += eval_loss / valid_size 229 | 230 | eval_ppx = math.exp(v_loss) if v_loss < 300 else float('inf') 231 | print(" valid eval: perplexity %.2f" % (eval_ppx)) 232 | 233 | # reuse variable -> subdivide into two boxes 234 | model.batch_size = 1 # We decode one sentence at a time. 235 | batches = ['the quick brown fox'] 236 | test_sets = [] 237 | batch_encs = map(lambda x: map(lambda y: char2id(y), list(x)), batches) 238 | # batch_decs = map(lambda x: rev_id(x), batches) 239 | test_sets.append((batch_encs[0], [])) 240 | # Get a 1-element batch to feed the sentence to the model. 241 | encoder_inputs, decoder_inputs, target_weights = model.get_batch([test_sets], 0) 242 | # Get output logits for the sentence. 243 | _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, 0, True) 244 | # This is a greedy decoder - outputs are just argmaxes of output_logits. 245 | outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] 246 | print ('## : ', outputs) 247 | # If there is an EOS symbol in outputs, cut them at that point. 
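    # '!' serves as the end-of-sequence marker in this setup (char2id('!') == 31), so the
    # decoded output is truncated at its first occurrence.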
248 | if char2id('!') in outputs: 249 | outputs = outputs[:outputs.index(char2id('!'))] 250 | 251 | print(batches[0], ' -> ', ''.join(map(lambda x: id2char(x), outputs))) 252 | -------------------------------------------------------------------------------- /src/neural/nn_overfit.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from neural.full_connect import load_reformat_not_mnist, accuracy 7 | 8 | 9 | def tf_better_nn(offset_range=-1, regular=False, drop_out=False, lrd=False): 10 | batch_size = 128 11 | 12 | graph = tf.Graph() 13 | with graph.as_default(): 14 | # Input data. For the training data, we use a placeholder that will be fed 15 | # at run time with a training minibatch. 16 | tf_train_dataset = tf.placeholder(tf.float32, 17 | shape=(batch_size, image_size * image_size)) 18 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 19 | tf_valid_dataset = tf.constant(valid_dataset) 20 | tf_test_dataset = tf.constant(test_dataset) 21 | 22 | hidden_node_count = 1024 23 | # Variables. 24 | weights1 = tf.Variable( 25 | tf.truncated_normal([image_size * image_size, hidden_node_count])) 26 | biases1 = tf.Variable(tf.zeros([hidden_node_count])) 27 | 28 | weights2 = tf.Variable( 29 | tf.truncated_normal([hidden_node_count, num_labels])) 30 | biases2 = tf.Variable(tf.zeros([num_labels])) 31 | 32 | # Training computation. right most 33 | ys = tf.matmul(tf_train_dataset, weights1) + biases1 34 | hidden = tf.nn.relu(ys) 35 | h_fc = hidden 36 | 37 | valid_y0 = tf.matmul(tf_valid_dataset, weights1) + biases1 38 | valid_hidden1 = tf.nn.relu(valid_y0) 39 | 40 | test_y0 = tf.matmul(tf_test_dataset, weights1) + biases1 41 | test_hidden1 = tf.nn.relu(test_y0) 42 | 43 | # enable DropOut 44 | keep_prob = tf.placeholder(tf.float32) 45 | if drop_out: 46 | hidden_drop = tf.nn.dropout(hidden, keep_prob) 47 | h_fc = hidden_drop 48 | 49 | # left most 50 | logits = tf.matmul(h_fc, weights2) + biases2 51 | # only drop out when train 52 | logits_predict = tf.matmul(hidden, weights2) + biases2 53 | valid_predict = tf.matmul(valid_hidden1, weights2) + biases2 54 | test_predict = tf.matmul(test_hidden1, weights2) + biases2 55 | # loss 56 | l2_loss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(biases1) + tf.nn.l2_loss(weights2) + tf.nn.l2_loss(biases2) 57 | # enable regularization 58 | if not regular: 59 | l2_loss = 0 60 | beta = 0.002 61 | loss = tf.reduce_mean( 62 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + beta * l2_loss 63 | 64 | # Optimizer. 65 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 66 | if lrd: 67 | cur_step = tf.Variable(0) # count the number of steps taken. 68 | starter_learning_rate = 0.1 69 | learning_rate = tf.train.exponential_decay(starter_learning_rate, cur_step, 10000, 0.96, staircase=True) 70 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=cur_step) 71 | 72 | # Predictions for the training, validation, and test data. 73 | train_prediction = tf.nn.softmax(logits_predict) 74 | valid_prediction = tf.nn.softmax(valid_predict) 75 | test_prediction = tf.nn.softmax(test_predict) 76 | 77 | num_steps = 30001 78 | 79 | with tf.Session(graph=graph) as session: 80 | tf.initialize_all_variables().run() 81 | print("Initialized") 82 | for step in range(num_steps): 83 | # Pick an offset within the training data, which has been randomized. 
84 | # Note: we could use better randomization across epochs. 85 | if offset_range == -1: 86 | offset_range = train_labels.shape[0] - batch_size 87 | 88 | offset = (step * batch_size) % offset_range 89 | # Generate a minibatch. 90 | batch_data = train_dataset[offset:(offset + batch_size), :] 91 | batch_labels = train_labels[offset:(offset + batch_size), :] 92 | # Prepare a dictionary telling the session where to feed the minibatch. 93 | # The key of the dictionary is the placeholder node of the graph to be fed, 94 | # and the value is the numpy array to feed to it. 95 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, keep_prob: 0.5} 96 | _, l, predictions = session.run( 97 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 98 | if step % 500 == 0: 99 | print("Minibatch loss at step %d: %f" % (step, l)) 100 | print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)) 101 | print("Validation accuracy: %.1f%%" % accuracy( 102 | valid_prediction.eval(), valid_labels)) 103 | print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels)) 104 | 105 | 106 | def tf_deep_nn(regular=False, drop_out=False, lrd=False, layer_cnt=2): 107 | batch_size = 128 108 | 109 | graph = tf.Graph() 110 | with graph.as_default(): 111 | tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size)) 112 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 113 | tf_valid_dataset = tf.constant(valid_dataset) 114 | tf_test_dataset = tf.constant(test_dataset) 115 | 116 | hidden_node_count = 1024 117 | # start weight 118 | hidden_stddev = np.sqrt(2.0 / 784) 119 | weights1 = tf.Variable(tf.truncated_normal([image_size * image_size, hidden_node_count], stddev=hidden_stddev)) 120 | biases1 = tf.Variable(tf.zeros([hidden_node_count])) 121 | # middle weight 122 | weights = [] 123 | biases = [] 124 | hidden_cur_cnt = hidden_node_count 125 | for i in range(layer_cnt - 2): 126 | if hidden_cur_cnt > 2: 127 | hidden_next_cnt = int(hidden_cur_cnt / 2) 128 | else: 129 | hidden_next_cnt = 2 130 | hidden_stddev = np.sqrt(2.0 / hidden_cur_cnt) 131 | weights.append(tf.Variable(tf.truncated_normal([hidden_cur_cnt, hidden_next_cnt], stddev=hidden_stddev))) 132 | biases.append(tf.Variable(tf.zeros([hidden_next_cnt]))) 133 | hidden_cur_cnt = hidden_next_cnt 134 | # first wx + b 135 | y0 = tf.matmul(tf_train_dataset, weights1) + biases1 136 | # first relu 137 | hidden = tf.nn.relu(y0) 138 | hidden_drop = hidden 139 | # first DropOut 140 | keep_prob = 0.5 141 | if drop_out: 142 | hidden_drop = tf.nn.dropout(hidden, keep_prob) 143 | # first wx+b for valid 144 | valid_y0 = tf.matmul(tf_valid_dataset, weights1) + biases1 145 | valid_hidden = tf.nn.relu(valid_y0) 146 | # first wx+b for test 147 | test_y0 = tf.matmul(tf_test_dataset, weights1) + biases1 148 | test_hidden = tf.nn.relu(test_y0) 149 | 150 | # middle layer 151 | for i in range(layer_cnt - 2): 152 | y1 = tf.matmul(hidden_drop, weights[i]) + biases[i] 153 | hidden_drop = tf.nn.relu(y1) 154 | if drop_out: 155 | keep_prob += 0.5 * i / (layer_cnt + 1) 156 | hidden_drop = tf.nn.dropout(hidden_drop, keep_prob) 157 | 158 | y0 = tf.matmul(hidden, weights[i]) + biases[i] 159 | hidden = tf.nn.relu(y0) 160 | 161 | valid_y0 = tf.matmul(valid_hidden, weights[i]) + biases[i] 162 | valid_hidden = tf.nn.relu(valid_y0) 163 | 164 | test_y0 = tf.matmul(test_hidden, weights[i]) + biases[i] 165 | test_hidden = tf.nn.relu(test_y0) 166 | 167 | # last weight 168 | weights2 = 
tf.Variable(tf.truncated_normal([hidden_cur_cnt, num_labels], stddev=hidden_stddev / 2)) 169 | biases2 = tf.Variable(tf.zeros([num_labels])) 170 | # last wx + b 171 | logits = tf.matmul(hidden_drop, weights2) + biases2 172 | 173 | # predicts 174 | logits_predict = tf.matmul(hidden, weights2) + biases2 175 | valid_predict = tf.matmul(valid_hidden, weights2) + biases2 176 | test_predict = tf.matmul(test_hidden, weights2) + biases2 177 | 178 | l2_loss = 0 179 | # enable regularization 180 | if regular: 181 | l2_loss = tf.nn.l2_loss(weights1) + tf.nn.l2_loss(weights2) 182 | for i in range(len(weights)): 183 | l2_loss += tf.nn.l2_loss(weights[i]) 184 | # l2_loss += tf.nn.l2_loss(biases[i]) 185 | beta = 0.25 / batch_size 186 | beta = 1e-5 187 | l2_loss *= beta 188 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + l2_loss 189 | 190 | # Optimizer. 191 | if lrd: 192 | cur_step = tf.Variable(0, trainable=False) # count the number of steps taken. 193 | starter_learning_rate = 0.4 194 | learning_rate = tf.train.exponential_decay(starter_learning_rate, cur_step, 100000, 0.96, staircase=True) 195 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=cur_step) 196 | else: 197 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 198 | 199 | # Predictions for the training, validation, and test data. 200 | train_prediction = tf.nn.softmax(logits_predict) 201 | valid_prediction = tf.nn.softmax(valid_predict) 202 | test_prediction = tf.nn.softmax(test_predict) 203 | 204 | num_steps = 20001 205 | 206 | with tf.Session(graph=graph) as session: 207 | tf.initialize_all_variables().run() 208 | print("Initialized") 209 | for step in range(num_steps): 210 | offset_range = train_labels.shape[0] - batch_size 211 | offset = (step * batch_size) % offset_range 212 | batch_data = train_dataset[offset:(offset + batch_size), :] 213 | batch_labels = train_labels[offset:(offset + batch_size), :] 214 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 215 | _, l, predictions = session.run( 216 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 217 | if step % 500 == 0: 218 | print("Minibatch loss at step %d: %f" % (step, l)) 219 | print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)) 220 | print("Validation accuracy: %.1f%%" % accuracy( 221 | valid_prediction.eval(), valid_labels)) 222 | print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels)) 223 | 224 | if __name__ == '__main__': 225 | image_size = 28 226 | num_labels = 10 227 | train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = \ 228 | load_reformat_not_mnist(image_size, num_labels) 229 | # tf_better_nn(regular=True) 230 | # tf_better_nn(offset_range=1000) 231 | # tf_better_nn(offset_range=1000, drop_out=True) 232 | # tf_better_nn(lrd=True) 233 | tf_deep_nn(layer_cnt=6, lrd=True, drop_out=True, regular=True) 234 | -------------------------------------------------------------------------------- /src/rnn/bigram_lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | import string 4 | import zipfile 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from not_mnist.img_pickle import save_obj, load_pickle 10 | from not_mnist.load_data import maybe_download 11 | 12 | 13 | def read_data(filename): 14 | f = zipfile.ZipFile(filename) 15 | for name in f.namelist(): 16 | return 
tf.compat.as_str(f.read(name)) 17 | f.close() 18 | 19 | 20 | data_set = load_pickle('text8_text.pickle') 21 | if data_set is None: 22 | # load data 23 | url = 'http://mattmahoney.net/dc/' 24 | filename = maybe_download('text8.zip', 31344016, url=url) 25 | 26 | # read data 27 | text = read_data(filename) 28 | print('Data size %d' % len(text)) 29 | save_obj('text8_text.pickle', text) 30 | else: 31 | text = data_set 32 | 33 | # Create a small validation set. 34 | valid_size = 1000 35 | valid_text = text[:valid_size] 36 | train_text = text[valid_size:] 37 | train_size = len(train_text) 38 | print(train_size, train_text[:64]) 39 | print(valid_size, valid_text[:64]) 40 | 41 | vocabulary_size = (len(string.ascii_lowercase) + 1) * (len(string.ascii_lowercase) + 1) # [a-z] + ' ' 42 | 43 | idx2bi = {} 44 | bi2idx = {} 45 | idx = 0 46 | for i in ' ' + string.ascii_lowercase: 47 | for j in ' ' + string.ascii_lowercase: 48 | idx2bi[idx] = i + j 49 | bi2idx[i + j] = idx 50 | idx += 1 51 | 52 | 53 | def bi2id(char): 54 | if char in bi2idx.keys(): 55 | return bi2idx[char] 56 | else: 57 | print('Unexpected character: %s' % char) 58 | return 0 59 | 60 | 61 | def id2bi(dictid): 62 | if 0 <= dictid < len(idx2bi): 63 | return idx2bi[dictid] 64 | else: 65 | return ' ' 66 | 67 | 68 | print(bi2id('ad'), bi2id('zf'), bi2id(' '), bi2id('r '), bi2id('ï')) 69 | print(id2bi(31), id2bi(708), id2bi(0), id2bi(486)) 70 | 71 | batch_size = 64 72 | num_unrollings = 10 73 | 74 | 75 | class BigramBatchGenerator(object): 76 | def __init__(self, text, batch_size, num_unrollings): 77 | self._text = text 78 | self._text_size = len(text) 79 | self._batch_size = batch_size 80 | self._num_unrollings = num_unrollings 81 | segment = self._text_size // batch_size 82 | # print 'self._text_size, batch_size, segment', self._text_size, batch_size, segment 83 | self._cursor = [offset * segment for offset in range(batch_size)] 84 | # print self._cursor 85 | self._last_batch = self._next_batch() 86 | 87 | def _next_batch(self): 88 | """Generate a single batch from the current cursor position in the data.""" 89 | batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float) 90 | for b in range(self._batch_size): 91 | batch[b, bi2id(self._text[self._cursor[b]:self._cursor[b] + 2])] = 1.0 92 | self._cursor[b] = (self._cursor[b] + 2) % self._text_size 93 | return batch 94 | 95 | def next(self): 96 | """Generate the next array of batches from the data. The array consists of 97 | the last batch of the previous array, followed by num_unrollings new ones. 
98 | """ 99 | batches = [self._last_batch] 100 | for step in range(self._num_unrollings): 101 | batches.append(self._next_batch()) 102 | self._last_batch = batches[-1] 103 | return batches 104 | 105 | 106 | def characters(probabilities): 107 | """Turn a 1-hot encoding or a probability distribution over the possible 108 | characters back into its (mostl likely) character representation.""" 109 | return [id2bi(c) for c in np.argmax(probabilities, 1)] 110 | 111 | 112 | def batches2string(batches): 113 | """Convert a sequence of batches back into their (most likely) string 114 | representation.""" 115 | s = [''] * batches[0].shape[0] 116 | for b in batches: 117 | s = [''.join(x) for x in zip(s, characters(b))] 118 | return s 119 | 120 | 121 | train_batches = BigramBatchGenerator(train_text, batch_size, num_unrollings) 122 | valid_batches = BigramBatchGenerator(valid_text, 1, 1) 123 | 124 | print(batches2string(train_batches.next())) 125 | print(batches2string(train_batches.next())) 126 | print(batches2string(valid_batches.next())) 127 | print(batches2string(valid_batches.next())) 128 | 129 | 130 | def logprob(predictions, labels): 131 | # prevent negative probability 132 | """Log-probability of the true labels in a predicted batch.""" 133 | predictions[predictions < 1e-10] = 1e-10 134 | return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0] 135 | 136 | 137 | def sample_distribution(distribution): 138 | """Sample one element from a distribution assumed to be an array of normalized 139 | probabilities. 140 | """ 141 | # 取一部分数据用于评估,所取数据比例随机 142 | r = random.uniform(0, 1) 143 | s = 0 144 | for i in range(len(distribution)): 145 | s += distribution[i] 146 | if s >= r: 147 | return i 148 | return len(distribution) - 1 149 | 150 | 151 | def sample(prediction): 152 | """Turn a (column) prediction into 1-hot encoded samples.""" 153 | p = np.zeros(shape=[1, vocabulary_size], dtype=np.float) 154 | p[0, sample_distribution(prediction[0])] = 1.0 155 | return p 156 | 157 | 158 | def random_distribution(): 159 | """Generate a random column of probabilities.""" 160 | b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size]) 161 | return b / np.sum(b, 1)[:, None] 162 | 163 | 164 | num_nodes = 64 165 | 166 | graph = tf.Graph() 167 | with graph.as_default(): 168 | # Parameters: 169 | # Input, Forget, Memory, Output gate: input, previous output, and bias. 170 | ifcox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes * 4], -0.1, 0.1)) 171 | ifcom = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1)) 172 | ifcob = tf.Variable(tf.zeros([1, num_nodes * 4])) 173 | 174 | # Variables saving state across unrollings. 175 | saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 176 | saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 177 | # Classifier weights and biases. 178 | w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1)) 179 | b = tf.Variable(tf.zeros([vocabulary_size])) 180 | 181 | 182 | def _slice(_x, n, dim): 183 | return _x[:, n * dim:(n + 1) * dim] 184 | 185 | 186 | # Definition of the cell computation. 
187 | def lstm_cell(i, o, state): 188 | 189 | ifco_gates = tf.matmul(i, ifcox) + tf.matmul(o, ifcom) + ifcob 190 | 191 | input_gate = tf.sigmoid(_slice(ifco_gates, 0, num_nodes)) 192 | forget_gate = tf.sigmoid(_slice(ifco_gates, 1, num_nodes)) 193 | update = _slice(ifco_gates, 2, num_nodes) 194 | state = forget_gate * state + input_gate * tf.tanh(update) 195 | output_gate = tf.sigmoid(_slice(ifco_gates, 3, num_nodes)) 196 | return output_gate * tf.tanh(state), state 197 | 198 | 199 | # Input data. 200 | train_data = list() 201 | for _ in range(num_unrollings + 1): 202 | train_data.append( 203 | tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size])) 204 | 205 | train_inputs = train_data[:num_unrollings] 206 | train_labels = train_data[1:] # labels are inputs shifted by one time step. 207 | # print('#######', train_inputs) 208 | # print('#######', train_labels) 209 | 210 | # Unrolled LSTM loop. 211 | outputs = list() 212 | output = saved_output 213 | state = saved_state 214 | for i in train_inputs: 215 | output, state = lstm_cell(i, output, state) 216 | outputs.append(output) 217 | 218 | # State saving across unrollings. 219 | with tf.control_dependencies([saved_output.assign(output), 220 | saved_state.assign(state)]): 221 | # Classifier. 222 | logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b) 223 | loss = tf.reduce_mean( 224 | tf.nn.softmax_cross_entropy_with_logits( 225 | logits, tf.concat(0, train_labels))) 226 | 227 | # Optimizer. 228 | global_step = tf.Variable(0) 229 | learning_rate = tf.train.exponential_decay( 230 | 10.0, global_step, 5000, 0.1, staircase=True) 231 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 232 | gradients, v = zip(*optimizer.compute_gradients(loss)) 233 | gradients, _ = tf.clip_by_global_norm(gradients, 1.25) 234 | optimizer = optimizer.apply_gradients( 235 | zip(gradients, v), global_step=global_step) 236 | 237 | # Predictions. 238 | train_prediction = tf.nn.softmax(logits) 239 | 240 | # Sampling and validation eval: batch 1, no unrolling. 241 | sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) 242 | saved_sample_output = tf.Variable(tf.zeros([1, num_nodes])) 243 | saved_sample_state = tf.Variable(tf.zeros([1, num_nodes])) 244 | reset_sample_state = tf.group( 245 | saved_sample_output.assign(tf.zeros([1, num_nodes])), 246 | saved_sample_state.assign(tf.zeros([1, num_nodes]))) 247 | sample_output, sample_state = lstm_cell( 248 | sample_input, saved_sample_output, saved_sample_state) 249 | with tf.control_dependencies([saved_sample_output.assign(sample_output), 250 | saved_sample_state.assign(sample_state)]): 251 | sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) 252 | 253 | num_steps = 7001 254 | summary_frequency = 100 255 | 256 | with tf.Session(graph=graph) as session: 257 | tf.initialize_all_variables().run() 258 | print('Initialized') 259 | mean_loss = 0 260 | for step in range(num_steps): 261 | batches = train_batches.next() 262 | feed_dict = dict() 263 | for i in range(num_unrollings + 1): 264 | feed_dict[train_data[i]] = batches[i] 265 | _, l, predictions, lr = session.run( 266 | [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict) 267 | mean_loss += l 268 | if step % summary_frequency == 0: 269 | if step > 0: 270 | mean_loss /= summary_frequency 271 | # The mean loss is an estimate of the loss over the last few batches. 
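            # 'Minibatch perplexity' below is exp(mean per-symbol cross-entropy) from
            # logprob(); lower is better, and a uniform guess over the 27*27 bigram
            # vocabulary would score a perplexity of 729.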
272 | print( 273 | 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr)) 274 | mean_loss = 0 275 | labels = np.concatenate(list(batches)[1:]) 276 | print('Minibatch perplexity: %.2f' % float( 277 | np.exp(logprob(predictions, labels)))) 278 | if step % (summary_frequency * 10) == 0: 279 | # Generate some samples. 280 | print('=' * 80) 281 | for _ in range(5): 282 | feed = sample(random_distribution()) 283 | sentence = characters(feed)[0] 284 | reset_sample_state.run() 285 | for _ in range(79): 286 | prediction = sample_prediction.eval({sample_input: feed}) 287 | feed = sample(prediction) 288 | sentence += characters(feed)[0] 289 | print(sentence) 290 | print('=' * 80) 291 | # Measure validation set perplexity. 292 | reset_sample_state.run() 293 | valid_logprob = 0 294 | for _ in range(valid_size): 295 | b = valid_batches.next() 296 | predictions = sample_prediction.eval({sample_input: b[0]}) 297 | valid_logprob = valid_logprob + logprob(predictions, b[1]) 298 | print('Validation set perplexity: %.2f' % float(np.exp( 299 | valid_logprob / valid_size))) 300 | -------------------------------------------------------------------------------- /src/rnn/singlew_lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | import string 4 | import zipfile 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from not_mnist.img_pickle import save_obj, load_pickle 10 | from not_mnist.load_data import maybe_download 11 | 12 | 13 | def read_data(filename): 14 | f = zipfile.ZipFile(filename) 15 | for name in f.namelist(): 16 | return tf.compat.as_str(f.read(name)) 17 | f.close() 18 | 19 | 20 | data_set = load_pickle('text8_text.pickle') 21 | if data_set is None: 22 | # load data 23 | url = 'http://mattmahoney.net/dc/' 24 | filename = maybe_download('text8.zip', 31344016, url=url) 25 | 26 | # read data 27 | text = read_data(filename) 28 | print('Data size %d' % len(text)) 29 | save_obj('text8_text.pickle', text) 30 | else: 31 | text = data_set 32 | 33 | # Create a small validation set. 34 | valid_size = 1000 35 | valid_text = text[:valid_size] 36 | train_text = text[valid_size:] 37 | train_size = len(train_text) 38 | print(train_size, train_text[:64]) 39 | print(valid_size, valid_text[:64]) 40 | 41 | # Utility functions to map characters to vocabulary IDs and back. 42 | vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' ' 43 | # ascii code for character 44 | first_letter = ord(string.ascii_lowercase[0]) 45 | 46 | 47 | def char2id(char): 48 | if char in string.ascii_lowercase: 49 | return ord(char) - first_letter + 1 50 | elif char == ' ': 51 | return 0 52 | else: 53 | print('Unexpected character: %s' % char) 54 | return 0 55 | 56 | 57 | def id2char(dictid): 58 | if dictid > 0: 59 | return chr(dictid + first_letter - 1) 60 | else: 61 | return ' ' 62 | 63 | 64 | print(char2id('a'), char2id('z'), char2id(' '), char2id('ï')) 65 | print(id2char(1), id2char(26), id2char(0)) 66 | 67 | # Function to generate a training batch for the LSTM model. 
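# Each generated batch is a (batch_size, vocabulary_size) matrix of one-hot rows. The
# generator keeps batch_size cursors spaced text_size // batch_size apart, so every row
# of a batch streams through its own slice of the text in parallel.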
68 | batch_size = 64 69 | num_unrollings = 10 70 | 71 | 72 | class BatchGenerator(object): 73 | def __init__(self, text, batch_size, num_unrollings): 74 | self._text = text 75 | self._text_size = len(text) 76 | self._batch_size = batch_size 77 | self._num_unrollings = num_unrollings 78 | segment = self._text_size // batch_size 79 | self._cursor = [offset * segment for offset in range(batch_size)] 80 | self._last_batch = self._next_batch() 81 | 82 | def _next_batch(self): 83 | """Generate a single batch from the current cursor position in the data.""" 84 | # take character from text on cursor[b] 85 | # set to 1 for the taken character 86 | # so we have a matrix of 1/0 as input, an one hot encoding 87 | batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float) 88 | for b in range(self._batch_size): 89 | # same id, same index of second dimension 90 | batch[b, char2id(self._text[self._cursor[b]])] = 1.0 91 | self._cursor[b] = (self._cursor[b] + 1) % self._text_size 92 | return batch 93 | 94 | def next(self): 95 | """Generate the next array of batches from the data. The array consists of 96 | the last batch of the previous array, followed by num_unrollings new ones. 97 | """ 98 | batches = [self._last_batch] 99 | for step in range(self._num_unrollings): 100 | batches.append(self._next_batch()) 101 | self._last_batch = batches[-1] 102 | return batches 103 | 104 | 105 | def characters(probabilities): 106 | """Turn a 1-hot encoding or a probability distribution over the possible 107 | characters back into its (most likely) character representation.""" 108 | # argmax for the most likely character 109 | return [id2char(c) for c in np.argmax(probabilities, 1)] 110 | 111 | 112 | def batches2string(batches): 113 | """Convert a sequence of batches back into their (most likely) string 114 | representation.""" 115 | s = [''] * batches[0].shape[0] 116 | for b in batches: 117 | s = [''.join(x) for x in zip(s, characters(b))] 118 | return s 119 | 120 | 121 | train_batches = BatchGenerator(train_text, batch_size, num_unrollings) 122 | valid_batches = BatchGenerator(valid_text, 1, 1) 123 | 124 | print(batches2string(train_batches.next())) 125 | print(batches2string(train_batches.next())) 126 | print(batches2string(valid_batches.next())) 127 | print(batches2string(valid_batches.next())) 128 | 129 | 130 | def logprob(predictions, labels): 131 | # prevent negative probability 132 | """Log-probability of the true labels in a predicted batch.""" 133 | predictions[predictions < 1e-10] = 1e-10 134 | return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0] 135 | 136 | 137 | def sample_distribution(distribution): 138 | """Sample one element from a distribution assumed to be an array of normalized 139 | probabilities. 140 | """ 141 | # 取一部分数据用于评估,所取数据比例随机 142 | r = random.uniform(0, 1) 143 | s = 0 144 | for i in range(len(distribution)): 145 | s += distribution[i] 146 | if s >= r: 147 | return i 148 | return len(distribution) - 1 149 | 150 | 151 | def sample(prediction): 152 | """Turn a (column) prediction into 1-hot encoded samples.""" 153 | p = np.zeros(shape=[1, vocabulary_size], dtype=np.float) 154 | p[0, sample_distribution(prediction[0])] = 1.0 155 | return p 156 | 157 | 158 | def random_distribution(): 159 | """Generate a random column of probabilities.""" 160 | b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size]) 161 | return b / np.sum(b, 1)[:, None] 162 | 163 | 164 | # Simple LSTM Model. 
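# This is the "single weight matrix" variant described in the notes: input_weights has shape
# [vocabulary_size, num_nodes * gate_count], so one matmul computes the pre-activations of
# all four gates at once and tf.split recovers them inside lstm_cell().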
165 | num_nodes = 64 166 | 167 | graph = tf.Graph() 168 | with graph.as_default(): 169 | gate_count = 4 170 | # Parameters: 171 | # Gates: input, previous output, and bias. 172 | input_weights = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes * gate_count], -0.1, 0.1)) 173 | output_weights = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * gate_count], -0.1, 0.1)) 174 | bias = tf.Variable(tf.zeros([1, num_nodes * gate_count])) 175 | # Variables saving state across unrollings. 176 | saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 177 | saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 178 | # Classifier weights and biases. 179 | w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1)) 180 | b = tf.Variable(tf.zeros([vocabulary_size])) 181 | 182 | # Definition of the cell computation. 183 | def lstm_cell(i, o, state): 184 | """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf 185 | Note that in this formulation, we omit the various connections between the 186 | previous state and the gates.""" 187 | # large weight, 1/4 parameters for each gate, matrix multiply once, take 1/4 output results as a gate 188 | values = tf.split(1, gate_count, tf.matmul(i, input_weights) + tf.matmul(o, output_weights) + bias) 189 | input_gate = tf.sigmoid(values[0]) 190 | forget_gate = tf.sigmoid(values[1]) 191 | update = values[2] 192 | state = forget_gate * state + input_gate * tf.tanh(update) 193 | output_gate = tf.sigmoid(values[3]) 194 | return output_gate * tf.tanh(state), state 195 | 196 | # Input data. 197 | train_data = list() 198 | for _ in range(num_unrollings + 1): 199 | train_data.append( 200 | tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size])) 201 | train_inputs = train_data[:num_unrollings] 202 | train_labels = train_data[1:] # labels are inputs shifted by one time step. 203 | 204 | # Unrolled LSTM loop. 205 | outputs = list() 206 | output = saved_output 207 | state = saved_state 208 | for i in train_inputs: 209 | output, state = lstm_cell(i, output, state) 210 | outputs.append(output) 211 | 212 | # State saving across unrollings. 213 | with tf.control_dependencies([saved_output.assign(output), 214 | saved_state.assign(state)]): 215 | # Classifier. 216 | logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b) 217 | loss = tf.reduce_mean( 218 | tf.nn.softmax_cross_entropy_with_logits( 219 | logits, tf.concat(0, train_labels))) 220 | 221 | # Optimizer. 222 | global_step = tf.Variable(0) 223 | learning_rate = tf.train.exponential_decay( 224 | 10.0, global_step, 5000, 0.1, staircase=True) 225 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 226 | gradients, v = zip(*optimizer.compute_gradients(loss)) 227 | gradients, _ = tf.clip_by_global_norm(gradients, 1.25) 228 | optimizer = optimizer.apply_gradients( 229 | zip(gradients, v), global_step=global_step) 230 | 231 | # Predictions. 232 | train_prediction = tf.nn.softmax(logits) 233 | 234 | # Sampling and validation eval: batch 1, no unrolling. 
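    # A separate 1-row output/state pair is kept for sampling so that text generation and
    # validation never overwrite the saved training state; reset_sample_state zeroes it
    # before each generated sentence and before measuring validation perplexity.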
235 | sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) 236 | saved_sample_output = tf.Variable(tf.zeros([1, num_nodes])) 237 | saved_sample_state = tf.Variable(tf.zeros([1, num_nodes])) 238 | reset_sample_state = tf.group( 239 | saved_sample_output.assign(tf.zeros([1, num_nodes])), 240 | saved_sample_state.assign(tf.zeros([1, num_nodes]))) 241 | sample_output, sample_state = lstm_cell( 242 | sample_input, saved_sample_output, saved_sample_state) 243 | with tf.control_dependencies([saved_sample_output.assign(sample_output), 244 | saved_sample_state.assign(sample_state)]): 245 | sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) 246 | 247 | num_steps = 7001 248 | summary_frequency = 100 249 | 250 | with tf.Session(graph=graph) as session: 251 | tf.initialize_all_variables().run() 252 | print('Initialized') 253 | mean_loss = 0 254 | for step in range(num_steps): 255 | batches = train_batches.next() 256 | feed_dict = dict() 257 | for i in range(num_unrollings + 1): 258 | feed_dict[train_data[i]] = batches[i] 259 | _, l, predictions, lr = session.run( 260 | [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict) 261 | mean_loss += l 262 | if step % summary_frequency == 0: 263 | if step > 0: 264 | mean_loss /= summary_frequency 265 | # The mean loss is an estimate of the loss over the last few batches. 266 | print( 267 | 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr)) 268 | mean_loss = 0 269 | labels = np.concatenate(list(batches)[1:]) 270 | print('Minibatch perplexity: %.2f' % float( 271 | np.exp(logprob(predictions, labels)))) 272 | if step % (summary_frequency * 10) == 0: 273 | # Generate some samples. 274 | print('=' * 80) 275 | for _ in range(5): 276 | feed = sample(random_distribution()) 277 | sentence = characters(feed)[0] 278 | reset_sample_state.run() 279 | for _ in range(79): 280 | prediction = sample_prediction.eval({sample_input: feed}) 281 | feed = sample(prediction) 282 | sentence += characters(feed)[0] 283 | print(sentence) 284 | print('=' * 80) 285 | # Measure validation set perplexity. 286 | reset_sample_state.run() 287 | valid_logprob = 0 288 | for _ in range(valid_size): 289 | b = valid_batches.next() 290 | predictions = sample_prediction.eval({sample_input: b[0]}) 291 | valid_logprob = valid_logprob + logprob(predictions, b[1]) 292 | print('Validation set perplexity: %.2f' % float(np.exp( 293 | valid_logprob / valid_size))) -------------------------------------------------------------------------------- /src/neural/full_connect.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from not_mnist.img_pickle import load_pickle 7 | 8 | 9 | def reformat(dataset, labels, image_size, num_labels): 10 | dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32) 11 | # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...] 12 | labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32) 13 | return dataset, labels 14 | 15 | 16 | def accuracy(predictions, labels): 17 | return 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0] 18 | 19 | 20 | def tf_logist(): 21 | # With gradient descent training, even this much data is prohibitive. 22 | # Subset the training data for faster turnaround. 23 | train_subset = 10000 24 | 25 | graph = tf.Graph() 26 | with graph.as_default(): 27 | # Input data. 
28 | # Load the training, validation and test data into constants that are 29 | # attached to the graph. 30 | tf_train_dataset = tf.constant(train_dataset[:train_subset, :]) 31 | tf_train_labels = tf.constant(train_labels[:train_subset]) 32 | tf_valid_dataset = tf.constant(valid_dataset) 33 | tf_test_dataset = tf.constant(test_dataset) 34 | 35 | # Variables. 36 | # These are the parameters that we are going to be training. The weight 37 | # matrix will be initialized using random valued following a (truncated) 38 | # normal distribution. The biases get initialized to zero. 39 | weights = tf.Variable( 40 | tf.truncated_normal([image_size * image_size, num_labels])) 41 | biases = tf.Variable(tf.zeros([num_labels])) 42 | 43 | # Training computation. 44 | # We multiply the inputs with the weight matrix, and add biases. We compute 45 | # the softmax and cross-entropy (it's one operation in TensorFlow, because 46 | # it's very common, and it can be optimized). We take the average of this 47 | # cross-entropy across all training examples: that's our loss. 48 | logits = tf.matmul(tf_train_dataset, weights) + biases 49 | loss = tf.reduce_mean( 50 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 51 | 52 | # Optimizer. 53 | # We are going to find the minimum of this loss using gradient descent. 54 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 55 | 56 | # Predictions for the training, validation, and test data. 57 | # These are not part of training, but merely here so that we can report 58 | # accuracy figures as we train. 59 | train_prediction = tf.nn.softmax(logits) 60 | valid_prediction = tf.nn.softmax( 61 | tf.matmul(tf_valid_dataset, weights) + biases) 62 | test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases) 63 | 64 | num_steps = 801 65 | 66 | with tf.Session(graph=graph) as session: 67 | # This is a one-time operation which ensures the parameters get initialized as 68 | # we described in the graph: random weights for the matrix, zeros for the 69 | # biases. 70 | tf.initialize_all_variables().run() 71 | print('Initialized') 72 | for step in range(num_steps): 73 | # Run the computations. We tell .run() that we want to run the optimizer, 74 | # and get the loss value and the training predictions returned as numpy 75 | # arrays. 76 | _, l, predictions = session.run([optimizer, loss, train_prediction]) 77 | if step % 100 == 0: 78 | print('Loss at step %d: %f' % (step, l)) 79 | print('Training accuracy: %.1f%%' % accuracy( 80 | predictions, train_labels[:train_subset, :])) 81 | # Calling .eval() on valid_prediction is basically like calling run(), but 82 | # just to get that one numpy array. Note that it recomputes all its graph 83 | # dependencies. 84 | print('Validation accuracy: %.1f%%' % accuracy( 85 | valid_prediction.eval(), valid_labels)) 86 | print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels)) 87 | 88 | 89 | def tf_sgd(): 90 | batch_size = 128 91 | 92 | graph = tf.Graph() 93 | with graph.as_default(): 94 | # Input data. For the training data, we use a placeholder that will be fed 95 | # at run time with a training minibatch. 96 | tf_train_dataset = tf.placeholder(tf.float32, 97 | shape=(batch_size, image_size * image_size)) 98 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 99 | tf_valid_dataset = tf.constant(valid_dataset) 100 | tf_test_dataset = tf.constant(test_dataset) 101 | 102 | # Variables. 
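        # weights has shape [image_size * image_size, num_labels] = [784, 10] and biases [10],
        # so logits = X.W + b maps each flattened 28x28 image to one score per class.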
103 | weights = tf.Variable( 104 | tf.truncated_normal([image_size * image_size, num_labels])) 105 | biases = tf.Variable(tf.zeros([num_labels])) 106 | 107 | # Training computation. 108 | logits = tf.matmul(tf_train_dataset, weights) + biases 109 | loss = tf.reduce_mean( 110 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 111 | 112 | # Optimizer. 113 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 114 | 115 | # Predictions for the training, validation, and test data. 116 | train_prediction = tf.nn.softmax(logits) 117 | valid_prediction = tf.nn.softmax( 118 | tf.matmul(tf_valid_dataset, weights) + biases) 119 | test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases) 120 | 121 | num_steps = 3001 122 | 123 | with tf.Session(graph=graph) as session: 124 | tf.initialize_all_variables().run() 125 | print("Initialized") 126 | for step in range(num_steps): 127 | # Pick an offset within the training data, which has been randomized. 128 | # Note: we could use better randomization across epochs. 129 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 130 | # Generate a minibatch. 131 | batch_data = train_dataset[offset:(offset + batch_size), :] 132 | batch_labels = train_labels[offset:(offset + batch_size), :] 133 | # Prepare a dictionary telling the session where to feed the minibatch. 134 | # The key of the dictionary is the placeholder node of the graph to be fed, 135 | # and the value is the numpy array to feed to it. 136 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 137 | _, l, predictions = session.run( 138 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 139 | if step % 500 == 0: 140 | print("Minibatch loss at step %d: %f" % (step, l)) 141 | print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)) 142 | print("Validation accuracy: %.1f%%" % accuracy( 143 | valid_prediction.eval(), valid_labels)) 144 | print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels)) 145 | 146 | 147 | def tf_sgd_relu_nn(): 148 | batch_size = 128 149 | 150 | graph = tf.Graph() 151 | with graph.as_default(): 152 | # Input data. For the training data, we use a placeholder that will be fed 153 | # at run time with a training minibatch. 154 | tf_train_dataset = tf.placeholder(tf.float32, 155 | shape=(batch_size, image_size * image_size)) 156 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 157 | tf_valid_dataset = tf.constant(valid_dataset) 158 | tf_test_dataset = tf.constant(test_dataset) 159 | 160 | hidden_node_count = 1024 161 | # Variables. 162 | weights1 = tf.Variable( 163 | tf.truncated_normal([image_size * image_size, hidden_node_count])) 164 | biases1 = tf.Variable(tf.zeros([hidden_node_count])) 165 | 166 | weights2 = tf.Variable( 167 | tf.truncated_normal([hidden_node_count, num_labels])) 168 | biases2 = tf.Variable(tf.zeros([num_labels])) 169 | 170 | # Training computation. 171 | ys = tf.matmul(tf_train_dataset, weights1) + biases1 172 | hidden = tf.nn.relu(ys) 173 | logits = tf.matmul(hidden, weights2) + biases2 174 | loss = tf.reduce_mean( 175 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 176 | 177 | # Optimizer. 178 | optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss) 179 | 180 | # Predictions for the training, validation, and test data. 
181 | train_prediction = tf.nn.softmax(logits) 182 | valid_prediction = tf.nn.softmax( 183 | tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2) 184 | test_prediction = tf.nn.softmax( 185 | tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2) 186 | 187 | num_steps = 3001 188 | 189 | with tf.Session(graph=graph) as session: 190 | tf.initialize_all_variables().run() 191 | print("Initialized") 192 | for step in range(num_steps): 193 | # Pick an offset within the training data, which has been randomized. 194 | # Note: we could use better randomization across epochs. 195 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 196 | # Generate a minibatch. 197 | batch_data = train_dataset[offset:(offset + batch_size), :] 198 | batch_labels = train_labels[offset:(offset + batch_size), :] 199 | # Prepare a dictionary telling the session where to feed the minibatch. 200 | # The key of the dictionary is the placeholder node of the graph to be fed, 201 | # and the value is the numpy array to feed to it. 202 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 203 | _, l, predictions = session.run( 204 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 205 | if step % 500 == 0: 206 | print("Minibatch loss at step %d: %f" % (step, l)) 207 | print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)) 208 | print("Validation accuracy: %.1f%%" % accuracy( 209 | valid_prediction.eval(), valid_labels)) 210 | print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels)) 211 | 212 | 213 | def load_reformat_not_mnist(image_size, num_labels): 214 | pickle_file = '../not_mnist/notMNIST_clean.pickle' 215 | save = load_pickle(pickle_file) 216 | train_dataset = save['train_dataset'] 217 | train_labels = save['train_labels'] 218 | valid_dataset = save['valid_dataset'] 219 | valid_labels = save['valid_labels'] 220 | test_dataset = save['test_dataset'] 221 | test_labels = save['test_labels'] 222 | del save # hint to help gc free up memory 223 | print('Training set', train_dataset.shape, train_labels.shape) 224 | print('Validation set', valid_dataset.shape, valid_labels.shape) 225 | print('Test set', test_dataset.shape, test_labels.shape) 226 | train_dataset, train_labels = reformat(train_dataset, train_labels, image_size, num_labels) 227 | valid_dataset, valid_labels = reformat(valid_dataset, valid_labels, image_size, num_labels) 228 | test_dataset, test_labels = reformat(test_dataset, test_labels, image_size, num_labels) 229 | print('Training set', train_dataset.shape, train_labels.shape) 230 | print('Validation set', valid_dataset.shape, valid_labels.shape) 231 | print('Test set', test_dataset.shape, test_labels.shape) 232 | return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels 233 | 234 | if __name__ == '__main__': 235 | # First reload the data we generated in 1_notmnist.ipynb. 
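# The fully connected models above multiply the data by a 784 x 10 (or 784 x 1024)
# weight matrix, so load_reformat_not_mnist() has to deliver every 28x28 image as
# a flat 784-dimensional float vector and every label as a one-hot row of length
# num_labels = 10; the shape printouts inside it make that easy to verify.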
236 | image_size = 28 237 | num_labels = 10 238 | train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = \ 239 | load_reformat_not_mnist(image_size, num_labels) 240 | 241 | # tf_logist() 242 | # tf_sgd() 243 | tf_sgd_relu_nn() 244 | -------------------------------------------------------------------------------- /src/rnn/lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | import string 4 | import zipfile 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from not_mnist.img_pickle import save_obj, load_pickle 10 | from not_mnist.load_data import maybe_download 11 | 12 | 13 | def read_data(filename): 14 | f = zipfile.ZipFile(filename) 15 | for name in f.namelist(): 16 | return tf.compat.as_str(f.read(name)) 17 | f.close() 18 | 19 | 20 | data_set = load_pickle('text8_text.pickle') 21 | if data_set is None: 22 | # load data 23 | url = 'http://mattmahoney.net/dc/' 24 | filename = maybe_download('text8.zip', 31344016, url=url) 25 | 26 | # read data 27 | text = read_data(filename) 28 | print('Data size %d' % len(text)) 29 | save_obj('text8_text.pickle', text) 30 | else: 31 | text = data_set 32 | 33 | # Create a small validation set. 34 | valid_size = 1000 35 | valid_text = text[:valid_size] 36 | train_text = text[valid_size:] 37 | train_size = len(train_text) 38 | print(train_size, train_text[:64]) 39 | print(valid_size, valid_text[:64]) 40 | 41 | # Utility functions to map characters to vocabulary IDs and back. 42 | vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' ' 43 | # ascii code for character 44 | first_letter = ord(string.ascii_lowercase[0]) 45 | 46 | 47 | def char2id(char): 48 | if char in string.ascii_lowercase: 49 | return ord(char) - first_letter + 1 50 | elif char == ' ': 51 | return 0 52 | else: 53 | print('Unexpected character: %s' % char) 54 | return 0 55 | 56 | 57 | def id2char(dictid): 58 | if dictid > 0: 59 | return chr(dictid + first_letter - 1) 60 | else: 61 | return ' ' 62 | 63 | 64 | print(char2id('a'), char2id('z'), char2id(' '), char2id('ï')) 65 | print(id2char(1), id2char(26), id2char(0)) 66 | 67 | # Function to generate a training batch for the LSTM model. 68 | batch_size = 64 69 | num_unrollings = 10 70 | 71 | 72 | class BatchGenerator(object): 73 | def __init__(self, text, batch_size, num_unrollings): 74 | self._text = text 75 | self._text_size = len(text) 76 | self._batch_size = batch_size 77 | self._num_unrollings = num_unrollings 78 | segment = self._text_size // batch_size 79 | self._cursor = [offset * segment for offset in range(batch_size)] 80 | self._last_batch = self._next_batch() 81 | 82 | def _next_batch(self): 83 | """Generate a single batch from the current cursor position in the data.""" 84 | batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float) 85 | for b in range(self._batch_size): 86 | # same id, same index of second dimension 87 | batch[b, char2id(self._text[self._cursor[b]])] = 1.0 88 | self._cursor[b] = (self._cursor[b] + 1) % self._text_size 89 | return batch 90 | 91 | def next(self): 92 | """Generate the next array of batches from the data. The array consists of 93 | the last batch of the previous array, followed by num_unrollings new ones. 
94 | """ 95 | batches = [self._last_batch] 96 | for step in range(self._num_unrollings): 97 | batches.append(self._next_batch()) 98 | self._last_batch = batches[-1] 99 | return batches 100 | 101 | 102 | def characters(probabilities): 103 | """Turn a 1-hot encoding or a probability distribution over the possible 104 | characters back into its (most likely) character representation.""" 105 | # argmax for the most likely character 106 | return [id2char(c) for c in np.argmax(probabilities, 1)] 107 | 108 | 109 | def batches2string(batches): 110 | """Convert a sequence of batches back into their (most likely) string 111 | representation.""" 112 | s = [''] * batches[0].shape[0] 113 | for b in batches: 114 | s = [''.join(x) for x in zip(s, characters(b))] 115 | return s 116 | 117 | 118 | train_batches = BatchGenerator(train_text, batch_size, num_unrollings) 119 | valid_batches = BatchGenerator(valid_text, 1, 1) 120 | 121 | print(batches2string(train_batches.next())) 122 | print(batches2string(train_batches.next())) 123 | print(batches2string(valid_batches.next())) 124 | print(batches2string(valid_batches.next())) 125 | 126 | 127 | def logprob(predictions, labels): 128 | # prevent negative probability 129 | """Log-probability of the true labels in a predicted batch.""" 130 | predictions[predictions < 1e-10] = 1e-10 131 | return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0] 132 | 133 | 134 | def sample_distribution(distribution): 135 | """Sample one element from a distribution assumed to be an array of normalized 136 | probabilities. 137 | """ 138 | # 取一部分数据用于评估,所取数据比例随机 139 | r = random.uniform(0, 1) 140 | s = 0 141 | for i in range(len(distribution)): 142 | s += distribution[i] 143 | if s >= r: 144 | return i 145 | return len(distribution) - 1 146 | 147 | 148 | def sample(prediction): 149 | """Turn a (column) prediction into 1-hot encoded samples.""" 150 | p = np.zeros(shape=[1, vocabulary_size], dtype=np.float) 151 | p[0, sample_distribution(prediction[0])] = 1.0 152 | return p 153 | 154 | 155 | def random_distribution(): 156 | """Generate a random column of probabilities.""" 157 | b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size]) 158 | return b / np.sum(b, 1)[:, None] 159 | 160 | 161 | # Simple LSTM Model. 162 | num_nodes = 64 163 | 164 | graph = tf.Graph() 165 | with graph.as_default(): 166 | # Parameters: 167 | # Input gate: input, previous output, and bias. 168 | ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) 169 | im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) 170 | ib = tf.Variable(tf.zeros([1, num_nodes])) 171 | # Forget gate: input, previous output, and bias. 172 | fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) 173 | fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) 174 | fb = tf.Variable(tf.zeros([1, num_nodes])) 175 | # Memory cell: input, state and bias. 176 | cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) 177 | cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) 178 | cb = tf.Variable(tf.zeros([1, num_nodes])) 179 | # Output gate: input, previous output, and bias. 180 | ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1)) 181 | om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1)) 182 | ob = tf.Variable(tf.zeros([1, num_nodes])) 183 | # Variables saving state across unrollings. 
184 | saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 185 | saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 186 | # Classifier weights and biases. 187 | w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1)) 188 | b = tf.Variable(tf.zeros([vocabulary_size])) 189 | 190 | # Definition of the cell computation. 191 | def lstm_cell(i, o, state): 192 | """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf 193 | Note that in this formulation, we omit the various connections between the 194 | previous state and the gates.""" 195 | input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib) 196 | forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb) 197 | update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb 198 | state = forget_gate * state + input_gate * tf.tanh(update) 199 | output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob) 200 | return output_gate * tf.tanh(state), state 201 | 202 | # Input data. 203 | train_data = list() 204 | for _ in range(num_unrollings + 1): 205 | train_data.append( 206 | tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size])) 207 | train_inputs = train_data[:num_unrollings] 208 | train_labels = train_data[1:] # labels are inputs shifted by one time step. 209 | 210 | # Unrolled LSTM loop. 211 | outputs = list() 212 | output = saved_output 213 | state = saved_state 214 | for i in train_inputs: 215 | output, state = lstm_cell(i, output, state) 216 | outputs.append(output) 217 | 218 | # State saving across unrollings. 219 | with tf.control_dependencies([saved_output.assign(output), 220 | saved_state.assign(state)]): 221 | # Classifier. 222 | logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b) 223 | loss = tf.reduce_mean( 224 | tf.nn.softmax_cross_entropy_with_logits( 225 | logits, tf.concat(0, train_labels))) 226 | 227 | # Optimizer. 228 | global_step = tf.Variable(0) 229 | learning_rate = tf.train.exponential_decay( 230 | 10.0, global_step, 5000, 0.1, staircase=True) 231 | optimizer = tf.train.GradientDescentOptimizer(learning_rate) 232 | gradients, v = zip(*optimizer.compute_gradients(loss)) 233 | gradients, _ = tf.clip_by_global_norm(gradients, 1.25) 234 | optimizer = optimizer.apply_gradients( 235 | zip(gradients, v), global_step=global_step) 236 | 237 | # Predictions. 238 | train_prediction = tf.nn.softmax(logits) 239 | 240 | # Sampling and validation eval: batch 1, no unrolling. 
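# The sampling path below reuses exactly the same gate weights through
# lstm_cell(), but with its own batch-of-one output/state variables;
# reset_sample_state zeroes them so every generated sentence (and the
# validation pass further down) starts from a clean state instead of whatever
# the previous rollout left behind.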
241 | sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size]) 242 | saved_sample_output = tf.Variable(tf.zeros([1, num_nodes])) 243 | saved_sample_state = tf.Variable(tf.zeros([1, num_nodes])) 244 | reset_sample_state = tf.group( 245 | saved_sample_output.assign(tf.zeros([1, num_nodes])), 246 | saved_sample_state.assign(tf.zeros([1, num_nodes]))) 247 | sample_output, sample_state = lstm_cell( 248 | sample_input, saved_sample_output, saved_sample_state) 249 | with tf.control_dependencies([saved_sample_output.assign(sample_output), 250 | saved_sample_state.assign(sample_state)]): 251 | sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b)) 252 | 253 | num_steps = 7001 254 | summary_frequency = 100 255 | 256 | with tf.Session(graph=graph) as session: 257 | tf.initialize_all_variables().run() 258 | print('Initialized') 259 | mean_loss = 0 260 | for step in range(num_steps): 261 | batches = train_batches.next() 262 | feed_dict = dict() 263 | for i in range(num_unrollings + 1): 264 | feed_dict[train_data[i]] = batches[i] 265 | _, l, predictions, lr = session.run( 266 | [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict) 267 | mean_loss += l 268 | if step % summary_frequency == 0: 269 | if step > 0: 270 | mean_loss /= summary_frequency 271 | # The mean loss is an estimate of the loss over the last few batches. 272 | print( 273 | 'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr)) 274 | mean_loss = 0 275 | labels = np.concatenate(list(batches)[1:]) 276 | print('Minibatch perplexity: %.2f' % float( 277 | np.exp(logprob(predictions, labels)))) 278 | if step % (summary_frequency * 10) == 0: 279 | # Generate some samples. 280 | print('=' * 80) 281 | for _ in range(5): 282 | feed = sample(random_distribution()) 283 | sentence = characters(feed)[0] 284 | reset_sample_state.run() 285 | for _ in range(79): 286 | prediction = sample_prediction.eval({sample_input: feed}) 287 | feed = sample(prediction) 288 | sentence += characters(feed)[0] 289 | print(sentence) 290 | print('=' * 80) 291 | # Measure validation set perplexity. 
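# Perplexity is exp(mean negative log-likelihood): logprob() returns the average
# of -label * log(prediction) per character, so exp(valid_logprob / valid_size)
# is the model's average per-character perplexity over the 1000-character
# validation text. Lower is better; a uniform guess over the 27 symbols would
# score a perplexity of 27.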
292 | reset_sample_state.run() 293 | valid_logprob = 0 294 | for _ in range(valid_size): 295 | b = valid_batches.next() 296 | predictions = sample_prediction.eval({sample_input: b[0]}) 297 | valid_logprob = valid_logprob + logprob(predictions, b[1]) 298 | print('Validation set perplexity: %.2f' % float(np.exp( 299 | valid_logprob / valid_size))) 300 | -------------------------------------------------------------------------------- /src/convnet/conv_mnist.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from neural.full_connect import accuracy 5 | from not_mnist.img_pickle import load_pickle 6 | 7 | 8 | def reformat(dataset, labels, image_size, num_labels, num_channels): 9 | dataset = dataset.reshape( 10 | (-1, image_size, image_size, num_channels)).astype(np.float32) 11 | labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32) 12 | return dataset, labels 13 | 14 | 15 | def load_reformat_not_mnist(image_size, num_labels, num_channels): 16 | pickle_file = '../not_mnist/notMNIST_clean.pickle' 17 | save = load_pickle(pickle_file) 18 | train_dataset = save['train_dataset'] 19 | train_labels = save['train_labels'] 20 | valid_dataset = save['valid_dataset'] 21 | valid_labels = save['valid_labels'] 22 | test_dataset = save['test_dataset'] 23 | test_labels = save['test_labels'] 24 | del save # hint to help gc free up memory 25 | print('Training set', train_dataset.shape, train_labels.shape) 26 | print('Validation set', valid_dataset.shape, valid_labels.shape) 27 | print('Test set', test_dataset.shape, test_labels.shape) 28 | train_dataset, train_labels = reformat(train_dataset, train_labels, image_size, num_labels, num_channels) 29 | valid_dataset, valid_labels = reformat(valid_dataset, valid_labels, image_size, num_labels, num_channels) 30 | test_dataset, test_labels = reformat(test_dataset, test_labels, image_size, num_labels, num_channels) 31 | print('Training set', train_dataset.shape, train_labels.shape) 32 | print('Validation set', valid_dataset.shape, valid_labels.shape) 33 | print('Test set', test_dataset.shape, test_labels.shape) 34 | return train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels 35 | 36 | 37 | def maxpool2d(data, k=2, s=2): 38 | # MaxPool2D wrapper 39 | return tf.nn.max_pool(data, ksize=[1, k, k, 1], strides=[1, s, s, 1], 40 | padding='SAME') 41 | 42 | 43 | def conv_train(): 44 | batch_size = 16 45 | patch_size = 5 46 | depth = 16 47 | num_hidden = 64 48 | num_channels = 1 49 | 50 | graph = tf.Graph() 51 | 52 | with graph.as_default(): 53 | # Input data. 54 | tf_train_dataset = tf.placeholder( 55 | tf.float32, shape=(batch_size, image_size, image_size, num_channels)) 56 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 57 | tf_valid_dataset = tf.constant(valid_dataset) 58 | tf_test_dataset = tf.constant(test_dataset) 59 | 60 | # Variables. 
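# Shape bookkeeping for the layer sizes below: both convolutions use 5x5 patches
# with stride 2 and 'SAME' padding, so each halves the spatial resolution,
# 28 -> 14 -> 7. The flattened input to the fully connected layer is therefore
# (image_size // 4) * (image_size // 4) * depth = 7 * 7 * 16 = 784 features.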
61 | layer1_weights = tf.Variable(tf.truncated_normal( 62 | [patch_size, patch_size, num_channels, depth], stddev=0.1)) 63 | layer1_biases = tf.Variable(tf.zeros([depth])) 64 | layer2_weights = tf.Variable(tf.truncated_normal( 65 | [patch_size, patch_size, depth, depth], stddev=0.1)) 66 | layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth])) 67 | layer3_weights = tf.Variable(tf.truncated_normal( 68 | [image_size // 4 * image_size // 4 * depth, num_hidden], stddev=0.1)) 69 | layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden])) 70 | layer4_weights = tf.Variable(tf.truncated_normal( 71 | [num_hidden, num_labels], stddev=0.1)) 72 | layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels])) 73 | 74 | # Model. 75 | def model(data): 76 | conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME') 77 | hidden = tf.nn.relu(conv + layer1_biases) 78 | conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME') 79 | hidden = tf.nn.relu(conv + layer2_biases) 80 | shape = hidden.get_shape().as_list() 81 | reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]]) 82 | hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases) 83 | return tf.matmul(hidden, layer4_weights) + layer4_biases 84 | 85 | # Training computation. 86 | logits = model(tf_train_dataset) 87 | loss = tf.reduce_mean( 88 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 89 | 90 | # Optimizer. 91 | optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss) 92 | 93 | # Predictions for the training, validation, and test data. 94 | train_prediction = tf.nn.softmax(logits) 95 | valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) 96 | test_prediction = tf.nn.softmax(model(tf_test_dataset)) 97 | num_steps = 1001 98 | 99 | with tf.Session(graph=graph) as session: 100 | tf.initialize_all_variables().run() 101 | print('Initialized') 102 | for step in range(num_steps): 103 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 104 | batch_data = train_dataset[offset:(offset + batch_size), :, :, :] 105 | batch_labels = train_labels[offset:(offset + batch_size), :] 106 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 107 | _, l, predictions = session.run( 108 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 109 | if step % 50 == 0: 110 | print('Minibatch loss at step %d: %f' % (step, l)) 111 | print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels)) 112 | print('Validation accuracy: %.1f%%' % accuracy( 113 | valid_prediction.eval(), valid_labels)) 114 | print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels)) 115 | 116 | 117 | def conv_max_pool_train(): 118 | batch_size = 16 119 | patch_size = 5 120 | depth = 16 121 | num_hidden = 64 122 | num_channels = 1 123 | 124 | graph = tf.Graph() 125 | 126 | with graph.as_default(): 127 | # Input data. 128 | tf_train_dataset = tf.placeholder( 129 | tf.float32, shape=(batch_size, image_size, image_size, num_channels)) 130 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 131 | tf_valid_dataset = tf.constant(valid_dataset) 132 | tf_test_dataset = tf.constant(test_dataset) 133 | 134 | # Variables. 
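# Here every block is a stride-2 convolution followed by a 2x2 max pool, so the
# feature map shrinks 28 -> 14 -> 7 -> 4 -> 2 ('SAME' padding rounds 7/2 up to 4).
# That is why layer3_weights below is declared as [64, num_hidden]:
# 2 * 2 * depth = 2 * 2 * 16 = 64 flattened features.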
135 | layer1_weights = tf.Variable(tf.truncated_normal( 136 | [patch_size, patch_size, num_channels, depth], stddev=0.1)) 137 | layer1_biases = tf.Variable(tf.zeros([depth])) 138 | layer2_weights = tf.Variable(tf.truncated_normal( 139 | [patch_size, patch_size, depth, depth], stddev=0.1)) 140 | layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth])) 141 | layer3_weights = tf.Variable(tf.truncated_normal( 142 | [64, num_hidden], stddev=0.1)) 143 | layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden])) 144 | layer4_weights = tf.Variable(tf.truncated_normal( 145 | [num_hidden, num_labels], stddev=0.1)) 146 | layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels])) 147 | 148 | # Model. 149 | def model(data): 150 | conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME') 151 | conv = maxpool2d(conv) 152 | hidden = tf.nn.relu(conv + layer1_biases) 153 | conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME') 154 | conv = maxpool2d(conv) 155 | hidden = tf.nn.relu(conv + layer2_biases) 156 | shape = hidden.get_shape().as_list() 157 | reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]]) 158 | hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases) 159 | return tf.matmul(hidden, layer4_weights) + layer4_biases 160 | # Training computation. 161 | logits = model(tf_train_dataset) 162 | loss = tf.reduce_mean( 163 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 164 | 165 | # Optimizer. 166 | optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss) 167 | 168 | # Predictions for the training, validation, and test data. 169 | train_prediction = tf.nn.softmax(logits) 170 | valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) 171 | test_prediction = tf.nn.softmax(model(tf_test_dataset)) 172 | num_steps = 1001 173 | 174 | with tf.Session(graph=graph) as session: 175 | tf.initialize_all_variables().run() 176 | print('Initialized') 177 | for step in range(num_steps): 178 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 179 | batch_data = train_dataset[offset:(offset + batch_size), :, :, :] 180 | batch_labels = train_labels[offset:(offset + batch_size), :] 181 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 182 | _, l, predictions = session.run( 183 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 184 | if step % 50 == 0: 185 | print('Minibatch loss at step %d: %f' % (step, l)) 186 | print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels)) 187 | print('Validation accuracy: %.1f%%' % accuracy( 188 | valid_prediction.eval(), valid_labels)) 189 | print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels)) 190 | 191 | 192 | def better_conv_train(drop=False, lrd=False): 193 | batch_size = 16 194 | patch_size = 5 195 | depth = 16 196 | num_hidden = 64 197 | num_channels = 1 198 | 199 | graph = tf.Graph() 200 | 201 | with graph.as_default(): 202 | # Input data. 203 | tf_train_dataset = tf.placeholder( 204 | tf.float32, shape=(batch_size, image_size, image_size, num_channels)) 205 | tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels)) 206 | tf_valid_dataset = tf.constant(valid_dataset) 207 | tf_test_dataset = tf.constant(test_dataset) 208 | 209 | # Variables. 
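# Same layer shapes as conv_max_pool_train() above (the flattened size is again
# the hard-coded 64 = 2 * 2 * depth). The new pieces live in model() and in the
# optimizer: with drop=True, dropout with keep probabilities 0.5, 0.7 and 0.8 is
# applied after the three hidden layers, and with lrd=True the learning rate
# starts at 0.1 and decays by a factor of 0.96 every 10000 steps.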
210 | layer1_weights = tf.Variable(tf.truncated_normal( 211 | [patch_size, patch_size, num_channels, depth], stddev=0.1)) 212 | layer1_biases = tf.Variable(tf.zeros([depth])) 213 | layer2_weights = tf.Variable(tf.truncated_normal( 214 | [patch_size, patch_size, depth, depth], stddev=0.1)) 215 | layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth])) 216 | layer3_weights = tf.Variable(tf.truncated_normal( 217 | [64, num_hidden], stddev=0.1)) 218 | layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden])) 219 | layer4_weights = tf.Variable(tf.truncated_normal( 220 | [num_hidden, num_labels], stddev=0.1)) 221 | layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_labels])) 222 | 223 | # Model. 224 | def model(data): 225 | conv = tf.nn.conv2d(data, layer1_weights, [1, 2, 2, 1], padding='SAME') 226 | conv = maxpool2d(conv) 227 | hidden = tf.nn.relu(conv + layer1_biases) 228 | if drop: 229 | hidden = tf.nn.dropout(hidden, 0.5) 230 | conv = tf.nn.conv2d(hidden, layer2_weights, [1, 2, 2, 1], padding='SAME') 231 | conv = maxpool2d(conv) 232 | hidden = tf.nn.relu(conv + layer2_biases) 233 | if drop: 234 | hidden = tf.nn.dropout(hidden, 0.7) 235 | shape = hidden.get_shape().as_list() 236 | reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]]) 237 | hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases) 238 | if drop: 239 | hidden = tf.nn.dropout(hidden, 0.8) 240 | return tf.matmul(hidden, layer4_weights) + layer4_biases 241 | # Training computation. 242 | logits = model(tf_train_dataset) 243 | loss = tf.reduce_mean( 244 | tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) 245 | # Optimizer. 246 | if lrd: 247 | cur_step = tf.Variable(0) # count the number of steps taken. 248 | starter_learning_rate = 0.1 249 | learning_rate = tf.train.exponential_decay(starter_learning_rate, cur_step, 10000, 0.96, staircase=True) 250 | optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=cur_step) 251 | else: 252 | optimizer = tf.train.GradientDescentOptimizer(0.05).minimize(loss) 253 | 254 | # Predictions for the training, validation, and test data. 
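# Note that valid_prediction and test_prediction call the same model() function
# as the training path, so with drop=True the dropout layers stay active during
# evaluation as well; the keep probabilities are hard-coded in model() rather
# than being fed through a placeholder.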
255 | train_prediction = tf.nn.softmax(logits) 256 | valid_prediction = tf.nn.softmax(model(tf_valid_dataset)) 257 | test_prediction = tf.nn.softmax(model(tf_test_dataset)) 258 | num_steps = 5001 259 | 260 | with tf.Session(graph=graph) as session: 261 | tf.initialize_all_variables().run() 262 | print('Initialized') 263 | for step in range(num_steps): 264 | offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 265 | batch_data = train_dataset[offset:(offset + batch_size), :, :, :] 266 | batch_labels = train_labels[offset:(offset + batch_size), :] 267 | feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels} 268 | _, l, predictions = session.run( 269 | [optimizer, loss, train_prediction], feed_dict=feed_dict) 270 | if step % 50 == 0: 271 | print('Minibatch loss at step %d: %f' % (step, l)) 272 | print('Minibatch accuracy: %.1f%%' % accuracy(predictions, batch_labels)) 273 | print('Validation accuracy: %.1f%%' % accuracy( 274 | valid_prediction.eval(), valid_labels)) 275 | print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels)) 276 | 277 | 278 | if __name__ == '__main__': 279 | image_size = 28 280 | num_labels = 10 281 | train_dataset, train_labels, valid_dataset, valid_labels, test_dataset, test_labels = \ 282 | load_reformat_not_mnist(image_size, num_labels, 1) 283 | # conv_max_pool_train() 284 | # conv_train() 285 | better_conv_train(lrd=True) 286 | -------------------------------------------------------------------------------- /src/rnn/embed_bigram_lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import random 3 | import string 4 | import zipfile 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from not_mnist.img_pickle import save_obj, load_pickle 10 | from not_mnist.load_data import maybe_download 11 | 12 | 13 | def read_data(filename): 14 | f = zipfile.ZipFile(filename) 15 | for name in f.namelist(): 16 | return tf.compat.as_str(f.read(name)) 17 | f.close() 18 | 19 | 20 | data_set = load_pickle('text8_text.pickle') 21 | if data_set is None: 22 | # load data 23 | url = 'http://mattmahoney.net/dc/' 24 | filename = maybe_download('text8.zip', 31344016, url=url) 25 | 26 | # read data 27 | text = read_data(filename) 28 | print('Data size %d' % len(text)) 29 | save_obj('text8_text.pickle', text) 30 | else: 31 | text = data_set 32 | 33 | # Create a small validation set. 34 | valid_size = 1000 35 | valid_text = text[:valid_size] 36 | train_text = text[valid_size:] 37 | train_size = len(train_text) 38 | print(train_size, train_text[:64]) 39 | print(valid_size, valid_text[:64]) 40 | 41 | # Utility functions to map characters to vocabulary IDs and back. 
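# The character vocabulary is the 26 lowercase letters plus space: char2id maps
# ' ' to 0 and 'a'..'z' to 1..26, with anything else falling back to 0. A bigram
# is then packed into a single id as first_char_id * vocabulary_size +
# second_char_id, which is what makes bi_voc_size = 27 * 27 = 729 below.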
42 | vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' ' 43 | # ascii code for character 44 | first_letter = ord(string.ascii_lowercase[0]) 45 | 46 | 47 | def char2id(char): 48 | if char in string.ascii_lowercase: 49 | return ord(char) - first_letter + 1 50 | elif char == ' ': 51 | return 0 52 | else: 53 | print('Unexpected character: %s' % char) 54 | return 0 55 | 56 | 57 | def id2char(dictid): 58 | if dictid > 0: 59 | return chr(dictid + first_letter - 1) 60 | else: 61 | return ' ' 62 | 63 | 64 | print(char2id('a'), char2id('z'), char2id(' '), char2id('ï')) 65 | print(id2char(1), id2char(26), id2char(0)) 66 | 67 | bi_voc_size = vocabulary_size * vocabulary_size 68 | 69 | 70 | class BiBatchGenerator(object): 71 | def __init__(self, text, batch_size, num_unrollings): 72 | self._text = text 73 | self._text_size_in_chars = len(text) 74 | self._text_size = self._text_size_in_chars // 2 # in bigrams 75 | self._batch_size = batch_size 76 | self._num_unrollings = num_unrollings 77 | segment = self._text_size // batch_size 78 | self._cursor = [offset * segment for offset in range(batch_size)] 79 | self._last_batch = self._next_batch() 80 | 81 | def _next_batch(self): 82 | batch = np.zeros(shape=self._batch_size, dtype=np.int) 83 | # print 'batch idx %i' % 84 | for b in range(self._batch_size): 85 | char_idx = self._cursor[b] * 2 86 | ch1 = char2id(self._text[char_idx]) 87 | if self._text_size_in_chars - 1 == char_idx: 88 | ch2 = 0 89 | else: 90 | ch2 = char2id(self._text[char_idx + 1]) 91 | batch[b] = ch1 * vocabulary_size + ch2 92 | self._cursor[b] = (self._cursor[b] + 1) % self._text_size 93 | return batch 94 | 95 | def next(self): 96 | batches = [self._last_batch] 97 | for step in range(self._num_unrollings): 98 | batches.append(self._next_batch()) 99 | self._last_batch = batches[-1] 100 | return batches 101 | 102 | 103 | def bi2str(encoding): 104 | return id2char(encoding // vocabulary_size) + id2char(encoding % vocabulary_size) 105 | 106 | 107 | def bigrams(encodings): 108 | return [bi2str(e) for e in encodings] 109 | 110 | 111 | def bibatches2string(batches): 112 | s = [''] * batches[0].shape[0] 113 | for b in batches: 114 | s = [''.join(x) for x in zip(s, bigrams(b))] 115 | return s 116 | 117 | 118 | bi_onehot = np.zeros((bi_voc_size, bi_voc_size)) 119 | np.fill_diagonal(bi_onehot, 1) 120 | 121 | 122 | def bigramonehot(encodings): 123 | return [bi_onehot[e] for e in encodings] 124 | 125 | 126 | train_batches = BiBatchGenerator(train_text, 8, 8) 127 | valid_batches = BiBatchGenerator(valid_text, 1, 1) 128 | 129 | batch = train_batches.next() 130 | print(batch) 131 | print(bibatches2string(batch)) 132 | # print bigramonehot(batch) 133 | print (bibatches2string(train_batches.next())) 134 | print (bibatches2string(valid_batches.next())) 135 | print (bibatches2string(valid_batches.next())) 136 | 137 | 138 | def logprob(predictions, labels): 139 | """Log-probability of the true labels in a predicted batch.""" 140 | predictions[predictions < 1e-10] = 1e-10 141 | return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0] 142 | 143 | 144 | def sample_distribution(distribution): 145 | """Sample one element from a distribution assumed to be an array of normalized 146 | probabilities. 
147 | """ 148 | r = random.uniform(0, 1) 149 | s = 0 150 | for i in range(len(distribution)): 151 | s += distribution[i] 152 | if s >= r: 153 | return i 154 | return len(distribution) - 1 155 | 156 | 157 | def sample(prediction, size=vocabulary_size): 158 | """Turn a (column) prediction into 1-hot encoded samples.""" 159 | p = np.zeros(shape=[1, size], dtype=np.float) 160 | p[0, sample_distribution(prediction[0])] = 1.0 161 | return p 162 | 163 | 164 | def one_hot_voc(prediction, size=vocabulary_size): 165 | p = np.zeros(shape=[1, size], dtype=np.float) 166 | p[0, prediction[0]] = 1.0 167 | return p 168 | 169 | 170 | def random_distribution(size=vocabulary_size): 171 | """Generate a random column of probabilities.""" 172 | b = np.random.uniform(0.0, 1.0, size=[1, size]) 173 | return b / np.sum(b, 1)[:, None] 174 | 175 | 176 | def create_lstm_graph_bi(num_nodes, num_unrollings, batch_size, embedding_size=bi_voc_size): 177 | with tf.Graph().as_default() as g: 178 | # input to all gates 179 | x = tf.Variable(tf.truncated_normal([embedding_size, num_nodes * 4], -0.1, 0.1), name='x') 180 | # memory of all gates 181 | m = tf.Variable(tf.truncated_normal([num_nodes, num_nodes * 4], -0.1, 0.1), name='m') 182 | # biases all gates 183 | biases = tf.Variable(tf.zeros([1, num_nodes * 4])) 184 | # Variables saving state across unrollings. 185 | saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 186 | saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False) 187 | # Classifier weights and biases. 188 | w = tf.Variable(tf.truncated_normal([num_nodes, bi_voc_size], -0.1, 0.1)) 189 | b = tf.Variable(tf.zeros([bi_voc_size])) 190 | # embeddings for all possible bigrams 191 | embeddings = tf.Variable(tf.random_uniform([bi_voc_size, embedding_size], -1.0, 1.0), name='embeddings') 192 | # one hot encoding for labels in 193 | np_embeds = np.zeros((bi_voc_size, bi_voc_size)) 194 | np.fill_diagonal(np_embeds, 1) 195 | bigramonehot = tf.constant(np.reshape(np_embeds, -1), dtype=tf.float32, shape=[bi_voc_size, bi_voc_size], 196 | name='bigramonehot') 197 | tf_keep_prob = tf.placeholder(tf.float32, name='tf_keep_prob') 198 | 199 | # Definition of the cell computation. 200 | def lstm_cell(i, o, state): 201 | # apply dropout to the input 202 | i = tf.nn.dropout(i, tf_keep_prob) 203 | mult = tf.matmul(i, x) + tf.matmul(o, m) + biases 204 | input_gate = tf.sigmoid(mult[:, :num_nodes]) 205 | forget_gate = tf.sigmoid(mult[:, num_nodes:num_nodes * 2]) 206 | update = mult[:, num_nodes * 3:num_nodes * 4] 207 | state = forget_gate * state + input_gate * tf.tanh(update) 208 | output_gate = tf.sigmoid(mult[:, num_nodes * 3:]) 209 | output = tf.nn.dropout(output_gate * tf.tanh(state), tf_keep_prob) 210 | return output, state 211 | 212 | # Input data. [num_unrollings, batch_size] -> one hot encoding removed, we send just bigram ids 213 | tf_train_data = tf.placeholder(tf.int32, shape=[num_unrollings + 1, batch_size], name='tf_train_data') 214 | train_data = list() 215 | for i in tf.split(0, num_unrollings + 1, tf_train_data): 216 | train_data.append(tf.squeeze(i)) 217 | train_inputs = train_data[:num_unrollings] 218 | train_labels = list() 219 | for l in train_data[1:]: 220 | # train_labels.append(tf.nn.embedding_lookup(embeddings, l)) 221 | train_labels.append(tf.gather(bigramonehot, l)) 222 | # train_labels.append(tf.reshape(l, [batch_size,1])) # labels are inputs shifted by one time step. 223 | 224 | # Unrolled LSTM loop. 
225 | outputs = list() 226 | output = saved_output 227 | state = saved_state 228 | # python loop used: tensorflow does not support sequential operations yet 229 | for i in train_inputs: # having a loop simulates having time 230 | # embed input bigrams -> [batch_size, embedding_size] 231 | output, state = lstm_cell(tf.nn.embedding_lookup(embeddings, i), output, state) 232 | outputs.append(output) 233 | 234 | # State saving across unrollings, control_dependencies makes sure that output and state are computed 235 | with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]): 236 | logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b) 237 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, 238 | tf.concat(0, train_labels) 239 | ), name='loss') 240 | # Optimizer. 241 | global_step = tf.Variable(0, name='global_step') 242 | learning_rate = tf.train.exponential_decay(10.0, global_step, 500, 0.9, staircase=True, name='learning_rate') 243 | optimizer = tf.train.GradientDescentOptimizer(learning_rate, name='optimizer') 244 | gradients, v = zip(*optimizer.compute_gradients(loss)) 245 | gradients, _ = tf.clip_by_global_norm(gradients, 1.25) 246 | optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step) 247 | 248 | # here we predict the embedding 249 | # train_prediction = tf.argmax(tf.nn.softmax(logits), 1, name='train_prediction') 250 | train_prediction = tf.nn.softmax(logits, name='train_prediction') 251 | 252 | # Sampling and validation eval: batch 1, no unrolling. 253 | sample_input = tf.placeholder(tf.int32, shape=[1], name='sample_input') 254 | saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]), name='saved_sample_output') 255 | saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]), name='saved_sample_state') 256 | reset_sample_state = tf.group(saved_sample_output.assign(tf.zeros([1, num_nodes])), 257 | saved_sample_state.assign(tf.zeros([1, num_nodes])), name='reset_sample_state') 258 | embed_sample_input = tf.nn.embedding_lookup(embeddings, sample_input) 259 | sample_output, sample_state = lstm_cell(embed_sample_input, saved_sample_output, saved_sample_state) 260 | 261 | with tf.control_dependencies([saved_sample_output.assign(sample_output), 262 | saved_sample_state.assign(sample_state)]): 263 | sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b), name='sample_prediction') 264 | return g 265 | 266 | 267 | # test graph 268 | create_lstm_graph_bi(64, 10, 128, 32) 269 | 270 | 271 | def bitrain(g, num_steps, summary_frequency, num_unrollings, batch_size): 272 | # initalize batch generators 273 | train_batches = BiBatchGenerator(train_text, batch_size, num_unrollings) 274 | valid_batches = BiBatchGenerator(valid_text, 1, 1) 275 | optimizer = g.get_tensor_by_name('optimizer:0') 276 | loss = g.get_tensor_by_name('loss:0') 277 | train_prediction = g.get_tensor_by_name('train_prediction:0') 278 | learning_rate = g.get_tensor_by_name('learning_rate:0') 279 | tf_train_data = g.get_tensor_by_name('tf_train_data:0') 280 | sample_prediction = g.get_tensor_by_name('sample_prediction:0') 281 | # similarity = g.get_tensor_by_name('similarity:0') 282 | reset_sample_state = g.get_operation_by_name('reset_sample_state') 283 | sample_input = g.get_tensor_by_name('sample_input:0') 284 | embeddings = g.get_tensor_by_name('embeddings:0') 285 | keep_prob = g.get_tensor_by_name('tf_keep_prob:0') 286 | with tf.Session(graph=g) as session: 287 | tf.initialize_all_variables().run() 288 | print('Initialized') 289 | 
mean_loss = 0 290 | for step in range(num_steps): 291 | batches = train_batches.next() 292 | # print bibatches2string(batches) 293 | # print np.array(batches) 294 | # feed_dict = dict() 295 | # for i in range(num_unrollings + 1): 296 | # feed_dict[train_data[i]] = batches[i] 297 | # tf_train_data = 298 | _, l, lr, predictions = session.run([optimizer, loss, learning_rate, train_prediction], 299 | feed_dict={tf_train_data: batches, keep_prob: 0.6}) 300 | mean_loss += l 301 | if step % summary_frequency == 0: 302 | if step > 0: 303 | mean_loss = mean_loss / summary_frequency 304 | # The mean loss is an estimate of the loss over the last few batches. 305 | print ('Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr)) 306 | mean_loss = 0 307 | labels = list(batches)[1:] 308 | labels = np.concatenate([bigramonehot(l) for l in labels]) 309 | # print predictions 310 | # print labels 311 | # print labels.shape[0] 312 | print('Minibatch perplexity: %.2f' % float(np.exp(logprob(predictions, labels)))) 313 | if step % (summary_frequency * 10) == 0: 314 | # Generate some samples. 315 | print('=' * 80) 316 | # print embeddings.eval() 317 | for _ in range(5): 318 | # print random_distribution(bi_voc_size) 319 | feed = np.argmax(sample(random_distribution(bi_voc_size), bi_voc_size)) 320 | sentence = bi2str(feed) 321 | reset_sample_state.run() 322 | for _ in range(49): 323 | # prediction = similarity.eval({sample_input: [feed]}) 324 | # nearest = (-prediction[0]).argsort()[0] 325 | prediction = sample_prediction.eval({sample_input: [feed], keep_prob: 1.0}) 326 | # print prediction 327 | feed = np.argmax(sample(prediction, bi_voc_size)) 328 | # feed = np.argmax(prediction[0]) 329 | sentence += bi2str(feed) 330 | print(sentence) 331 | print('=' * 80) 332 | # Measure validation set perplexity. 333 | reset_sample_state.run() 334 | valid_logprob = 0 335 | for _ in range(valid_size): 336 | b = valid_batches.next() 337 | predictions = sample_prediction.eval({sample_input: b[0], keep_prob: 1.0}) 338 | # print(predictions) 339 | valid_logprob = valid_logprob + logprob(predictions, one_hot_voc(b[1], bi_voc_size)) 340 | print('Validation set perplexity: %.2f' % float(np.exp(valid_logprob / valid_size))) 341 | 342 | 343 | graph = create_lstm_graph_bi(512, 32, 32, 128) 344 | bitrain(graph, 4001, 100, 32, 32) 345 | --------------------------------------------------------------------------------