├── ann ├── __init__.py ├── RNN.py └── Dense.py ├── base ├── __init__.py └── model.py ├── playground ├── IRL.py ├── LinearRegression.py ├── SVM.py ├── A3C.py ├── NeuralNetwork.py ├── LogisticRegression.py ├── MLP.py ├── RegressionTree.py ├── PolicyGradient.py ├── DQN.py ├── PPO.py ├── DoubleDQN.py └── TensorFlowServing.py ├── deprecated ├── __init__.py └── main.py ├── utility ├── __init__.py ├── function.py ├── logger.py └── launcher.py ├── .gitignore ├── static ├── __init__.py └── checkpoints │ └── iris │ ├── biases.json │ └── weights.json ├── note ├── .ipynb_checkpoints │ ├── GQN-checkpoint.ipynb │ ├── PolicyGradient-checkpoint.ipynb │ ├── DQN-checkpoint.ipynb │ ├── GloVe-checkpoint.ipynb │ ├── A3C-checkpoint.ipynb │ ├── Word2Vec-checkpoint.ipynb │ ├── DoubleDQN-checkpoint.ipynb │ └── PPO-checkpoint.ipynb ├── GloVe.ipynb ├── Word2Vec.ipynb ├── A3C.ipynb ├── DoubleDQN.ipynb ├── DQN.ipynb └── PPO.ipynb ├── main.py ├── LICENSE └── README.md /ann/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/IRL.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deprecated/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | *.pyc 4 | 5 | .idea/ 6 | 7 | static/* 8 | 9 | -------------------------------------------------------------------------------- /static/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') 4 | 5 | LOGS_DIR = os.path.join(os.path.dirname(__file__), 'logs') 6 | 7 | CACHES_DIR = os.path.join(os.path.dirname(__file__), 'caches') 8 | 9 | IMAGES_DIR = os.path.join(os.path.dirname(__file__), 'images') 10 | 11 | SUMMARIES_DIR = os.path.join(os.path.dirname(__file__), 'summaries') 12 | 13 | CHECKPOINTS_DIR = os.path.join(os.path.dirname(__file__), 'checkpoints') 14 | -------------------------------------------------------------------------------- /static/checkpoints/iris/biases.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | [ 4 | -0.31252127786192074 5 | ], 6 | [ 7 | -0.28839935674279304 8 | ], 9 | [ 10 | -0.032112358085811865 11 | ], 12 | [ 13 | -0.0131792178653517 14 | ], 15 | [ 16 | -0.030514840147082778 17 | ], 18 | [ 19 | -0.25601258705142743 20 | ], 21 | [ 22 | -0.0008761398625049 23 | ], 24 | [ 25 | -0.38658973158121007 26 | ], 27 | [ 28 | 0.21007817654928057 29 | ], 30 | [ 31 | 0.2266149898603481 32 | ] 33 | ], 34 | [ 35 | [ 36 | -0.006395863587906123 37 | ], 38 | [ 39 | 0.8470152609903324 40 | ], 41 | [ 42 | 0.13011377406245406 43 | ] 44 | ] 45 | ] -------------------------------------------------------------------------------- 
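The committed checkpoint under `static/checkpoints/iris/` stores a trained iris model as plain JSON: one list of per-layer weight matrices and one list of per-layer bias vectors. Below is a minimal sketch (not part of the repo) of inspecting those files directly with only `json` and `numpy`, using the `CHECKPOINTS_DIR` constant defined in `static/__init__.py`; the supported way to reload the model is `dense.restore()`, as shown in `main.py` and the README.

```
import json
import os

import numpy as np

from static import CHECKPOINTS_DIR

# Directory holding the committed iris checkpoint (weights.json / biases.json).
iris_ckpt_dir = os.path.join(CHECKPOINTS_DIR, 'iris')

with open(os.path.join(iris_ckpt_dir, 'weights.json')) as f:
    weights = [np.array(layer) for layer in json.load(f)]

with open(os.path.join(iris_ckpt_dir, 'biases.json')) as f:
    biases = [np.array(layer) for layer in json.load(f)]

# For the files committed here the shapes come out as weights [(10, 4), (3, 10)]
# and biases [(10, 1), (3, 1)], i.e. the 4-10-3 network configured in main.py.
print([w.shape for w in weights], [b.shape for b in biases])
```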
/note/.ipynb_checkpoints/GQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# GQN" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.4" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 1 39 | } 40 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.datasets import load_iris 6 | from utility import function 7 | from ann.Dense import Dense 8 | 9 | np.random.seed(42) 10 | 11 | iris = load_iris() 12 | 13 | scaler = StandardScaler() 14 | scaler.fit(iris.data) 15 | 16 | x_data = scaler.transform(iris.data) 17 | y_data = np.zeros((150, 3)) 18 | y_data[np.arange(150), iris.target] = 1 19 | 20 | # activation_funcs = [function.tanh] * 1 21 | activation_funcs = [function.relu] * 1 22 | # activation_funcs = [function.sigmoid] * 1 23 | activation_funcs.append(function.linear) 24 | 25 | dense = Dense(x_space=4, y_space=3, hidden_units_list=[10], **{ 26 | "loss_func": function.mean_square_error, 27 | "activation_funcs": activation_funcs, 28 | "learning_rate": 0.01, 29 | "enable_logger": True, 30 | "model_name": 'iris', 31 | "batch_size": 30, 32 | 'model': 'train', 33 | }) 34 | 35 | dense.train(x_data, y_data) 36 | # dense.restore() 37 | dense.evaluate(x_data, y_data) 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Shuyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /utility/function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def relu(x): 5 | return min(x, 1e2) if x > 0 else 0.0 6 | 7 | 8 | def grad_relu(x): 9 | return 1.0 if x > 0 else 0.0 10 | 11 | 12 | def sigmoid(x): 13 | return 1.0 / (1.0 + np.power(np.e, min(-x, 1e2))) 14 | 15 | 16 | def grad_sigmoid(x): 17 | return sigmoid(x) * (1 - sigmoid(x)) 18 | 19 | 20 | def tanh(x): 21 | return np.tanh(x) 22 | 23 | 24 | def grad_tanh(x): 25 | return 1 - np.power(np.tanh(x), 2) 26 | 27 | 28 | def linear(x): 29 | return x 30 | 31 | 32 | def grad_linear(x): 33 | return 1.0 34 | 35 | 36 | def softmax(x): 37 | x_copy = x.copy() 38 | a = np.exp(x_copy - np.max(x_copy, axis=1, keepdims=True)) 39 | z = np.sum(a, axis=1, keepdims=True) 40 | return a / z 41 | 42 | 43 | def mean_square_error(y, label): 44 | return np.mean(np.sqrt(np.sum(np.power(y - label, 2)))) 45 | 46 | 47 | def grad_mean_square_error(y, label): 48 | return label - y 49 | 50 | 51 | def softmax_cross_entropy(y, label): 52 | return np.mean(np.sum(label * -np.log(softmax(y) + 1e-100))) 53 | 54 | 55 | def grad_softmax_cross_entropy(y, label): 56 | return label - y 57 | 58 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/PolicyGradient-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个Agent,它接受一个4维向量state,分别代表:小车的位置、小车的速度、杆的角度、杆的角速度,输出一个2维向量a,代表向左和向右移动。小车每一次向左或向右移动都会加1分,但是如果杆的角度大于±12°、小车的位置大于±2.4、行动次数大于200次,游戏将会结束。我们希望在游戏结束时得分尽可能大。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.5.4" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 1 53 | } 54 | -------------------------------------------------------------------------------- /playground/LinearRegression.py: -------------------------------------------------------------------------------- 1 | from mpl_toolkits.mplot3d import Axes3D 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | data_count = 100 6 | 7 | x_data = np.linspace(-20, 20, data_count) 8 | y_data = np.multiply(2, x_data) + 3 + np.random.normal(loc=0, scale=1.0, size=(data_count,)) 9 | 10 | x_data = x_data.reshape((-1, 1)) 11 | y_data = y_data.reshape((-1, 1)) 12 | 13 | w = 10 14 | b = 20 15 | y_predict = np.dot(x_data, w) + b 16 | 17 | w_sample = np.linspace(-10, 10, data_count).reshape((-1, 1)) 18 | b_sample = np.linspace(-10, 10, data_count).reshape((-1, 1)) 19 | 20 | x_data = x_data.reshape((-1, 1)) 21 | y_data = y_data.reshape((-1, 1)) 22 | 23 | loss = np.square(np.dot(w_sample, x_data.T) + b_sample - 
y_data) / data_count 24 | 25 | w_cache, b_cache, l_cache, = [], [], [] 26 | 27 | for iteration in range(2000): 28 | y_predict = w * x_data + b 29 | diff = y_predict - y_data 30 | grad_w = np.mean(diff * x_data) 31 | grad_b = np.mean(diff) 32 | w -= 0.003 * grad_w 33 | b -= 0.003 * grad_b 34 | w_cache.append(w) 35 | b_cache.append(b) 36 | l_cache.append(np.mean(diff)) 37 | 38 | w_cache = np.array(w_cache).reshape((-1,)) 39 | b_cache = np.array(w_cache).reshape((-1,)) 40 | l_cache = np.array(w_cache).reshape((-1,)) 41 | 42 | 43 | figure = plt.figure(figsize=(16, 9)) 44 | figure = Axes3D(figure) 45 | figure.set_xlabel('w') 46 | figure.set_ylabel('b') 47 | figure.plot_surface(w_sample.T, b_sample, loss, cmap='rainbow') 48 | figure.scatter3D(w_cache, b_cache, l_cache, cmap='rainbow') 49 | 50 | y_predict = w * x_data + b 51 | 52 | plt.figure(figsize=(16, 9)) 53 | plt.scatter(x_data, y_data, s=10, color='g') 54 | plt.plot(x_data, y_predict) 55 | plt.title('y=2x+3') 56 | plt.xlabel('x') 57 | plt.ylabel('y') 58 | plt.show() 59 | -------------------------------------------------------------------------------- /playground/SVM.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | data_count = 100 5 | 6 | x1_positive = np.linspace(-10, 10, data_count) 7 | x2_positive = 0.3 * x1_positive + 10 + np.random.randint(-5, 5, data_count) 8 | y_positive = np.array([1] * data_count) 9 | 10 | x1_negative = np.linspace(-10, 10, data_count) 11 | x2_negative = 0.3 * x1_negative - 10 + np.random.randint(-5, 5, data_count) 12 | y_negative = np.array([-1] * data_count) 13 | 14 | x1 = np.concatenate([x1_positive, x1_negative]) 15 | x2 = np.concatenate([x2_positive, x2_negative]) 16 | 17 | y_label = np.concatenate([y_positive, y_negative]) 18 | 19 | w1 = np.random.normal(0, 0.002) 20 | w2 = np.random.normal(0, 0.002) 21 | b = np.random.normal(0, 0.002) 22 | 23 | training_steps = 1000 24 | 25 | eta = 0.001 26 | 27 | for step in range(training_steps): 28 | # grad_w1 = np.mean((w1 * x1 + w2 * x2 + b - y_label) * x1) 29 | # grad_w2 = np.mean((w1 * x1 + w2 * x2 + b - y_label) * x2) 30 | # grad_b = np.mean(w1 * x1 + w2 * x2 + b) 31 | 32 | hinge_judge_term = y_label * (w1 * x1 + w2 * x2 + b) 33 | 34 | mask_no_grad = hinge_judge_term > 1 35 | 36 | grad_before_mean_w1 = -y_label * x1 37 | grad_before_mean_w1[mask_no_grad] = 0 38 | grad_w1 = np.mean(grad_before_mean_w1) 39 | 40 | grad_before_mean_w2 = -y_label * x2 41 | grad_before_mean_w2[mask_no_grad] = 0 42 | grad_w2 = np.mean(grad_before_mean_w2) 43 | 44 | grad_before_mean_b = -y_label * 1 45 | grad_before_mean_b[mask_no_grad] = 0 46 | grad_b = np.mean(grad_before_mean_b) 47 | 48 | w1 -= eta * grad_w1 49 | w2 -= eta * grad_w2 50 | b -= eta * grad_b 51 | 52 | plt.scatter(x1_positive, x2_positive, c='r') 53 | plt.scatter(x1_negative, x2_negative, c='b') 54 | plt.plot(x1, -(w1 * x1 + b) / w2, c='g') 55 | plt.show() -------------------------------------------------------------------------------- /utility/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from datetime import datetime 5 | from static import LOGS_DIR 6 | from time import time 7 | 8 | DATETIME_NOW = datetime.now().strftime("%Y%m%d%H%M%S") 9 | 10 | 11 | def get_logger(model_name, mode, filename): 12 | # Make path. 
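# Log files land in LOGS_DIR/<model_name>/<mode>/<DATETIME_NOW>-<filename>.log,
# so every model/mode pair gets its own directory and each process run gets a fresh file.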
13 | dir_path = os.path.join(LOGS_DIR, model_name, mode) 14 | log_path = os.path.join(dir_path, '{}-{}.log'.format(DATETIME_NOW, filename)) 15 | # Check path. 16 | if not os.path.exists(dir_path): 17 | os.makedirs(dir_path) 18 | # Get logger. 19 | logger_name = model_name + '-' + filename 20 | logger = logging.getLogger(logger_name) 21 | logger.setLevel(logging.DEBUG) 22 | logger.propagate = False 23 | # Get logger stream handler. 24 | # log_sh = logging.StreamHandler(sys.stdout) 25 | log_sh = logging.StreamHandler() 26 | # log_sh.setFormatter(logging.Formatter('[{}] {}'.format('%(asctime)s', '%(message)s'))) 27 | log_sh.setLevel(logging.WARNING) 28 | # Get logger file handler. 29 | log_fh = logging.FileHandler(log_path) 30 | log_fh.setLevel(logging.DEBUG) 31 | log_fh.setFormatter(logging.Formatter('[{}] {}'.format('%(asctime)s', '%(message)s'))) 32 | # Add handler. 33 | logger.addHandler(log_sh) 34 | logger.addHandler(log_fh) 35 | return logger 36 | 37 | 38 | class TimeInspector(object): 39 | 40 | time_marks = [] 41 | 42 | @classmethod 43 | def set_time_mark(cls): 44 | _time = time() 45 | cls.time_marks.append(_time) 46 | return _time 47 | 48 | @classmethod 49 | def pop_time_mark(cls): 50 | cls.time_marks.pop() 51 | 52 | @classmethod 53 | def log_cost_time(cls, info): 54 | cost_time = time() - cls.time_marks.pop() 55 | logging.warning('Time cost: {0:.2f} | {1}'.format(cost_time, info)) 56 | -------------------------------------------------------------------------------- /playground/A3C.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import multiprocessing as mp 4 | import tensorflow as tf 5 | import logging 6 | import gym 7 | 8 | from base.model import * 9 | from playground import PPO 10 | from utility.launcher import start_game 11 | 12 | 13 | def start_a3c(cluster, role, task_index): 14 | server = tf.train.Server(cluster, job_name=role, task_index=task_index) 15 | if role == 'ps': 16 | logging.warning('Parameter server started.') 17 | server.join() 18 | else: 19 | worker_device = "/job:worker/task:{}".format(task_index) 20 | logging.warning('Worker: {}, server stated.'.format(worker_device)) 21 | with tf.device(tf.train.replica_device_setter(cluster=cluster)): 22 | # Make env. 23 | env = gym.make('CartPole-v0') 24 | env.seed(1) 25 | env = env.unwrapped 26 | # Init session. 27 | session = tf.Session(server.target) 28 | # session = tf.Session() 29 | # Init agent. 30 | agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], **{ 31 | KEY_SESSION: session, 32 | KEY_MODEL_NAME: 'PPO', 33 | KEY_TRAIN_EPISODE: 1000 34 | }) 35 | start_game(env, agent, task_index) 36 | 37 | 38 | def main(): 39 | 40 | cluster = tf.train.ClusterSpec({ 41 | 'worker': [ 42 | 'localhost:8001', 43 | 'localhost:8002', 44 | 'localhost:8003', 45 | ], 46 | 'ps': [ 47 | 'localhost:8000' 48 | ] 49 | }) 50 | 51 | role_task_index_map = [ 52 | ('ps', 0), 53 | ('worker', 0), 54 | ('worker', 1), 55 | ('worker', 2), 56 | ] 57 | 58 | pool = mp.Pool(processes=4) 59 | 60 | for role, task_index in role_task_index_map: 61 | pool.apply_async(start_a3c, args=(cluster, role, task_index, )) 62 | pool.close() 63 | pool.join() 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /utility/launcher.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | def start_game(env, agent, process=None): 5 | # Train. 
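# In 'train' mode: run agent.train_episodes episodes; each step the agent picks an action,
# the reward is overridden to -10 when the episode terminates, the transition is stored via
# agent.snapshot(), agent.train() runs once per finished episode, and a checkpoint is saved
# every 50 episodes. In 'test' mode the checkpoint is restored and one rendered episode is played.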
6 | if agent.mode == 'train': 7 | for episode in range(agent.train_episodes): 8 | s, r_episode, now = env.reset(), 0, time() 9 | while True: 10 | a = agent.predict(s) 11 | s_n, r, done, _ = env.step(a) 12 | r = r if not done else -10 13 | r_episode += r 14 | agent.snapshot(s, a, r, s_n) 15 | s = s_n 16 | if done: 17 | # Logs. 18 | if process is None: 19 | agent.logger.warning('Episode: {} | Times: {} | Rewards: {}'.format(episode, 20 | time() - now, 21 | r_episode)) 22 | else: 23 | agent.logger.warning('Process: {} | Episode: {} | Times: {} | Rewards: {}'.format(process, 24 | episode, 25 | time() - now, 26 | r_episode)) 27 | break 28 | agent.train() 29 | if episode % 50 == 0: 30 | agent.save() 31 | elif agent.mode == 'test': 32 | agent.restore() 33 | # Reset env. 34 | s, r_episode, now = env.reset(), 0, time() 35 | while True: 36 | env.render() 37 | a = agent.predict(s) 38 | s_n, r, done, _, = env.step(a) 39 | r_episode += r 40 | s = s_n 41 | if done: 42 | agent.logger.warning('Test mode, rewards: {}'.format(r_episode)) 43 | break 44 | -------------------------------------------------------------------------------- /static/checkpoints/iris/weights.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | [ 4 | -0.18678438298795857, 5 | -0.07914966000312011, 6 | 0.6342529879617897, 7 | 0.4529811249267001 8 | ], 9 | [ 10 | -0.17595482455364025, 11 | -0.08025541057908858, 12 | 0.5922657398985512, 13 | 0.4201468573646936 14 | ], 15 | [ 16 | 0.022154357970908763, 17 | -0.017635602933466894, 18 | -0.5700575957392919, 19 | -0.6128746987080891 20 | ], 21 | [ 22 | 0.03851755585545247, 23 | 0.018353598959502188, 24 | -0.2508443055444202, 25 | -0.07711620382957872 26 | ], 27 | [ 28 | -0.0056793667464683385, 29 | 0.04912562717610973, 30 | -0.3871691049577893, 31 | -0.15609163649258434 32 | ], 33 | [ 34 | -0.19662158407431712, 35 | -0.05892142132038428, 36 | 0.5860437253337684, 37 | 0.40890938162697044 38 | ], 39 | [ 40 | 0.004943411808912222, 41 | -0.07798519264342238, 42 | -0.22017001687542007, 43 | -0.48731233342831287 44 | ], 45 | [ 46 | 0.0093202618749045, 47 | -0.32682710266355375, 48 | 0.9928554361485695, 49 | 0.3044439064210716 50 | ], 51 | [ 52 | -0.5775015666759874, 53 | 0.26320637439433137, 54 | 0.3924564664686868, 55 | -0.14772811294676533 56 | ], 57 | [ 58 | 0.6372991366359084, 59 | -0.4860761904537719, 60 | -0.36112421494322827, 61 | -0.5626834691985895 62 | ] 63 | ], 64 | [ 65 | [ 66 | 0.05898039141717556, 67 | 0.04981103294277943, 68 | 0.38030504722256914, 69 | 0.3810373423764331, 70 | 0.544658904810149, 71 | -0.13771286331281743, 72 | -0.1008109944339574, 73 | 0.01788971466924072, 74 | 0.04826178711206095, 75 | -0.007446264711328485 76 | ], 77 | [ 78 | -0.42355832492827655, 79 | -0.4480373364602654, 80 | -0.5635856073580628, 81 | -0.08327523793774468, 82 | -0.21014308350300684, 83 | 0.07039977607174346, 84 | -0.31335144648137475, 85 | -0.4500461353189752, 86 | 0.31269293745199345, 87 | 0.3466045025577133 88 | ], 89 | [ 90 | 0.1681510259423427, 91 | 0.10270294053056472, 92 | 0.12078844037424748, 93 | -0.19092135131724694, 94 | -0.2582668810741086, 95 | 0.6052927859552184, 96 | 0.39822198216164345, 97 | 0.4380923861287904, 98 | -0.3326940292288691, 99 | -0.3009245703032898 100 | ] 101 | ] 102 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](/LICENSE) 
2 | [![Platform](https://img.shields.io/badge/Platform-Tensorflow-orange.svg)](https://www.tensorflow.org/)
3 | [![Python](https://img.shields.io/badge/Python-3.5-green.svg)]()
4 | 
5 | # Learning Notes of DRL & DL
6 | 
7 | A repo of learning notes on DRL & DL: theory, code, models, and notebooks.
8 | 
9 | # Content
10 | 
11 | ## Notes
12 | 
13 | ### Deep Learning Basics
14 | 
15 | - [LinearRegression](/note/LinearRegression.ipynb)
16 | - [LogisticRegression](/note/LogisticRegression.ipynb)
17 | - [RegressionTree](/note/RegressionTree.ipynb)
18 | - [Support Vector Machine](/note/SVM.ipynb)
19 | - [NeuralNetwork](/note/NeuralNetwork.ipynb)
20 | 
21 | ### Natural Language Processing
22 | - [Word2Vec](/note/Word2Vec.ipynb)
23 | - [GloVe](/note/GloVe.ipynb)
24 | 
25 | ### Deep Reinforcement Learning
26 | 
27 | - [PolicyGradient](/note/PolicyGradient.ipynb)
28 | - [DQN](/note/DQN.ipynb)
29 | - [DoubleDQN](/note/DoubleDQN.ipynb)
30 | - [PPO](/note/PPO.ipynb)
31 | - [A3C / DPPO](/note/A3C.ipynb)
32 | 
33 | ### Deep Learning Engineering
34 | 
35 | - [TensorFlow Serving](/note/TensorFlowServing.ipynb)
36 | 
37 | ### Docker
38 | 
39 | - [Docker Notes](/note/Docker.ipynb)
40 | 
41 | ## Code
42 | 
43 | - [Artificial Neural Network (ANN)](/ann/Dense.py)
44 | 
45 | 
46 | # Requirements
47 | - numpy
48 | - scipy
49 | - sklearn
50 | - matplotlib
51 | - tensorflow==1.8
52 | 
53 | # Instructions for the code
54 | 
55 | ### [Artificial Neural Network (ANN)](/ann/Dense.py)
56 | 
57 | 1. Load your data, for example, the iris data set.
58 | ```
59 | from sklearn.datasets import load_iris
60 | iris = load_iris()
61 | ```
62 | 2. Standardize your data.
63 | ```
64 | scaler = StandardScaler()
65 | scaler.fit(iris.data)
66 | 
67 | x_data = scaler.transform(iris.data)
68 | y_data = np.zeros((150, 3))
69 | y_data[np.arange(150), iris.target] = 1
70 | ```
71 | 3. Initialize the activation functions, which are configurable.
72 | ```
73 | activation_funcs = [function.relu] * 1
74 | # activation_funcs = [function.tanh] * 1
75 | # activation_funcs = [function.sigmoid] * 1
76 | activation_funcs.append(function.linear)
77 | ```
78 | 4. Initialize the model; optional parameters are configurable.
79 | ```
80 | dense = Dense(x_space=4, y_space=3, hidden_units_list=[10], **{
81 |     "loss_func": function.softmax_cross_entropy,
82 |     "activation_funcs": activation_funcs,
83 |     "learning_rate": 0.01,
84 |     "enable_logger": True,
85 |     "model_name": 'iris',
86 |     "batch_size": 30,
87 |     'model': 'train',
88 | })
89 | ```
90 | 5. Train or restore, then evaluate.
91 | ``` 92 | dense.train(x_data, y_data) 93 | # dense.restore() 94 | dense.evaluate(x_data, y_data) 95 | ``` 96 | -------------------------------------------------------------------------------- /playground/NeuralNetwork.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from sklearn.preprocessing import StandardScaler 5 | from utility import function 6 | from ann.Dense import Dense 7 | 8 | np.random.seed(135) 9 | 10 | data_count = 25 11 | 12 | x1_points = np.linspace(0, 10, data_count).reshape((-1, 1)) 13 | x2_points = np.multiply(2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 14 | 15 | x1 = np.concatenate((x1_points, x2_points), axis=1) 16 | y1 = np.array([[1, 0, 0, 0]] * data_count) 17 | 18 | x1_points = np.linspace(1, 10, data_count).reshape((-1, 1)) 19 | x2_points = np.multiply(-2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 20 | 21 | x2 = np.concatenate((x1_points, x2_points), axis=1) 22 | y2 = np.array([[0, 1, 0, 0]] * data_count) 23 | 24 | x1_points = np.linspace(-1, -10, data_count).reshape((-1, 1)) 25 | x2_points = np.multiply(2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 26 | 27 | x3 = np.concatenate((x1_points, x2_points), axis=1) 28 | y3 = np.array([[0, 0, 1, 0]] * data_count) 29 | 30 | x1_points = np.linspace(-1, -10, data_count).reshape((-1, 1)) 31 | x2_points = np.multiply(-2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 32 | 33 | x4 = np.concatenate((x1_points, x2_points), axis=1) 34 | y4 = np.array([[0, 0, 0, 1]] * data_count) 35 | 36 | x_data = np.concatenate((x1, x2, x3, x4)) 37 | y_data = np.concatenate((y1, y2, y3, y4)) 38 | 39 | x_train = StandardScaler().fit_transform(x_data) 40 | y_train = y_data 41 | 42 | activation_funcs = [function.relu] * 2 43 | # activation_funcs = [function.sigmoid] * 1 44 | activation_funcs.append(function.linear) 45 | 46 | dense = Dense(x_space=2, y_space=4, hidden_units_list=[6, 6], **{ 47 | "loss_func": function.softmax_cross_entropy, 48 | "activation_funcs": activation_funcs, 49 | "learning_rate": 0.003, 50 | "enable_logger": True, 51 | "model_name": 'base', 52 | "batch_size": 100, 53 | "max_epoch": 1000, 54 | 'model': 'train', 55 | }) 56 | 57 | dense.train(x_data, y_data) 58 | # dense.restore() 59 | dense.evaluate(x_data, y_data) 60 | 61 | x1_test = np.linspace(-20, 20, 300) 62 | x2_test = np.linspace(-30, 30, 300) 63 | 64 | x1_mesh, x2_mesh = np.meshgrid(x1_test, x2_test) 65 | 66 | x_test = np.array([x1_mesh.ravel(), x2_mesh.ravel()]).T 67 | y_test = np.argmax(dense.predict(x_test), axis=1) 68 | 69 | plt.pcolormesh(x1_mesh, x2_mesh, y_test.reshape(x1_mesh.shape)) 70 | plt.scatter(x1[:, 0], x1[:, 1], marker='x') 71 | plt.scatter(x2[:, 0], x2[:, 1], marker='o') 72 | plt.scatter(x3[:, 0], x3[:, 1], marker='*') 73 | plt.scatter(x4[:, 0], x4[:, 1], marker='p') 74 | plt.show() 75 | -------------------------------------------------------------------------------- /playground/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | from mpl_toolkits.mplot3d import Axes3D 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from utility import function 6 | 7 | data_count = 100 8 | 9 | x1_points = np.linspace(-5, 5, data_count).reshape((-1, 1)) 10 | x2_points = np.multiply(5, x1_points) + np.random.randint(-5, 50, size=(data_count,)).reshape((-1, 1)) 11 
| 12 | x_positive_data = np.concatenate((x1_points, x2_points), axis=1) 13 | y_positive_data = np.array([1] * data_count) 14 | 15 | x1_points = np.linspace(-5, 5, data_count).reshape((-1, 1)) 16 | x2_points = np.multiply(5, x1_points) - np.random.randint(-5, 50, size=(data_count,)).reshape((-1, 1)) 17 | 18 | x_negative_data = np.concatenate((x1_points, x2_points), axis=1) 19 | y_negative_data = np.array([0] * data_count) 20 | 21 | x_data = np.concatenate((x_positive_data, x_negative_data)) 22 | y_data = np.concatenate((y_positive_data, y_negative_data)) 23 | 24 | sigmoid = np.vectorize(function.sigmoid) 25 | grad_sigmoid = np.vectorize(function.grad_sigmoid) 26 | 27 | w = np.array([5, 5]) 28 | 29 | loss_cache = [] 30 | 31 | for iteration in range(1000): 32 | y_product = np.dot(x_data, w.T) 33 | # 计算预测标签值 (200, 2) * (2, 1) -> (200, 1) 34 | y_positive_predict = sigmoid(y_product) 35 | y_negative_predict = 1 - y_positive_predict 36 | y_negative_predict[y_negative_predict < 1e-4] = 1e-4 37 | # 计算交叉熵 38 | cross_entropy = -np.mean(y_data * np.log(y_positive_predict) + (1 - y_data) * np.log(y_negative_predict)) 39 | # 计算梯度 40 | # grad_w = -np.mean(y_data / y_positive_predict * grad_sigmoid(y_product) * x_data.T, axis=1) 41 | grad_w = -np.mean((y_data - y_positive_predict) * x_data.T, axis=1) 42 | # 更新梯度 43 | w = w - 0.03 * grad_w 44 | # 缓存交叉熵 45 | loss_cache.append(cross_entropy) 46 | 47 | y = - np.multiply(x_data[: data_count, 0], w[0]) / w[1] 48 | # 49 | # w1_sample = np.linspace(-10, 10, 2 * data_count).reshape((-1, 1)) 50 | # w2_sample = np.linspace(-10, 10, 2 * data_count).reshape((-1, 1)) 51 | # 52 | # w_sample = np.concatenate((w1_sample, w2_sample), axis=1) 53 | # 54 | # # (200, 2) * (2, 200) -> (200 * 200) 55 | # loss = y_data * np.log(sigmoid(np.dot(x_data, w_sample.T))) 56 | # 57 | # figure = plt.figure(figsize=(16, 6)) 58 | # axes = Axes3D(figure) 59 | # axes.set_xlabel('w') 60 | # axes.set_ylabel('b') 61 | # axes.plot_surface(w1_sample.T, w2_sample, loss, cmap='rainbow') 62 | 63 | plt.figure(figsize=(16, 9)) 64 | plt.title('CrossEntropy') 65 | plt.plot(loss_cache) 66 | plt.show() 67 | 68 | plt.figure(figsize=(16, 9)) 69 | plt.plot(x_data[: data_count, 0], y) 70 | plt.scatter(x_data[:data_count, 0], x_data[:data_count, 1], s=50, color='g', marker='o') 71 | plt.scatter(x_data[data_count:, 0], x_data[data_count:, 1], s=50, color='r', marker='x') 72 | plt.xlabel('x') 73 | plt.ylabel('y') 74 | plt.show() 75 | -------------------------------------------------------------------------------- /ann/RNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from utility import function 4 | 5 | 6 | class RNN(object): 7 | 8 | def __init__(self, hidden_size, seq_length, x_space, y_space, **options): 9 | 10 | self.x_space = x_space 11 | self.y_space = y_space 12 | 13 | self.seq_length = seq_length 14 | 15 | self.hidden_size = hidden_size 16 | 17 | self.x_weights = np.zeros((hidden_size, x_space)) 18 | self.s_weights = np.zeros((hidden_size, y_space)) 19 | self.u_weights = np.zeros((y_space, hidden_size)) 20 | 21 | self.z_inputs = {} 22 | self.h_inputs = {} 23 | self.p_outputs = {} 24 | self.deltas = {} 25 | 26 | self._init_options(options) 27 | self._init_weights_and_biases() 28 | 29 | def _init_options(self, options): 30 | 31 | try: 32 | self.batch_size = options['batch_size'] 33 | except KeyError: 34 | self.batch_size = 16 35 | finally: 36 | if self.batch_size < 1: 37 | raise ValueError('Batch size must larger than 1.') 38 | 39 | def 
_init_weights_and_biases(self): 40 | self.x_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 41 | self.s_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 42 | self.u_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 43 | 44 | def _forward(self, input_batch): 45 | # Initialize s_t 46 | s_t = np.zeros((input_batch.shape[0], self.y_space)) 47 | # Forward pass. 48 | for seq_index in range(self.seq_length): 49 | # Get x_t. 50 | x_t = input_batch[:, seq_index, :] 51 | # Save dz/dw 52 | self.z_inputs[seq_index] = x_t 53 | # (batch_size, x_space) * (x_space, hidden_size) -> (batch_size, hidden_size) 54 | z_t = np.dot(x_t, self.x_weights.T) 55 | # Save dh/ds 56 | self.h_inputs[seq_index] = s_t 57 | # (batch_size, y_space) * (y_space, hidden_size) -> (batch_size, hidden_size) 58 | h_t = np.dot(s_t, self.s_weights.T) 59 | # (batch_size, hidden_size) * (hidden_size, y_space) -> (batch_size, y_space) 60 | phi_t = np.dot((z_t + h_t), self.u_weights.T) 61 | # Save da/dp 62 | self.p_outputs = phi_t 63 | # Get s_t 64 | s_t = function.tanh(phi_t) 65 | return s_t 66 | 67 | def _backward(self, error): 68 | for seq_index in range(self.seq_length)[::-1]: 69 | z_input = self.z_inputs[seq_index] 70 | h_input = self.h_inputs[seq_index] 71 | # da/dp 72 | p_output = self.p_outputs[seq_index] 73 | # dp/dz 74 | p_input = self.p_inputs[seq_index] 75 | # TODO - Implements 76 | 77 | def train(self, x_data, y_data): 78 | pass 79 | 80 | def predict(self): 81 | pass 82 | 83 | def evaluate(self): 84 | pass 85 | -------------------------------------------------------------------------------- /playground/MLP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.datasets import load_iris 6 | from base.model import * 7 | 8 | 9 | class Agent(BaseSLModel): 10 | 11 | def __init__(self, x_space, y_space, x_train, y_train, x_test, y_test, **options): 12 | super(Agent, self).__init__(x_space, y_space, x_train, y_train, x_test, y_test, **options) 13 | 14 | self._init_options(options) 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | self._init_summary_writer() 20 | 21 | self.session.run(tf.global_variables_initializer()) 22 | 23 | def _init_input(self, *args): 24 | self.x_input = tf.placeholder(tf.float32, [None, self.x_space]) 25 | self.y_input = tf.placeholder(tf.float32, [None, self.y_space]) 26 | 27 | def _init_nn(self, *args): 28 | with tf.variable_scope('MLP'): 29 | f_dense = tf.layers.dense(self.x_input, 32, tf.nn.relu) 30 | s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu) 31 | y_predict = tf.layers.dense(s_dense, self.y_space) 32 | self.y_predict = y_predict 33 | 34 | def _init_op(self): 35 | with tf.variable_scope('loss_func'): 36 | # self.loss_func = tf.reduce_mean(tf.square(self.y_input - self.y_predict) * tf.abs(self.y_predict)) 37 | # self.loss_func = tf.reduce_mean(tf.square(self.y_input - self.y_predict) * tf.square(self.y_input)) 38 | self.loss_func = tf.losses.mean_squared_error(self.y_input, self.y_predict) 39 | tf.summary.scalar('mse', self.loss_func) 40 | with tf.variable_scope('optimizer'): 41 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 42 | 43 | def train(self): 44 | # Get data size. 45 | data_size = len(self.x_train) 46 | for train_step in range(30000): 47 | # Get mini batch. 
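# The random mini-batch sampling below is left commented out; every training step
# currently runs on the full training set instead.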
48 | # indices = np.random.choice(data_size, size=self.batch_size) 49 | # x_batch = self.x_train[indices] 50 | # y_batch = self.y_train[indices] 51 | x_batch = self.x_train 52 | y_batch = self.y_train 53 | # Train op. 54 | ops = [self.optimizer, self.loss_func] 55 | if train_step % 500 == 0: 56 | ops.append(self.merged_summary_op) 57 | # Train. 58 | results = self.session.run(ops, { 59 | self.x_input: x_batch, 60 | self.y_input: y_batch, 61 | }) 62 | # Add summary. 63 | if train_step % 500 == 0: 64 | self.summary_writer.add_summary(results[-1], global_step=self.training_step) 65 | # Log loss. 66 | if train_step % 10 == 0: 67 | self.save() 68 | self.logger.warning('Step: {0}, Training loss: {1:.10f}'.format(train_step, results[1])) 69 | self.evaluate() 70 | self.training_step += 1 71 | 72 | def predict(self, s): 73 | y_predict = self.session.run(self.y_predict, {self.x_input: s}) 74 | return y_predict 75 | 76 | def evaluate(self): 77 | y_predict, loss = self.session.run([self.y_predict, self.loss_func], { 78 | self.x_input: self.x_test, 79 | self.y_input: self.y_test 80 | }) 81 | 82 | self.logger.warning('Step: {0}, Testing loss: {1:.10f}'.format(self.training_step, loss)) 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | x_train = np.linspace(-np.pi, np.pi, num=200).reshape((-1, 1)) + np.random.normal() 88 | y_train = np.sin(x_train) 89 | 90 | x_test = np.linspace(-np.pi, np.pi, num=50).reshape((-1, 1)) 91 | y_test = np.sin(x_test) 92 | 93 | agent = Agent(x_train[0].shape[0], 94 | 1, 95 | x_train, 96 | y_train, 97 | x_test, 98 | y_test) 99 | 100 | agent.train() 101 | -------------------------------------------------------------------------------- /playground/RegressionTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Node(object): 5 | def __init__(self, i, j, c1, c2, l_node=None, r_node=None): 6 | self.i = i 7 | self.j = j 8 | self.c1 = c1 9 | self.c2 = c2 10 | self.offset = 0 11 | self.l_node = l_node 12 | self.r_node = r_node 13 | 14 | 15 | class RegressionTree(object): 16 | 17 | def __init__(self): 18 | self._tree = None 19 | self.x_data = None 20 | self.y_data = None 21 | self.num_nodes = 0 22 | 23 | def fit(self, x, y, max_depth=3): 24 | self.x_data = x 25 | self.y_data = y 26 | # Calculate nodes. 27 | self.num_nodes = 2 ** max_depth - 1 28 | # Init root node. 29 | root_node = self.make_node(x, y) 30 | 31 | def _fit(_x, _y, _node): 32 | 33 | if self.num_nodes <= 0: 34 | return 35 | 36 | # Make R. 37 | x_r1, y_r1 = _x[:_node.i], _y[:_node.i] 38 | x_r2, y_r2 = _x[_node.i:], _y[_node.i:] 39 | 40 | # Make left node. 41 | l_node = self.make_node(x_r1, y_r1) 42 | _node.l_node = l_node 43 | 44 | self.num_nodes -= 1 45 | 46 | if _node.l_node: 47 | # Update offset. 48 | l_node.offset = _node.offset 49 | _fit(x_r1, y_r1, _node.l_node) 50 | 51 | # Make right node. 52 | r_node = self.make_node(x_r2, y_r2) 53 | _node.r_node = r_node 54 | 55 | self.num_nodes -= 1 56 | 57 | if _node.r_node: 58 | # Update offset. 
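# offset records where this node's sub-array begins in the original training data,
# so predict() can recover the split value via self.x_data[node.i + node.offset, node.j].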
59 | r_node.offset = _node.i + _node.offset 60 | _fit(x_r2, y_r2, _node.r_node) 61 | 62 | _fit(x, y, root_node) 63 | 64 | self._tree = root_node 65 | 66 | def predict(self, x): 67 | 68 | node = self._tree 69 | 70 | def _predict(_x, _node): 71 | 72 | val = self.x_data[_node.i + _node.offset, _node.j] 73 | 74 | if _x[_node.j] < val: 75 | if _node.l_node: 76 | return _predict(_x, _node.l_node) 77 | else: 78 | return _node.c1 79 | else: 80 | if _node.r_node: 81 | return _predict(_x, _node.r_node) 82 | else: 83 | return _node.c2 84 | 85 | return _predict(x, node) 86 | 87 | @staticmethod 88 | def make_node(x, y): 89 | # Get shape. 90 | rows, cols = x.shape 91 | if rows <= 1: 92 | return None 93 | # Init params. 94 | best_i, best_j = 1, 1 95 | best_c1, best_c2 = 0, 0 96 | best_loss = np.inf 97 | # Find best split. 98 | for i in range(1, rows): 99 | for j in range(0, cols): 100 | # Calculate c1, c2, loss. 101 | c1 = np.mean(y[:i]) 102 | c2 = np.mean(y[i:]) 103 | loss = np.mean(y[:i] - c1) + np.mean(y[i:] - c2) 104 | # Update best if need. 105 | if loss < best_loss: 106 | best_loss = loss 107 | best_i = i 108 | best_j = j 109 | best_c1 = c1 110 | best_c2 = c2 111 | node = Node(best_i, best_j, best_c1, best_c2) 112 | return node 113 | 114 | 115 | data_x = np.linspace(-10, 10, 20).reshape((-1, 1)) 116 | # data_y = np.linspace(-20, 20, 100) + np.random.normal(loc=0, scale=3.5, size=(100, )) 117 | data_y = 2 * data_x 118 | 119 | # data_x = np.array([-4, -3, -2, -1, 0, 1, 2, 3, 4]).reshape((-1, 1)) 120 | # data_y = np.array([-8, -6, -4, -2, 0, 2, 4, 6, 8]) 121 | 122 | 123 | t = RegressionTree() 124 | t.fit(data_x, data_y, max_depth=3) 125 | # print(t.predict([-4])) 126 | # print(t.predict([1])) 127 | # print(t.predict([2])) 128 | # print(t.predict([3])) 129 | # print(t.predict([4])) 130 | # print(t.predict([20])) 131 | print(t.predict([4.])) 132 | # print([t.predict(data_x[i, :].reshape((-1, ))) for i in range(0, 100)]) 133 | -------------------------------------------------------------------------------- /playground/PolicyGradient.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | self._init_summary_writer() 20 | 21 | self.a_buffer, self.s_buffer, self.r_buffer = [], [], [] 22 | 23 | self.session.run(tf.global_variables_initializer()) 24 | 25 | def _init_input(self, *args): 26 | with tf.variable_scope('input'): 27 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 28 | self.a = tf.placeholder(tf.int32, [None, ]) 29 | self.r = tf.placeholder(tf.float32, [None, ]) 30 | # Add summary. 31 | tf.summary.histogram('rewards', self.r) 32 | 33 | def _init_nn(self, *args): 34 | with tf.variable_scope('actor_net'): 35 | # Kernel initializer. 36 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 37 | # First dense. 38 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer) 39 | # Second dense. 40 | s_dense = tf.layers.dense(f_dense, 64, tf.nn.relu, kernel_initializer=w_initializer) 41 | # Action logits. 
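# The logits serve double duty: the softmax below gives the sampling distribution used in
# predict(), and the same logits feed the return-weighted cross-entropy (REINFORCE) loss.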
42 | self.a_logits = tf.layers.dense(s_dense, self.a_space, kernel_initializer=w_initializer) 43 | # Action prob.Î 44 | self.a_prob = tf.nn.softmax(self.a_logits) 45 | 46 | def _init_op(self): 47 | with tf.variable_scope('loss_func'): 48 | # one hot a. 49 | a_one_hot = tf.one_hot(self.a, self.a_space) 50 | # cross entropy. 51 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits) 52 | # loss func. 53 | self.loss_func = tf.reduce_mean(cross_entropy * self.r) 54 | # add summary. 55 | tf.summary.scalar('r_cross_entropy', self.loss_func) 56 | with tf.variable_scope('optimizer'): 57 | self.global_step = tf.Variable(initial_value=0) 58 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 59 | 60 | def predict(self, s): 61 | a_prob = self.session.run(self.a_prob, {self.s: [s]}) 62 | if self.mode == 'train': 63 | return np.random.choice(range(a_prob.shape[1]), p=a_prob.ravel()) 64 | else: 65 | return np.argmax(a_prob) 66 | 67 | def snapshot(self, s, a, r, _): 68 | self.a_buffer.append(a) 69 | self.s_buffer.append(s) 70 | self.r_buffer.append(r) 71 | 72 | def train(self): 73 | # Copy r_buffer 74 | r_buffer = self.r_buffer 75 | # Init r_tau 76 | r_tau = 0 77 | # Calculate r_tau 78 | for index in reversed(range(0, len(r_buffer))): 79 | r_tau = r_tau * self.gamma + r_buffer[index] 80 | self.r_buffer[index] = r_tau 81 | # Make ops. 82 | ops = [self.optimizer, self.loss_func] 83 | if self.training_step % 5 == 0: 84 | ops.append(self.merged_summary_op) 85 | # Minimize loss. 86 | results = self.session.run(ops, { 87 | self.s: self.s_buffer, 88 | self.a: self.a_buffer, 89 | self.r: self.r_buffer 90 | }) 91 | 92 | if self.training_step % 10 == 0: 93 | self.summary_writer.add_summary(results[-1], global_step=self.training_step) 94 | 95 | self.training_step += 1 96 | 97 | self.s_buffer, self.a_buffer, self.r_buffer = [], [], [] 98 | 99 | 100 | def main(_): 101 | # Make env. 102 | env = gym.make('CartPole-v0') 103 | env.seed(1) 104 | env = env.unwrapped 105 | # Init agent. 
106 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 107 | KEY_MODEL_NAME: 'PolicyGradient', 108 | KEY_TRAIN_EPISODE: 10000 109 | }) 110 | start_game(env, agent) 111 | 112 | 113 | if __name__ == '__main__': 114 | tf.app.run() 115 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/DQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个智能体(agent),尽可能不断地左右移动小车,使得小车上的杆不倒,我们首先定义CartPole游戏:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "CartPole游戏即是强化学习模型的enviorment,它与agent交互,实时更新state,内部定义了reward function,其中state有以下定义:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "state每一个维度分别代表了:\n", 31 | "\n", 32 | "- 小车位置,它的取值范围是-2.4到2.4\n", 33 | "- 小车速度,它的取值范围是负无穷到正无穷\n", 34 | "- 杆的角度,它的取值范围是-41.8°到41.8°\n", 35 | "- 杆的角速,它的取值范围是负无穷到正无穷" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "action是一个2维向量,每一个维度分别代表向左和向右移动。" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "$$\n", 50 | "action \\in \\mathbb{R}^2\n", 51 | "$$" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# DQN" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "我们将设计一个网络,作为状态-动作值函数(state-action value function),其输入是state,输出是对应各个action的value,并TD(Temporal Difference)进行迭代训练直至收敛。我们将定义两个这样的网络,分别记作$\\theta$和$\\theta^-$,分别代表估计网络与目标网络。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "我们希望最小化:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "$$\n", 80 | "\\left( y_j - Q \\left( \\phi_j, a_j; \\theta \\right) \\right)^2\n", 81 | "$$" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "其中,$a_j$具有以下形式:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "$$\n", 96 | "a_j = \\mathrm{argmax}_{a} Q \\left( \\phi(s_j), a; \\theta\\right)\n", 97 | "$$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "其中,$y_j$具有以下形式:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "$$\n", 112 | "f(x)=\n", 113 | "\\begin{cases}\n", 114 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 115 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( \\phi_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 116 | "\\end{cases}$$\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "在最小化TD-Error时,我们将固定目标网络,只对估计网络做梯度反向传播,每次到达一定迭代次数后,将估计网络的权重复制到目标网络。在这个过程中,需要用到经验回放(Experience Replay)技术,即将每一次迭代观测到的$s_t, r_t, a_t, s_{t+1}$作为一个元组缓存,然后在这些缓存中随机抽取元组做批次梯度下降。" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# 代码实现" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 1, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | 
"/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 144 | " return f(*args, **kwds)\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# coding=utf-8\n", 150 | "\n", 151 | "import tensorflow as tf\n", 152 | "import numpy as np\n", 153 | "import gym\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.5.4" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 1 185 | } 186 | -------------------------------------------------------------------------------- /playground/DQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | tf.set_random_seed(7) 10 | 11 | 12 | class Agent(BaseRLModel): 13 | 14 | def __init__(self, a_space, s_space, **options): 15 | super(Agent, self).__init__(a_space, s_space, **options) 16 | 17 | self._init_input() 18 | self._init_nn() 19 | self._init_op() 20 | self._init_saver() 21 | 22 | self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space)) 23 | self.buffer_count = 0 24 | 25 | self.update_target_net_step = 200 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s_n = tf.placeholder(tf.float32, [None, self.s_space]) 30 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 31 | self.r = tf.placeholder(tf.float32, [None, ]) 32 | self.a = tf.placeholder(tf.int32, [None, ]) 33 | 34 | def _init_nn(self, *args): 35 | # w,b initializer 36 | w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.00003) 37 | b_initializer = tf.constant_initializer(0.1) 38 | 39 | with tf.variable_scope('predict_q_net'): 40 | phi_state = tf.layers.dense(self.s, 41 | 64, 42 | tf.nn.relu, 43 | kernel_initializer=w_initializer, 44 | bias_initializer=b_initializer) 45 | 46 | self.q_predict = tf.layers.dense(phi_state, 47 | self.a_space, 48 | kernel_initializer=w_initializer, 49 | bias_initializer=b_initializer) 50 | 51 | with tf.variable_scope('target_q_net'): 52 | phi_state_next = tf.layers.dense(self.s_n, 53 | 64, 54 | tf.nn.relu, 55 | kernel_initializer=w_initializer, 56 | bias_initializer=b_initializer) 57 | 58 | self.q_target = tf.layers.dense(phi_state_next, 59 | self.a_space, 60 | kernel_initializer=w_initializer, 61 | bias_initializer=b_initializer) 62 | 63 | def _init_op(self): 64 | with tf.variable_scope('q_real'): 65 | # size of q_value_real is [BATCH_SIZE, 1] 66 | max_q_value = tf.reduce_max(self.q_target, axis=1) 67 | q_next = self.r + self.gamma * max_q_value 68 | self.q_next = tf.stop_gradient(q_next) 69 | 70 | with tf.variable_scope('q_predict'): 71 | # size of q_value_predict is [BATCH_SIZE, 1] 72 | action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) 73 | self.q_eval = tf.gather_nd(self.q_predict, 
action_indices) 74 | 75 | with tf.variable_scope('loss'): 76 | self.loss_func = tf.reduce_mean(tf.squared_difference(self.q_next, self.q_eval, name='mse')) 77 | 78 | with tf.variable_scope('train'): 79 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 80 | 81 | with tf.variable_scope('update_target_net'): 82 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net') 83 | p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net') 84 | self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)] 85 | 86 | self.session.run(tf.global_variables_initializer()) 87 | 88 | def predict(self, s): 89 | if np.random.uniform() < self.epsilon: 90 | a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]})) 91 | else: 92 | a = np.random.randint(0, self.a_space) 93 | return a 94 | 95 | def snapshot(self, s, a, r, s_n): 96 | self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n)) 97 | self.buffer_count += 1 98 | 99 | def train(self): 100 | for train_step in range(self.train_steps): 101 | # Update target net if need. 102 | if self.training_step % self.update_target_net_step == 0: 103 | self.session.run(self.update_q_net) 104 | # Get batch. 105 | if self.buffer_count < self.batch_size: 106 | batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :] 107 | else: 108 | batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :] 109 | 110 | s = batch[:, :self.s_space] 111 | s_n = batch[:, -self.s_space:] 112 | a = batch[:, self.s_space].reshape((-1)) 113 | r = batch[:, self.s_space + 1] 114 | 115 | _, cost = self.session.run([self.train_op, self.loss_func], { 116 | self.s: s, self.a: a, self.r: r * 5, self.s_n: s_n 117 | }) 118 | 119 | self.training_step += 1 120 | 121 | 122 | def main(_): 123 | # Make env. 124 | env = gym.make('CartPole-v0') 125 | env.seed(1) 126 | env = env.unwrapped 127 | # Init session. 128 | # Init agent. 
129 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 130 | KEY_MODEL_NAME: 'DQN', 131 | KEY_TRAIN_EPISODE: 500 132 | }) 133 | start_game(env, agent) 134 | 135 | 136 | if __name__ == '__main__': 137 | tf.app.run() 138 | -------------------------------------------------------------------------------- /playground/PPO.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | 20 | self.a_buffer = [] 21 | self.s_buffer = [] 22 | self.r_buffer = [] 23 | self.a_p_r_buffer = [] 24 | 25 | self.session.run(tf.global_variables_initializer()) 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s = tf.placeholder(tf.float32, [None, self.s_space], name='s') 30 | self.a = tf.placeholder(tf.int32, [None, ], name='a') 31 | self.r = tf.placeholder(tf.float32, [None, ], name='r') 32 | self.adv = tf.placeholder(tf.float32, [None, ], name='adv') 33 | self.a_p_r = tf.placeholder(tf.float32, [None, ], name='a_p_r') 34 | 35 | def _init_nn(self, *args): 36 | self.advantage, self.value = self._init_critic_net('critic_net') 37 | self.a_prob_eval, self.a_logits_eval = self._init_actor_net('eval_actor_net') 38 | self.a_prob_target, self.a_logits_target = self._init_actor_net('target_actor_net', trainable=False) 39 | 40 | def _init_op(self): 41 | with tf.variable_scope('critic_loss_func'): 42 | # loss func. 43 | self.c_loss_func = tf.losses.mean_squared_error(labels=self.r, predictions=self.value) 44 | with tf.variable_scope('critic_optimizer'): 45 | # critic optimizer. 46 | self.c_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.c_loss_func) 47 | with tf.variable_scope('update_target_actor_net'): 48 | # Get eval w, b. 49 | params_e = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_actor_net') 50 | params_t = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor_net') 51 | self.update_target_a_op = [tf.assign(t, e) for t, e in zip(params_t, params_e)] 52 | with tf.variable_scope('actor_loss_func'): 53 | # one hot a. 54 | a_one_hot = tf.one_hot(self.a, self.a_space) 55 | # Clip a_p_r. 56 | a_p_r = tf.clip_by_value(self.a_p_r, 1 - self.epsilon, 1 + self.epsilon) 57 | # cross entropy. 58 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits_eval) 59 | # loss func. 60 | self.a_loss_func = tf.reduce_mean(cross_entropy * a_p_r * self.adv) 61 | with tf.variable_scope('actor_optimizer'): 62 | self.a_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.a_loss_func) 63 | 64 | def _init_actor_net(self, scope, trainable=True): 65 | with tf.variable_scope(scope): 66 | # Kernel initializer. 67 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 68 | # First dense. 69 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer) 70 | # Second dense. 71 | s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer) 72 | # Action logits. 
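# This template builds both the eval actor and the frozen target actor; predict() takes the
# ratio of their (maximum) action probabilities, and the loss clips that ratio to
# [1 - epsilon, 1 + epsilon] to form the PPO surrogate objective.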
73 | a_logits = tf.layers.dense(s_dense, self.a_space, trainable=trainable, kernel_initializer=w_initializer) 74 | # Action prob. 75 | a_prob = tf.nn.softmax(a_logits) 76 | return a_prob, a_logits 77 | 78 | def _init_critic_net(self, scope): 79 | with tf.variable_scope(scope): 80 | # Kernel initializer. 81 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 82 | # First dense. 83 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer) 84 | # Value. 85 | value = tf.layers.dense(f_dense, 1) 86 | value = tf.reshape(value, [-1, ]) 87 | # Advantage. 88 | advantage = self.r - value 89 | return advantage, value 90 | 91 | def predict(self, s): 92 | # Calculate a eval prob. 93 | a_prob_eval, a_prob_target = self.session.run([self.a_prob_eval, self.a_prob_target], {self.s: [s]}) 94 | # Calculate action prob ratio between eval and target. 95 | a_p_r = np.max(a_prob_eval) / np.max(a_prob_target) 96 | self.a_p_r_buffer.append(a_p_r) 97 | return np.random.choice(range(a_prob_eval.shape[1]), p=a_prob_eval.ravel()) 98 | 99 | def snapshot(self, s, a, r, _): 100 | self.a_buffer.append(a) 101 | self.s_buffer.append(s) 102 | self.r_buffer.append(r) 103 | 104 | def train(self): 105 | self.session.run(self.update_target_a_op) 106 | # Copy r_buffer 107 | r_buffer = self.r_buffer 108 | # Init r_tau 109 | r_tau = 0 110 | # Calculate r_tau 111 | for index in reversed(range(0, len(r_buffer))): 112 | r_tau = r_tau * self.gamma + r_buffer[index] 113 | self.r_buffer[index] = r_tau 114 | # Calculate adv. 115 | adv_buffer = self.session.run(self.advantage, {self.s: self.s_buffer, self.r: self.r_buffer}) 116 | # Minimize loss. 117 | self.session.run([self.a_optimizer, self.c_optimizer], { 118 | self.adv: adv_buffer, 119 | self.s: self.s_buffer, 120 | self.a: self.a_buffer, 121 | self.r: self.r_buffer, 122 | self.a_p_r: self.a_p_r_buffer, 123 | }) 124 | self.s_buffer = [] 125 | self.a_buffer = [] 126 | self.r_buffer = [] 127 | self.a_p_r_buffer = [] 128 | 129 | 130 | def main(_): 131 | # Make env. 132 | env = gym.make('CartPole-v0') 133 | env.seed(1) 134 | env = env.unwrapped 135 | # Init agent. 136 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 137 | # KEY_MODE: 'test', 138 | KEY_MODEL_NAME: 'PPO', 139 | KEY_TRAIN_EPISODE: 10000 140 | }) 141 | start_game(env, agent) 142 | 143 | 144 | if __name__ == '__main__': 145 | tf.app.run() 146 | -------------------------------------------------------------------------------- /base/model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import tensorflow as tf 4 | 5 | from abc import abstractmethod 6 | from utility.logger import * 7 | from static import * 8 | 9 | KEY_TRAIN_EPISODE = 'train_episodes' 10 | KEY_LEARNING_RATE = 'learning_rate' 11 | KEY_ENABLE_EAGER = 'enable_eager' 12 | KEY_SAVE_EPISODE = 'save_episode' 13 | KEY_EVAL_EPISODE = 'eval_episode' 14 | KEY_BUFFER_SIZE = 'buffer_size' 15 | KEY_TRAIN_STEPS = 'train_steps' 16 | KEY_MODEL_NAME = 'model_name' 17 | KEY_BATCH_SIZE = 'batch_size' 18 | KEY_SEQ_LENGTH = 'seq_length' 19 | KEY_SAVE_DIR = 'save_dir' 20 | KEY_SESSION = 'session' 21 | KET_EPSILON = 'epsilon' 22 | KEY_GAMMA = 'gamma' 23 | KEY_MODE = 'mode' 24 | KEY_TAU = 'tau' 25 | 26 | 27 | class BaseModel(object): 28 | 29 | def __init__(self, **options): 30 | # Init vars. 31 | self.mode = 'train' 32 | self.save_dir = None 33 | self.training_step = 0 34 | self.checkpoint_path = None 35 | # Init parameters. 
36 | self._init_options(options) 37 | self._init_logger() 38 | 39 | def _init_logger(self): 40 | self.logger = get_logger(self.model_name, self.mode, 'algo') 41 | 42 | def _init_saver(self): 43 | save_dir = os.path.join(CHECKPOINTS_DIR, self.model_name) 44 | if not os.path.exists(save_dir): 45 | os.makedirs(save_dir) 46 | self.checkpoint_path = os.path.join(CHECKPOINTS_DIR, self.model_name, save_dir, 'ckpt') 47 | self.saver = tf.train.Saver() 48 | 49 | def _init_summary_writer(self): 50 | self.summary_path = os.path.join(SUMMARIES_DIR, self.model_name, DATETIME_NOW) 51 | self.summary_writer = tf.summary.FileWriter(self.summary_path, graph=self.session.graph) 52 | self.merged_summary_op = tf.summary.merge_all() 53 | 54 | def _init_options(self, options): 55 | 56 | try: 57 | self.enable_eager = options[KEY_ENABLE_EAGER] 58 | except KeyError: 59 | self.enable_eager = False 60 | 61 | try: 62 | self.session = options[KEY_SESSION] 63 | except KeyError: 64 | if not self.enable_eager: 65 | self.session = tf.Session() 66 | 67 | try: 68 | self.model_name = options[KEY_MODEL_NAME] 69 | except KeyError: 70 | self.model_name = 'model' 71 | 72 | try: 73 | self.mode = options[KEY_MODE] 74 | except KeyError: 75 | self.mode = 'train' 76 | 77 | try: 78 | self.learning_rate = options[KEY_LEARNING_RATE] 79 | except KeyError: 80 | self.learning_rate = 0.001 81 | 82 | try: 83 | self.batch_size = options[KEY_BATCH_SIZE] 84 | except KeyError: 85 | self.batch_size = 128 86 | 87 | try: 88 | self.seq_length = options[KEY_SEQ_LENGTH] 89 | except KeyError: 90 | self.seq_length = 5 91 | 92 | def save(self): 93 | # Save checkpoint. 94 | self.saver.save(self.session, self.checkpoint_path) 95 | self.logger.warning("Saver reach checkpoint.") 96 | 97 | def restore(self): 98 | self.saver.restore(self.session, self.checkpoint_path) 99 | 100 | 101 | class BaseRLModel(BaseModel): 102 | 103 | def __init__(self, a_space, s_space, **options): 104 | super(BaseRLModel, self).__init__(**options) 105 | # Init spaces. 106 | self.a_space, self.s_space = a_space, s_space 107 | # Init buffer count. 
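# ----------------------------------------------------------------------------
# Illustrative aside: buffer_count is the running write index that the replay
# buffer agents (e.g. playground/DoubleDQN.py below) combine with a fixed-size
# buffer to get ring-buffer behaviour. A minimal standalone sketch of that
# storage convention follows; the sizes are made up for illustration.

import numpy as np

s_space, buffer_size = 4, 500
buffer = np.zeros((buffer_size, s_space + 1 + 1 + s_space))  # rows: (s, a, r, s_n)
buffer_count = 0


def store_transition(s, a, r, s_n):
    """Pack one transition into a flat row, overwriting the oldest row when full."""
    global buffer_count
    buffer[buffer_count % buffer_size, :] = np.hstack((s, [a, r], s_n))
    buffer_count += 1

# When a batch is sampled, the columns are recovered by position:
#   batch[:, :s_space]     -> s
#   batch[:, s_space]      -> a
#   batch[:, s_space + 1]  -> r
#   batch[:, -s_space:]    -> s_n
# ----------------------------------------------------------------------------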
108 | self.buffer_count = 0 109 | 110 | def _init_options(self, options): 111 | super(BaseRLModel, self)._init_options(options) 112 | 113 | try: 114 | self.train_episodes = options[KEY_TRAIN_EPISODE] 115 | except KeyError: 116 | self.train_episodes = 1000 117 | 118 | try: 119 | self.train_steps = options[KEY_TRAIN_STEPS] 120 | except KeyError: 121 | self.train_steps = 1000 122 | 123 | try: 124 | self.eval_episodes = options[KEY_EVAL_EPISODE] 125 | except KeyError: 126 | self.eval_episodes = 1 127 | 128 | try: 129 | self.gamma = options[KEY_GAMMA] 130 | except KeyError: 131 | self.gamma = 0.95 132 | 133 | try: 134 | self.tau = options[KEY_TAU] 135 | except KeyError: 136 | self.tau = 0.01 137 | 138 | try: 139 | self.epsilon = options[KET_EPSILON] 140 | except KeyError: 141 | self.epsilon = 0.9 142 | 143 | try: 144 | self.buffer_size = options[KEY_BUFFER_SIZE] 145 | except KeyError: 146 | self.buffer_size = 500 147 | 148 | try: 149 | self.save_episode = options[KEY_SAVE_EPISODE] 150 | except KeyError: 151 | self.save_episode = 50 152 | 153 | @abstractmethod 154 | def _init_input(self, *args): 155 | pass 156 | 157 | @abstractmethod 158 | def _init_nn(self, *args): 159 | pass 160 | 161 | @abstractmethod 162 | def _init_op(self): 163 | pass 164 | 165 | @abstractmethod 166 | def train(self): 167 | pass 168 | 169 | @abstractmethod 170 | def predict(self, s): 171 | pass 172 | 173 | @abstractmethod 174 | def snapshot(self, s, a, r, s_n): 175 | pass 176 | 177 | 178 | class BaseSLModel(BaseModel): 179 | 180 | def __init__(self, x_space, y_space, x_train, y_train, x_test, y_test, **options): 181 | super(BaseSLModel, self).__init__(**options) 182 | self.x_train, self.y_train = x_train, y_train 183 | self.x_test, self.y_test = x_test, y_test 184 | self.x_space, self.y_space = x_space, y_space 185 | 186 | def _init_options(self, options): 187 | super(BaseSLModel, self)._init_options(options) 188 | 189 | @abstractmethod 190 | def _init_input(self, *args): 191 | pass 192 | 193 | @abstractmethod 194 | def _init_nn(self, *args): 195 | pass 196 | 197 | @abstractmethod 198 | def _init_op(self): 199 | pass 200 | 201 | @abstractmethod 202 | def train(self, *args): 203 | pass 204 | 205 | @abstractmethod 206 | def predict(self, s): 207 | pass 208 | 209 | @abstractmethod 210 | def evaluate(self, *args): 211 | pass 212 | 213 | -------------------------------------------------------------------------------- /playground/DoubleDQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | 20 | self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space)) 21 | self.buffer_count = 0 22 | 23 | self.update_target_net_step = 200 24 | 25 | self.session.run(tf.global_variables_initializer()) 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s_n = tf.placeholder(tf.float32, [None, self.s_space]) 30 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 31 | self.q_n = tf.placeholder(tf.float32, [None, ]) 32 | self.r = tf.placeholder(tf.float32, [None, ]) 33 | self.a = tf.placeholder(tf.int32, [None, ]) 34 | 35 | def _init_nn(self, *args): 36 | # w,b 
initializer 37 | w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3) 38 | b_initializer = tf.constant_initializer(0.1) 39 | 40 | with tf.variable_scope('predict_q_net'): 41 | phi_state = tf.layers.dense(self.s, 42 | 64, 43 | tf.nn.relu, 44 | kernel_initializer=w_initializer, 45 | bias_initializer=b_initializer) 46 | 47 | self.q_predict = tf.layers.dense(phi_state, 48 | self.a_space, 49 | kernel_initializer=w_initializer, 50 | bias_initializer=b_initializer) 51 | 52 | with tf.variable_scope('target_q_net'): 53 | phi_state_next = tf.layers.dense(self.s_n, 54 | 64, 55 | tf.nn.relu, 56 | kernel_initializer=w_initializer, 57 | bias_initializer=b_initializer, 58 | trainable=False) 59 | 60 | self.q_target = tf.layers.dense(phi_state_next, 61 | self.a_space, 62 | kernel_initializer=w_initializer, 63 | bias_initializer=b_initializer, 64 | trainable=False) 65 | 66 | def _init_op(self): 67 | 68 | with tf.variable_scope('q_predict'): 69 | # size of q_value_predict is [BATCH_SIZE, 1] 70 | action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) 71 | self.q_eval = tf.gather_nd(self.q_predict, action_indices) 72 | 73 | with tf.variable_scope('loss'): 74 | self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval) 75 | 76 | with tf.variable_scope('train'): 77 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 78 | 79 | with tf.variable_scope('update_target_net'): 80 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net') 81 | p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net') 82 | self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)] 83 | 84 | def predict(self, s): 85 | if np.random.uniform() < self.epsilon or self.mode == 'test': 86 | a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]})) 87 | else: 88 | a = np.random.randint(0, self.a_space) 89 | return a 90 | 91 | def snapshot(self, s, a, r, s_n): 92 | self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n)) 93 | self.buffer_count += 1 94 | 95 | def train(self): 96 | 97 | for train_step in range(self.train_steps): 98 | # Update target net if need. 99 | if self.training_step % self.update_target_net_step == 0: 100 | self.session.run(self.update_q_net) 101 | # Get batch. 102 | if self.buffer_count < self.buffer_size: 103 | batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :] 104 | else: 105 | batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :] 106 | 107 | s = batch[:, :self.s_space] 108 | s_n = batch[:, -self.s_space:] 109 | a = batch[:, self.s_space].reshape((-1)) 110 | r = batch[:, self.s_space + 1] 111 | 112 | # 1. Calculate q_next_predict and q_next_target. 113 | q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], { 114 | self.s: s_n, self.s_n: s_n 115 | }) 116 | 117 | # 2. Select a_indices in q_next_predict. 118 | a_indices = np.argmax(q_next_predict, axis=1) 119 | 120 | # 3. Select Q values with a_indices 121 | q_next = q_next_target[np.arange(0, self.batch_size), a_indices] 122 | 123 | # 4. Calculate q_real. 124 | q_real = r + self.gamma * q_next 125 | 126 | _, cost = self.session.run([self.train_op, self.loss_func], { 127 | self.s: s, self.a: a, self.q_n: q_real 128 | }) 129 | 130 | self.training_step += 1 131 | 132 | 133 | if __name__ == '__main__': 134 | 135 | def main(_): 136 | # Make env. 
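# ----------------------------------------------------------------------------
# Illustrative aside: a standalone NumPy sketch of the target computed in
# train() above, next to the vanilla DQN target it replaces. The Q-values,
# rewards and gamma below are made-up numbers, chosen so the two targets differ.

import numpy as np

q_next_online = np.array([[1.0, 2.0, 0.5],   # online ("predict") net, batch of 2
                          [0.3, 0.1, 0.9]])
q_next_target = np.array([[0.8, 1.5, 1.7],   # target net
                          [0.7, 0.2, 0.6]])
r = np.array([1.0, 0.0])
gamma = 0.95

# Vanilla DQN: the target net both selects and evaluates the next action.
dqn_target = r + gamma * q_next_target.max(axis=1)                      # [2.615, 0.665]

# Double DQN (steps 1-4 in train() above): the online net selects the action,
# the target net evaluates it, which damps the over-estimation of Q-values.
a_sel = q_next_online.argmax(axis=1)                                    # [1, 2]
ddqn_target = r + gamma * q_next_target[np.arange(len(a_sel)), a_sel]   # [2.425, 0.57]

# Note: the Agent below is registered with KEY_MODEL_NAME 'PPO', so its
# checkpoints share a directory with playground/PPO.py unless the name is changed.
# ----------------------------------------------------------------------------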
137 | env = gym.make('CartPole-v0') 138 | env.seed(1) 139 | env = env.unwrapped 140 | # Init agent. 141 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 142 | KEY_MODEL_NAME: 'PPO', 143 | KEY_TRAIN_EPISODE: 10000 144 | }) 145 | start_game(env, agent) 146 | 147 | 148 | if __name__ == '__main__': 149 | tf.app.run() 150 | -------------------------------------------------------------------------------- /playground/TensorFlowServing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import requests 6 | import logging 7 | import shutil 8 | import os 9 | 10 | from static import CHECKPOINTS_DIR 11 | 12 | from grpc.beta import implementations 13 | from tensorflow_serving.apis import predict_pb2 14 | from tensorflow_serving.apis import prediction_service_pb2 15 | 16 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 17 | 18 | data_save_dir = os.path.join(CHECKPOINTS_DIR, 'TensorFlowServing') 19 | graph_save_dir = os.path.join(CHECKPOINTS_DIR, 'TensorFlowServing', 'graph') 20 | 21 | # x_test = np.linspace(2 * -np.pi, 2 * np.pi, num=100).reshape((-1, 1)) 22 | # y_test = np.sin(x_test) 23 | 24 | # x_train = x_test + np.random.normal(0.3, 0.003) 25 | # y_train = np.sin(x_train) + np.random.normal(0.0, 0.00003) 26 | 27 | # x_train = x_train.astype(np.float32) 28 | # y_train = y_train.astype(np.float32) 29 | 30 | x_train = np.load(os.path.join(data_save_dir, 'x_train.npy')).astype(np.float32) 31 | y_train = np.load(os.path.join(data_save_dir, 'y_train.npy')).astype(np.float32) 32 | 33 | np.save(os.path.join(data_save_dir, 'x_train.npy'), x_train) 34 | np.save(os.path.join(data_save_dir, 'y_train.npy'), y_train) 35 | 36 | 37 | def train(): 38 | 39 | session = tf.Session() 40 | 41 | x_input = tf.placeholder(tf.float32, [None, 1], name='x_input') 42 | y_input = tf.placeholder(tf.float32, [None, 1], name='y_input') 43 | 44 | fc1 = tf.layers.dense(x_input, 10, tf.nn.relu) 45 | fc2 = tf.layers.dense(fc1, 10, tf.nn.relu) 46 | 47 | y_predict = tf.layers.dense(fc2, 1) 48 | 49 | loss_func = tf.losses.mean_squared_error(labels=y_input, predictions=y_predict) 50 | 51 | optimizer = tf.train.AdamOptimizer().minimize(loss_func) 52 | 53 | session.run(tf.global_variables_initializer()) 54 | 55 | signature = tf.saved_model.signature_def_utils.build_signature_def( 56 | inputs={ 57 | 'x_input': tf.saved_model.utils.build_tensor_info(x_input), 58 | 'y_input': tf.saved_model.utils.build_tensor_info(y_input) 59 | }, 60 | outputs={ 61 | 'y_predict': tf.saved_model.utils.build_tensor_info(y_predict), 62 | 'loss_func': tf.saved_model.utils.build_tensor_info(loss_func) 63 | }, 64 | method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME 65 | ) 66 | 67 | for step in range(2000): 68 | session.run(optimizer, { 69 | x_input: x_train, 70 | y_input: y_train 71 | }) 72 | if (step + 1) % 500 == 0: 73 | if os.path.exists(graph_save_dir): 74 | shutil.rmtree(graph_save_dir) 75 | builder = tf.saved_model.builder.SavedModelBuilder(graph_save_dir) 76 | builder.add_meta_graph_and_variables(session, 77 | [tf.saved_model.tag_constants.SERVING], 78 | {tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature}) 79 | # builder.add_meta_graph([tf.saved_model.tag_constants.SERVING], {'signature': signature}) 80 | builder.save() 81 | 82 | loss = session.run(loss_func, { 83 | x_input: x_train, 84 | y_input: y_train 85 | }) 86 | 87 | logging.warning('Loss: {}'.format(loss)) 88 | # 
builder.add_meta_graph_and_variables(session, [tf.saved_model.tag_constants.TRAINING], {'signature': signature}) 89 | # builder.add_meta_graph([tf.saved_model.tag_constants.SERVING], {'signature': signature}) 90 | # builder.save() 91 | 92 | 93 | def test(): 94 | # Session. 95 | session = tf.Session() 96 | # Load meta graph. 97 | meta_graph_def = tf.saved_model.loader.load(session, [tf.saved_model.tag_constants.SERVING], graph_save_dir) # type: tf.MetaGraphDef 98 | # Get signature. 99 | signature_def = meta_graph_def.signature_def 100 | signature = signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] 101 | # Get input tensor. 102 | x_input_tensor = signature.inputs['x_input'].name 103 | y_input_tensor = signature.inputs['y_input'].name 104 | # Get output tensor. 105 | y_predict_tensor = signature.outputs['y_predict'].name 106 | # Get loss func. 107 | loss_op = signature.outputs['loss_func'].name 108 | 109 | _, loss = session.run([y_predict_tensor, loss_op], { 110 | x_input_tensor: x_train, 111 | y_input_tensor: y_train, 112 | }) 113 | 114 | logging.warning('Loss: {}'.format(loss)) 115 | 116 | 117 | def inference_v1(): 118 | # Init channel. 119 | channel = implementations.insecure_channel('localhost', 9000) 120 | # Init stub. 121 | stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) 122 | # Init request. 123 | request = predict_pb2.PredictRequest() 124 | request.model_spec.name = 'test' 125 | request.model_spec.signature_name = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY 126 | request.inputs['x_input'].CopyFrom( 127 | tf.contrib.util.make_tensor_proto(x_train, shape=x_train.shape) 128 | ) 129 | request.inputs['y_input'].CopyFrom( 130 | tf.contrib.util.make_tensor_proto(y_train, shape=y_train.shape) 131 | ) 132 | # Predict. 133 | future = stub.Predict.future(request, 2.0) 134 | result = future.result().outputs['loss_func'].float_val 135 | logging.warning('Loss: {}'.format(result)) 136 | 137 | 138 | def inference_v2(): 139 | # Init url. 140 | url = "http://localhost:9001/v1/models/test:predict" 141 | # url = 'http://172.16.11.43:10000/tool_list/test' 142 | # Init body. 143 | import json 144 | body = { 145 | # 'signature_name': tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, 146 | 'instances': [ 147 | { 148 | 'x_input': json.dumps(x_train.tolist(), ensure_ascii=True) 149 | } 150 | ] 151 | } 152 | # Post. 
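# ----------------------------------------------------------------------------
# Illustrative aside: a sketch of the request shape TensorFlow Serving's REST
# API usually expects in "row" format: one JSON object per example, keyed by
# the signature's input names, with plain (nested) lists as values. Passing
# the dict via requests' json= argument serializes it once; wrapping the body
# in json.dumps first (as in the call below) encodes it a second time, which
# the server will generally not accept. predict_rest and its arguments are
# hypothetical names used only for this example.

import requests


def predict_rest(url, x_batch, y_batch):
    payload = {
        # 'serving_default' is the value of DEFAULT_SERVING_SIGNATURE_DEF_KEY
        # used when the graph was exported in train() above.
        'signature_name': 'serving_default',
        'instances': [
            {'x_input': x_row, 'y_input': y_row}
            for x_row, y_row in zip(x_batch.tolist(), y_batch.tolist())
        ],
    }
    return requests.post(url, json=payload, timeout=2.0)

# e.g. predict_rest("http://localhost:9001/v1/models/test:predict", x_train, y_train)
# ----------------------------------------------------------------------------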
153 | # response = requests.post(url, data=body) 154 | response = requests.post(url, json=json.dumps(body)) 155 | logging.warning('{}'.format(response.text)) 156 | return response 157 | 158 | 159 | train() 160 | # test() 161 | # inference_v1() 162 | # inference_v2() 163 | 164 | 165 | # plt.plot(y_train) 166 | # plt.plot(y_test) 167 | # plt.show() 168 | 169 | -------------------------------------------------------------------------------- /note/GloVe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GloVe" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 问题设定" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "GloVe是Global Vectors for Word Representation的缩写。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "对于One-hot词向量:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "$$\n", 36 | "\\begin{aligned}\n", 37 | "I &= [1, 0, 0] \\\\\n", 38 | "Like &= [0, 1, 0] \\\\\n", 39 | "Apple &= [0, 0, 1] \n", 40 | "\\end{aligned}\n", 41 | "$$" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "无法通过两向量夹角余弦值计算其相似度,word2vec是一种嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度,但是word2vec仅仅考虑了两个词在一段上下文的相关度,而GloVe考虑了两个词向量在全文中的相关度。" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 共现矩阵(Co-occurrence Probabilities Matrix)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "训练GloVe模型前,首先需要构建一个共现矩阵,设词表大小为V,共现矩阵将是一个V行V列的方阵,而第i行第j列的表示了以第i个中心词$w_i$,第j个背景词$w_j$出现的次数。" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "假设我们有上下文:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "an\\ apple\\ a\\ day\\ keeps\\ an\\ apple\\ a\\ day\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "我们设定滑窗大小m等于2,我们将会有如下中心词-背景词对:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "中心词 | 背景词 |\n", 93 | ":---: | :---: |\n", 94 | "an | apple, a |\n", 95 | "apple | an, a, day |\n", 96 | "a | an, apple, day, keeps |\n", 97 | "day | apple, a, keeps, an |\n", 98 | "keeps | a, day, an, apple |\n", 99 | "an | day, keeps, apple, a |\n", 100 | "apple | keeps, an, a, day |\n", 101 | "a | an, apple, day |\n", 102 | "day | apple, a |" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "然后遍历中心词-背景词对,更新共现矩阵,以上图为例,最后共现矩阵的结果将有如下形式:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "| | An | apple | a | day | keeps |\n", 117 | "| - | - | - | - | - | - |\n", 118 | "| An | 0 | 2 | 2 | 1 | 1 |\n", 119 | "| apple | 2 | 0 | 2 | 2 | 1 |\n", 120 | "| a | 2 | 2 | 0 | 2 | 1 |\n", 121 | "| day | 0 | 2 | 2 | 0 | 1 |\n", 122 | "| keeps | 1 | 1 | 1 | 1 | 0 |" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "共现矩阵揭示了某种规律,定义共现矩阵的第i行的和为:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "$$\n", 137 | "X_i = \\sum^{V}_{j=1}X_{i, j}\n", 138 | "$$" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 
| "source": [ 145 | "之后我们有条件概率,即第j列对应的词出现在第i行上下文中的条件概率:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\mathbb{P}_{i, j} = \\frac{X_{i, j}}{X_i}\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "而对于某个词$w_k$,他在第i行或者第j行上下文出现的条件概率的比值:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "$$\n", 169 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}}\n", 170 | "$$" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "这个值是可以直接观测并计算到的,并将会有如下规律:\n", 178 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$相关,那么这个比值将会趋近于1\n", 179 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$不相关,那么这个比值将会很小\n", 180 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$相关,那么这个比值将会很大\n", 181 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$不相关,那么这个比值将会趋近于1" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 损失函数" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "我们希望设计一个损失函数,希望对词表内每两个词对,$w_i$与$w_j$,尽可能与$w_k$在共现矩阵中对于第i, j上下文中,出现的条件概率比值相近:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "$$\n", 203 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}} = \\frac{\\exp (v^T_i v_k) }{\\exp (v^T_j v_k)}\n", 204 | "$$" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "两边取对数,对于分子分母:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "$$\n", 219 | "\\log \\frac{X_{i, k}}{X_i} = \\log X_{i, k} - \\log X_i = v^T_i v_k \\\\\n", 220 | "\\log \\frac{X_{j, k}}{X_j} = \\log X_{j, k} - \\log X_j = v^T_j v_k\n", 221 | "$$" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "可以看到问题得到了简化,我们希望左式的分子尽可能等于右式的分子,分母亦然,则问题被简化为:对于词表内任意一组词对i, j,我们希望最小化下式:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "$$\n", 236 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} \\left( v^T_i v_j - \\log X_i - \\log(X_{i, j})\\right )^2\n", 237 | "$$" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "其中偏置项$b_i, b_j$将会替换$\\log X_i$:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "但是并不是每一个词对都是平权的,需要考虑词频来设定每一个词对的权重:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "$$\n", 259 | "f(X_{i, j}) = \n", 260 | "\\begin{cases}\n", 261 | "(X_{i, j} \\ /\\ C)^{0.75}& \\text{ X > c }\\\\\n", 262 | "1& \\text{ X < 0}\n", 263 | "\\end{cases}\n", 264 | "$$" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "最后我们希望最小化:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "$$\n", 279 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} f(X_{i, j}) \\left( v^T_i v_j + b_i + b_j - \\log(X_{i, j})\\right )^2\n", 280 | "$$" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "最后使用中心词向量$v_j$与背景词向量$v_i$的和作为中心词向量的表示。" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 
302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.4" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/GloVe-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GloVe" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 问题设定" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "GloVe是Global Vectors for Word Representation的缩写。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "对于One-hot词向量:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "$$\n", 36 | "\\begin{aligned}\n", 37 | "I &= [1, 0, 0] \\\\\n", 38 | "Like &= [0, 1, 0] \\\\\n", 39 | "Apple &= [0, 0, 1] \n", 40 | "\\end{aligned}\n", 41 | "$$" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "无法通过两向量夹角余弦值计算其相似度,word2vec是一种嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度,但是word2vec仅仅考虑了两个词在一段上下文的相关度,而GloVe考虑了两个词向量在全文中的相关度。" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 共现矩阵(Co-occurrence Probabilities Matrix)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "训练GloVe模型前,首先需要构建一个共现矩阵,设词表大小为V,共现矩阵将是一个V行V列的方阵,而第i行第j列的表示了以第i个中心词$w_i$,第j个背景词$w_j$出现的次数。" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "假设我们有上下文:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "an\\ apple\\ a\\ day\\ keeps\\ an\\ apple\\ a\\ day\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "我们设定滑窗大小m等于2,我们将会有如下中心词-背景词对:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "中心词 | 背景词 |\n", 93 | ":---: | :---: |\n", 94 | "an | apple, a |\n", 95 | "apple | an, a, day |\n", 96 | "a | an, apple, day, keeps |\n", 97 | "day | apple, a, keeps, an |\n", 98 | "keeps | a, day, an, apple |\n", 99 | "an | day, keeps, apple, a |\n", 100 | "apple | keeps, an, a, day |\n", 101 | "a | an, apple, day |\n", 102 | "day | apple, a |" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "然后遍历中心词-背景词对,更新共现矩阵,以上图为例,最后共现矩阵的结果将有如下形式:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "| | An | apple | a | day | keeps |\n", 117 | "| - | - | - | - | - | - |\n", 118 | "| An | 0 | 2 | 2 | 1 | 1 |\n", 119 | "| apple | 2 | 0 | 2 | 2 | 1 |\n", 120 | "| a | 2 | 2 | 0 | 2 | 1 |\n", 121 | "| day | 0 | 2 | 2 | 0 | 1 |\n", 122 | "| keeps | 1 | 1 | 1 | 1 | 0 |" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "共现矩阵揭示了某种规律,定义共现矩阵的第i行的和为:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "$$\n", 137 | "X_i = \\sum^{V}_{j=1}X_{i, j}\n", 138 | "$$" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "之后我们有条件概率,即第j列对应的词出现在第i行上下文中的条件概率:" 146 | ] 
147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\mathbb{P}_{i, j} = \\frac{X_{i, j}}{X_i}\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "而对于某个词$w_k$,他在第i行或者第j行上下文出现的条件概率的比值:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "$$\n", 169 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}}\n", 170 | "$$" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "这个值是可以直接观测并计算到的,并将会有如下规律:\n", 178 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$相关,那么这个比值将会趋近于1\n", 179 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$不相关,那么这个比值将会很小\n", 180 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$相关,那么这个比值将会很大\n", 181 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$不相关,那么这个比值将会趋近于1" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 损失函数" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "我们希望设计一个损失函数,希望对词表内每两个词对,$w_i$与$w_j$,尽可能与$w_k$在共现矩阵中对于第i, j上下文中,出现的条件概率比值相近:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "$$\n", 203 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}} = \\frac{\\exp (v^T_i v_k) }{\\exp (v^T_j v_k)}\n", 204 | "$$" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "两边取对数,对于分子分母:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "$$\n", 219 | "\\log \\frac{X_{i, k}}{X_i} = \\log X_{i, k} - \\log X_i = v^T_i v_k \\\\\n", 220 | "\\log \\frac{X_{j, k}}{X_j} = \\log X_{j, k} - \\log X_j = v^T_j v_k\n", 221 | "$$" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "可以看到问题得到了简化,我们希望左式的分子尽可能等于右式的分子,分母亦然,则问题被简化为:对于词表内任意一组词对i, j,我们希望最小化下式:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "$$\n", 236 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} \\left( v^T_i v_j - \\log X_i - \\log(X_{i, j})\\right )^2\n", 237 | "$$" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "其中偏置项$b_i, b_j$将会替换$\\log X_i$:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "但是并不是每一个词对都是平权的,需要考虑词频来设定每一个词对的权重:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "$$\n", 259 | "f(X_{i, j}) = \n", 260 | "\\begin{cases}\n", 261 | "(X_{i, j} \\ /\\ C)^{0.75}& \\text{ X > c }\\\\\n", 262 | "1& \\text{ X < 0}\n", 263 | "\\end{cases}\n", 264 | "$$" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "最后我们希望最小化:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "$$\n", 279 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} f(X_{i, j}) \\left( v^T_i v_j + b_i + b_j - \\log(X_{i, j})\\right )^2\n", 280 | "$$" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "最后使用中心词向量$v_j$与背景词向量$v_i$的和作为中心词向量的表示。" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 
304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.4" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /note/Word2Vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Word2Vec" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 问题设定" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "对于One-hot的词向量:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "$$\n", 31 | "\\begin{aligned}\n", 32 | "I &= [1, 0, 0] \\\\\n", 33 | "Like &= [0, 1, 0] \\\\\n", 34 | "Apple &= [0, 0, 1] \n", 35 | "\\end{aligned}\n", 36 | "$$" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "无法通过两向量夹角余弦值计算其相似度,word2vec提供了Skip-Gram(跳字模型)与CBOW(连续词袋模型)两个词嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Skip-Gram" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "即跳字模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个中心词,预测滑窗内$m - 1$个背景词。即如果上下文是:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "$$\n", 65 | "I\\ eat\\ apple\\ every\\ day\n", 66 | "$$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "对每一个词进行One-hot编码:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "$$\n", 81 | "\\begin{aligned}\n", 82 | "I &= [1, 0, 0, 0, 0] \\\\ \n", 83 | "eat &= [0, 1, 0, 0, 0] \\\\\n", 84 | "apple &= [0, 0, 1, 0, 0] \\\\\n", 85 | "every &= [0, 0, 0, 1, 0] \\\\\n", 86 | "day &= [0, 0, 0, 0, 1]\n", 87 | "\\end{aligned}\n", 88 | "$$" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "设定滑窗大小为$2$,如果选择中心词$apple$,那么将会有以下训练数据:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "$$\n", 103 | "\\begin{aligned}\n", 104 | "x &= [0, 0, 1, 0, 0] \\\\ \n", 105 | "y &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]\n", 106 | "\\end{aligned}\n", 107 | "$$" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "设计一个只有1个输入层、1个隐藏层、1个输出层的神经网络,其中输出层的神经元个数等于输入层即等于One-hot编码的维度,而隐含层的神经元个数通常远小于输出层,比如One-hot维度如果是10000,隐含层可以只有300个神经元:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "我们通过最大化似然函数:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "$$\n", 129 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right)\n", 130 | "$$" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "即对于上下文内所有的词,给定中心词$w^i$,预测滑窗内其他词,越准确越好。对上式取对数并展开:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{aligned}\n", 146 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) &= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\mathbb{P} \\left( w^{i+j} \\ 
\\lvert \\ w^i \\right) \\\\\n", 147 | "&= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\left( \\frac{\\exp(\\mathrm{u^T_{i+j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{N}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}} \\right) \\\\\n", 148 | "\\end{aligned}\n", 149 | "$$" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "其中,$\\mathrm{v_i}$即是隐藏层的权重,也是隐藏层的输入$z_i$,也是第i个词的词向量,$\\mathrm{u_{i+j}}$是输出层的权重,也是第i+j个词的词向量的另一个表达。最大化上式的最大似然函数,即最小化下式交叉熵:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "$$\n", 164 | "- \\sum^{N}_{i=1} \\mathrm{y_i} \\cdot \\log \\mathrm{p_i}\n", 165 | "$$" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "其中$\\mathrm{y_i}$与$\\mathrm{p_i}$是维度为词表长度的向量,分别代表观测值与计算值,对$\\mathrm{v_i}$求梯度有:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "$$\n", 180 | "\\begin{aligned}\n", 181 | "\\frac {\\partial \\log \\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right)} {\\mathrm{v_i}} &= \\frac {\\partial \\log \\left( \\exp(\\mathrm{u^T_{j} \\cdot v_{i}} ) \\right) - \\log \\left ( \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})} \\right)}{\\partial \\mathrm{v_{i}}} \\\\\n", 182 | "&= \\mathrm{u_{j}} - \\frac{1}{\\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\left[ \\sum^{V}_{k=1} \\exp(\\mathrm{u^T_k v_i) \\cdot \\mathrm{u_k}} \\right] \\\\\n", 183 | "&= \\mathrm{u_{j}} - \\sum^{V}_{k=1} \\frac{ \\exp(\\mathrm{u^T_k v_i}) }{ \\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\cdot \\mathrm{u_k}\n", 184 | "\\end{aligned}\n", 185 | "$$" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "然后使用梯度下降更新$\\mathrm{v_i}$,此处的$\\mathrm{v_i}$是向量,在网络中,即是输入层的第i个神经元到隐含层的权重。" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## CBOW" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "即Continuous Bag of Words,连续词袋模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个背景词,$m - 1$个中心词,与Skip-Gram相反,设定滑窗大小为$2$,如果选择中心词$\\ I,\\ eat,\\ every,\\ day$,那么将会有以下训练数据:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "$$\n", 214 | "\\begin{aligned}\n", 215 | "x &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1] \\\\ \n", 216 | "y &= [0, 0, 1, 0, 0] \\\\ \n", 217 | "\\end{aligned}\n", 218 | "$$" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "而对于概率:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "$$\n", 233 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^{i-m}, \\cdots, w^i, \\cdots, w^{i+m} \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m)}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m})}\n", 234 | "$$" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "与Skip-Gram的不同之处在于将中心词求和后平均,之后的梯度计算与更新和Skip-Gram相同,这里就不展开了。" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## 负采样" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "可以直观地从上面的梯度更新公式中看到,每一次更新都伴随着巨量的计算开销,这个计算开销主要是因为Softmax函数的分母。可以使用负采样替换Softmax,减少计算开销。" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 
260 | "metadata": {}, 261 | "source": [ 262 | "相对于原条件概率:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "$$\n", 270 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}}\n", 271 | "$$" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "将被改写为:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "$$\n", 286 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\log \\frac{1}{1 + \\exp(- \\mathrm{u^T_j v_i})} + \\sum^{K}_{k=1} \\log \\left( 1 - \\frac{1}{1 + \\exp(- \\mathrm{u^T_k v_i})} \\right)\n", 287 | "$$" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "即筛选出K个不在滑窗内的词向量,直观地理解是希望中心词尽可能地不预测出这些采样出的词,筛选出某个词的概率由这个公式决定:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "$$\n", 302 | "\\mathrm{P(w_i)} = \\frac{f(w_i)^{\\frac{3}{4}}}{\\sum^{V}_{k=1}f(w_k)^{\\frac{3}{4}}}\n", 303 | "$$" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "其中,$f(w_i)$是这个单词在上下文中出现的频率。" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## 结果" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "通过这种词嵌入模型训练出的词向量能较好的表示两个相近意思的词的近似程度。" 325 | ] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.5.4" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /note/A3C.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A3C" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A3C是Asynchronous Advantage Actor-Critic Model的简称,即异步优势演员-评论家模型,A3C并不是一种像Policy Gradient或DQN这样具体的算法,而是一种解决问题的思想,它的核心精神是,在强化学习的训练过程中,我们可以并行地训练多个Agent,在训练的过程中,各个Agent是参数共享的。更具体一些,我们可能有N个Agent并行地在环境采样1回合后计算1次梯度,我们还有1个或多个Agent在这个过程中什么都不做,当N个Agent中的1个或者M个或者N个采样完成,并反向传播或者基于时间的反向传播计算完1回合的梯度后,这些Agent会将梯度异步地分发给那1个或者多个什么都不做的Agent,然后这些什么都不做的Agent执行一次参数更新,再将更新后的参数分发给这个分发梯度的Agent,然后一直重复这个过程。当然分发梯度和分发参数这个过程是否是异步或者同步,也是可以大做文章的。" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# A3C with TensorFlow" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "那么具体要怎么操作呢?这里以TensorFlow为框架实现了一个DPPO,即Distributed Proximal Policy Optimization,分布式近端策略优化模型。这个和A3C有什么关系呢?在上文提到,A3C并不是一个具体的算法,它的核心精神是一套异步训练模型、同步或者异步更新参数的思想。或者换句话说,不管是DQN、Policy Gradient、PPO、ACER,还是基于它们的一系列改进,我们都可以用A3C的思想去改进它们。好在TensorFlow已经为我们做了大部分的底层工作,我们只需要几十行代码,就可以把一个单进程的训练过程改进为分布式训练过程。" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Distributed TensorFlow" 36 | ] 37 | }, 38 | { 39 | 
"cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "为了避免让文章沦为文档翻译,所以这里仅仅对分布式TensorFlow做非常简短的说明,详细的文档可以参考:\n", 43 | "> [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "首先,需要构造集群描述对象让集群待命,可以通过如下方法构造集群描述对象:\n", 51 | "```\n", 52 | "cluster = tf.train.ClusterSpec({\n", 53 | " 'worker': [\n", 54 | " 'localhost:8001',\n", 55 | " 'localhost:8002',\n", 56 | " 'localhost:8003',\n", 57 | " ],\n", 58 | " 'ps': [\n", 59 | " 'localhost:8000'\n", 60 | " ]\n", 61 | "})\n", 62 | "```\n", 63 | "可以看出,集群描述对象是一个键为job_name(任务名),值为ip:port的字典,至于 job_name 的定义稍后会做解释。然后通过如下语句启动集群中的一个节点,并让节点待命:\n", 64 | "```\n", 65 | "server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 66 | "if role == 'ps':\n", 67 | " logging.warning('Parameter server started.')\n", 68 | " server.join()\n", 69 | "else:\n", 70 | " pass\n", 71 | " # do some sth later.\n", 72 | "```\n", 73 | "至此,一个节点就被启动并待命了,可以看到一个节点会被抽象为一个server对象,其中job_name对应了节点的任务名,ps是Parameter Server,即参数服务器,worker即计算服务器,它们的用途会在下文提到。根据集群中的每个节点是否会完整地构建自己的计算图,TensorFlow提供了两种方案,分别是 In-graph replication 和 Between-graph replication,每个节点是否会构建自己的计算图,也决定了每个节点的工作方式。" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### In-graph replication" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "在这种方案中,集群中的每一个节点不会完整地构建自己的计算图,每一个节点仅仅是单纯地利用自己的算力通过以下的语句执行任务:\n", 88 | "```\n", 89 | "with tf.device(\"/job:ps/task:0\"):\n", 90 | " # Define vars.\n", 91 | " \n", 92 | "with tf.device(\"/job:worker/task:0\"):\n", 93 | " # Do computations.\n", 94 | "```\n", 95 | "通常,只需要提前启动集群,然后构造一个Session,然后根据节点分配计算图中的各个结点,然后进行训练就可以了,非常地直觉。这样做有一个缺点是数据会在各个结点分发,如果数据非常大,这样是得不偿失的。在下文实现的DPPO中,我们将不会采用这套方案。" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Between-graph replication" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "与In-graph replication不同的是,集群中的每一个节点会完整地构建自己的计算图,可以说这种方案就是为了A3C而设计的,在这种方案中,我们会有一个或者多个参数节点(Parameters Server),多个计算节点(Worker Server),每个计算节点完成梯度计算后,会异步地将梯度分发到参数节点,然后参数节点会同步或者异步地用梯度更新参数,然后分发最新的参数到一个或者多个计算节点。" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Parameters Server & Worker Server" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "ps,即参数节点,在Between-graph replication的方案中,它通常什么都不做,节点启动后即调用`join()`待命,worker,即计算节点,在Between-graph replication方案中,这些节点定义了完整的计算图并执行这些计算,在计算节点完成一次梯度计算后,梯度会被异步分发给参数节点,参数节点更新参数后,分发参数给计算节点。这个过程可以既可以是异步的也可以是同步的,在Between-graph replication方案中,默认是异步的。" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# PPO" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "在前一篇文章中已经实现了一个PPO,学习笔记:\n", 138 | "> [PPO Note](https://github.com/Ceruleanacg/Learning-Notes/blob/master/note/PPO.ipynb) " 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "源码:\n", 146 | "> [PPO Code](https://github.com/Ceruleanacg/Learning-Notes/blob/master/playground/PPO.py)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# DPPO in Action" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | 
"首先实现一个方法,它用来启动集群的各个节点,并根据节点类型待命或者定义并执行计算图:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 1, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 173 | " return f(*args, **kwds)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# coding=utf-8\n", 179 | "\n", 180 | "import sys\n", 181 | "sys.path.append('..')\n", 182 | "\n", 183 | "import multiprocessing as mp\n", 184 | "import tensorflow as tf\n", 185 | "import logging\n", 186 | "import gym\n", 187 | "\n", 188 | "from base.model import *\n", 189 | "from playground import PPO\n", 190 | "from utility.launcher import start_game\n", 191 | "\n", 192 | "\n", 193 | "def start_a3c(cluster, role, task_index):\n", 194 | " # 根据集群描述对象启动节点\n", 195 | " server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 196 | " if role == 'ps':\n", 197 | " # 如果是参数节点,则join待命\n", 198 | " logging.warning('Parameter server started.')\n", 199 | " server.join()\n", 200 | " else:\n", 201 | " # 如果是计算节点,定义计算图,计算梯度\n", 202 | " worker_device = \"/job:worker/task:{}\".format(task_index)\n", 203 | " logging.warning('Worker: {}, server stated.'.format(worker_device))\n", 204 | " # 根据集群描述对象分配节点\n", 205 | " with tf.device(tf.train.replica_device_setter(cluster=cluster)):\n", 206 | " # Make env.\n", 207 | " env = gym.make('CartPole-v0')\n", 208 | " env.seed(1)\n", 209 | " env = env.unwrapped\n", 210 | " # Init session.\n", 211 | " session = tf.Session(server.target)\n", 212 | " # session = tf.Session()\n", 213 | " # Init agent.\n", 214 | " agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 215 | " KEY_SESSION: session,\n", 216 | " KEY_MODEL_NAME: 'PPO',\n", 217 | " KEY_TRAIN_EPISODE: 1000\n", 218 | " })\n", 219 | " start_game(env, agent, task_index)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "然后定义集群描述对象:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 2, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "cluster = tf.train.ClusterSpec({\n", 236 | " 'worker': [\n", 237 | " 'localhost:8001',\n", 238 | " 'localhost:8002',\n", 239 | " 'localhost:8003',\n", 240 | " ],\n", 241 | " 'ps': [\n", 242 | " 'localhost:8000'\n", 243 | " ]\n", 244 | " })\n", 245 | "\n", 246 | "role_task_index_map = [\n", 247 | " ('ps', 0),\n", 248 | " ('worker', 0),\n", 249 | " ('worker', 1),\n", 250 | " ('worker', 2),\n", 251 | "]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "启动A3C并训练:" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "pool = mp.Pool(processes=4)\n", 268 | "\n", 269 | "for role, task_index in role_task_index_map:\n", 270 | " pool.apply_async(start_a3c, args=(cluster, role, task_index, ))\n", 271 | "pool.close()\n", 272 | "pool.join()" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | 
"name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.5.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/A3C-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A3C" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A3C是Asynchronous Advantage Actor-Critic Model的简称,即异步优势演员-评论家模型,A3C并不是一种像Policy Gradient或DQN这样具体的算法,而是一种解决问题的思想,它的核心精神是,在强化学习的训练过程中,我们可以并行地训练多个Agent,在训练的过程中,各个Agent是参数共享的。更具体一些,我们可能有N个Agent并行地在环境采样1回合后计算1次梯度,我们还有1个或多个Agent在这个过程中什么都不做,当N个Agent中的1个或者M个或者N个采样完成,并反向传播或者基于时间的反向传播计算完1回合的梯度后,这些Agent会将梯度异步地分发给那1个或者多个什么都不做的Agent,然后这些什么都不做的Agent执行一次参数更新,再将更新后的参数分发给这个分发梯度的Agent,然后一直重复这个过程。当然分发梯度和分发参数这个过程是否是异步或者同步,也是可以大做文章的。" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# A3C with TensorFlow" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "那么具体要怎么操作呢?这里以TensorFlow为框架实现了一个DPPO,即Distributed Proximal Policy Optimization,分布式近端策略优化模型。这个和A3C有什么关系呢?在上文提到,A3C并不是一个具体的算法,它的核心精神是一套异步训练模型、同步或者异步更新参数的思想。或者换句话说,不管是DQN、Policy Gradient、PPO、ACER,还是基于它们的一系列改进,我们都可以用A3C的思想去改进它们。好在TensorFlow已经为我们做了大部分的底层工作,我们只需要几十行代码,就可以把一个单进程的训练过程改进为分布式训练过程。" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Distributed TensorFlow" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "为了避免让文章沦为文档翻译,所以这里仅仅对分布式TensorFlow做非常简短的说明,详细的文档可以参考:\n", 43 | "> [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "首先,需要构造集群描述对象让集群待命,可以通过如下方法构造集群描述对象:\n", 51 | "```\n", 52 | "cluster = tf.train.ClusterSpec({\n", 53 | " 'worker': [\n", 54 | " 'localhost:8001',\n", 55 | " 'localhost:8002',\n", 56 | " 'localhost:8003',\n", 57 | " ],\n", 58 | " 'ps': [\n", 59 | " 'localhost:8000'\n", 60 | " ]\n", 61 | "})\n", 62 | "```\n", 63 | "可以看出,集群描述对象是一个键为job_name(任务名),值为ip:port的字典,至于 job_name 的定义稍后会做解释。然后通过如下语句启动集群中的一个节点,并让节点待命:\n", 64 | "```\n", 65 | "server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 66 | "if role == 'ps':\n", 67 | " logging.warning('Parameter server started.')\n", 68 | " server.join()\n", 69 | "else:\n", 70 | " pass\n", 71 | " # do some sth later.\n", 72 | "```\n", 73 | "至此,一个节点就被启动并待命了,可以看到一个节点会被抽象为一个server对象,其中job_name对应了节点的任务名,ps是Parameter Server,即参数服务器,worker即计算服务器,它们的用途会在下文提到。根据集群中的每个节点是否会完整地构建自己的计算图,TensorFlow提供了两种方案,分别是 In-graph replication 和 Between-graph replication,每个节点是否会构建自己的计算图,也决定了每个节点的工作方式。" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### In-graph replication" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "在这种方案中,集群中的每一个节点不会完整地构建自己的计算图,每一个节点仅仅是单纯地利用自己的算力通过以下的语句执行任务:\n", 88 | "```\n", 89 | "with tf.device(\"/job:ps/task:0\"):\n", 90 | " # Define vars.\n", 91 | " \n", 92 | "with tf.device(\"/job:worker/task:0\"):\n", 93 | " # Do computations.\n", 94 | "```\n", 95 | "通常,只需要提前启动集群,然后构造一个Session,然后根据节点分配计算图中的各个结点,然后进行训练就可以了,非常地直觉。这样做有一个缺点是数据会在各个结点分发,如果数据非常大,这样是得不偿失的。在下文实现的DPPO中,我们将不会采用这套方案。" 96 | ] 97 | }, 98 | { 99 | 
"cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Between-graph replication" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "与In-graph replication不同的是,集群中的每一个节点会完整地构建自己的计算图,可以说这种方案就是为了A3C而设计的,在这种方案中,我们会有一个或者多个参数节点(Parameters Server),多个计算节点(Worker Server),每个计算节点完成梯度计算后,会异步地将梯度分发到参数节点,然后参数节点会同步或者异步地用梯度更新参数,然后分发最新的参数到一个或者多个计算节点。" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Parameters Server & Worker Server" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "ps,即参数节点,在Between-graph replication的方案中,它通常什么都不做,节点启动后即调用`join()`待命,worker,即计算节点,在Between-graph replication方案中,这些节点定义了完整的计算图并执行这些计算,在计算节点完成一次梯度计算后,梯度会被异步分发给参数节点,参数节点更新参数后,分发参数给计算节点。这个过程可以既可以是异步的也可以是同步的,在Between-graph replication方案中,默认是异步的。" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# PPO" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "在前一篇文章中已经实现了一个PPO,学习笔记:\n", 138 | "> [PPO Note](https://github.com/Ceruleanacg/Learning-Notes/blob/master/note/PPO.ipynb) " 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "源码:\n", 146 | "> [PPO Code](https://github.com/Ceruleanacg/Learning-Notes/blob/master/playground/PPO.py)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# DPPO in Action" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "首先实现一个方法,它用来启动集群的各个节点,并根据节点类型待命或者定义并执行计算图:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 1, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 173 | " return f(*args, **kwds)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# coding=utf-8\n", 179 | "\n", 180 | "import sys\n", 181 | "sys.path.append('..')\n", 182 | "\n", 183 | "import multiprocessing as mp\n", 184 | "import tensorflow as tf\n", 185 | "import logging\n", 186 | "import gym\n", 187 | "\n", 188 | "from base.model import *\n", 189 | "from playground import PPO\n", 190 | "from utility.launcher import start_game\n", 191 | "\n", 192 | "\n", 193 | "def start_a3c(cluster, role, task_index):\n", 194 | " # 根据集群描述对象启动节点\n", 195 | " server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 196 | " if role == 'ps':\n", 197 | " # 如果是参数节点,则join待命\n", 198 | " logging.warning('Parameter server started.')\n", 199 | " server.join()\n", 200 | " else:\n", 201 | " # 如果是计算节点,定义计算图,计算梯度\n", 202 | " worker_device = \"/job:worker/task:{}\".format(task_index)\n", 203 | " logging.warning('Worker: {}, server stated.'.format(worker_device))\n", 204 | " # 根据集群描述对象分配节点\n", 205 | " with tf.device(tf.train.replica_device_setter(cluster=cluster)):\n", 206 | " # Make env.\n", 207 | " env = gym.make('CartPole-v0')\n", 208 | " env.seed(1)\n", 209 | " env = env.unwrapped\n", 210 | " # Init session.\n", 211 | " session = tf.Session(server.target)\n", 212 | " # session = tf.Session()\n", 213 | " # Init agent.\n", 214 | " agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], 
**{\n", 215 | " KEY_SESSION: session,\n", 216 | " KEY_MODEL_NAME: 'PPO',\n", 217 | " KEY_TRAIN_EPISODE: 1000\n", 218 | " })\n", 219 | " start_game(env, agent, task_index)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "然后定义集群描述对象:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 2, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "cluster = tf.train.ClusterSpec({\n", 236 | " 'worker': [\n", 237 | " 'localhost:8001',\n", 238 | " 'localhost:8002',\n", 239 | " 'localhost:8003',\n", 240 | " ],\n", 241 | " 'ps': [\n", 242 | " 'localhost:8000'\n", 243 | " ]\n", 244 | " })\n", 245 | "\n", 246 | "role_task_index_map = [\n", 247 | " ('ps', 0),\n", 248 | " ('worker', 0),\n", 249 | " ('worker', 1),\n", 250 | " ('worker', 2),\n", 251 | "]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "启动A3C并训练:" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "pool = mp.Pool(processes=4)\n", 268 | "\n", 269 | "for role, task_index in role_task_index_map:\n", 270 | " pool.apply_async(start_a3c, args=(cluster, role, task_index, ))\n", 271 | "pool.close()\n", 272 | "pool.join()" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.5.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/Word2Vec-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Word2Vec" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 问题设定" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "对于One-hot的词向量:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "$$\n", 31 | "\\begin{aligned}\n", 32 | "I &= [1, 0, 0] \\\\\n", 33 | "Like &= [0, 1, 0] \\\\\n", 34 | "Apple &= [0, 0, 1] \n", 35 | "\\end{aligned}\n", 36 | "$$" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "无法通过两向量夹角余弦值计算其相似度,word2vec提供了Skip-Gram(跳字模型)与CBOW(连续词袋模型)两个词嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Skip-Gram" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "即跳字模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个中心词,预测滑窗内$m - 1$个背景词。即如果上下文是:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "$$\n", 65 | "I\\ eat\\ apple\\ every\\ day\n", 66 | "$$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "对每一个词进行One-hot编码:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "$$\n", 81 | 
"\\begin{aligned}\n", 82 | "I &= [1, 0, 0, 0, 0] \\\\ \n", 83 | "eat &= [0, 1, 0, 0, 0] \\\\\n", 84 | "apple &= [0, 0, 1, 0, 0] \\\\\n", 85 | "every &= [0, 0, 0, 1, 0] \\\\\n", 86 | "day &= [0, 0, 0, 0, 1]\n", 87 | "\\end{aligned}\n", 88 | "$$" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "设定滑窗大小为$2$,如果选择中心词$apple$,那么将会有以下训练数据:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "$$\n", 103 | "\\begin{aligned}\n", 104 | "x &= [0, 0, 1, 0, 0] \\\\ \n", 105 | "y &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]\n", 106 | "\\end{aligned}\n", 107 | "$$" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "设计一个只有1个输入层、1个隐藏层、1个输出层的神经网络,其中输出层的神经元个数等于输入层即等于One-hot编码的维度,而隐含层的神经元个数通常远小于输出层,比如One-hot维度如果是10000,隐含层可以只有300个神经元:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "我们通过最大化似然函数:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "$$\n", 129 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right)\n", 130 | "$$" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "即对于上下文内所有的词,给定中心词$w^i$,预测滑窗内其他词,越准确越好。对上式取对数并展开:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{aligned}\n", 146 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) &= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) \\\\\n", 147 | "&= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\left( \\frac{\\exp(\\mathrm{u^T_{i+j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{N}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}} \\right) \\\\\n", 148 | "\\end{aligned}\n", 149 | "$$" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "其中,$\\mathrm{v_i}$即是隐藏层的权重,也是隐藏层的输入$z_i$,也是第i个词的词向量,$\\mathrm{u_{i+j}}$是输出层的权重,也是第i+j个词的词向量的另一个表达。最大化上式的最大似然函数,即最小化下式交叉熵:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "$$\n", 164 | "- \\sum^{N}_{i=1} \\mathrm{y_i} \\cdot \\log \\mathrm{p_i}\n", 165 | "$$" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "其中$\\mathrm{y_i}$与$\\mathrm{p_i}$是维度为词表长度的向量,分别代表观测值与计算值,对$\\mathrm{v_i}$求梯度有:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "$$\n", 180 | "\\begin{aligned}\n", 181 | "\\frac {\\partial \\log \\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right)} {\\mathrm{v_i}} &= \\frac {\\partial \\log \\left( \\exp(\\mathrm{u^T_{j} \\cdot v_{i}} ) \\right) - \\log \\left ( \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})} \\right)}{\\partial \\mathrm{v_{i}}} \\\\\n", 182 | "&= \\mathrm{u_{j}} - \\frac{1}{\\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\left[ \\sum^{V}_{k=1} \\exp(\\mathrm{u^T_k v_i) \\cdot \\mathrm{u_k}} \\right] \\\\\n", 183 | "&= \\mathrm{u_{j}} - \\sum^{V}_{k=1} \\frac{ \\exp(\\mathrm{u^T_k v_i}) }{ \\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\cdot \\mathrm{u_k}\n", 184 | "\\end{aligned}\n", 185 | "$$" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | 
"然后使用梯度下降更新$\\mathrm{v_i}$,此处的$\\mathrm{v_i}$是向量,在网络中,即是输入层的第i个神经元到隐含层的权重。" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## CBOW" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "即Continuous Bag of Words,连续词袋模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个背景词,$m - 1$个中心词,与Skip-Gram相反,设定滑窗大小为$2$,如果选择中心词$\\ I,\\ eat,\\ every,\\ day$,那么将会有以下训练数据:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "$$\n", 214 | "\\begin{aligned}\n", 215 | "x &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1] \\\\ \n", 216 | "y &= [0, 0, 1, 0, 0] \\\\ \n", 217 | "\\end{aligned}\n", 218 | "$$" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "而对于概率:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "$$\n", 233 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^{i-m}, \\cdots, w^i, \\cdots, w^{i+m} \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m)}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m})}\n", 234 | "$$" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "与Skip-Gram的不同之处在于将中心词求和后平均,之后的梯度计算与更新和Skip-Gram相同,这里就不展开了。" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## 负采样" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "可以直观地从上面的梯度更新公式中看到,每一次更新都伴随着巨量的计算开销,这个计算开销主要是因为Softmax函数的分母。可以使用负采样替换Softmax,减少计算开销。" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "相对于原条件概率:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "$$\n", 270 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}}\n", 271 | "$$" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "将被改写为:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "$$\n", 286 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\log \\frac{1}{1 + \\exp(- \\mathrm{u^T_j v_i})} + \\sum^{K}_{k=1} \\log \\left( 1 - \\frac{1}{1 + \\exp(- \\mathrm{u^T_k v_i})} \\right)\n", 287 | "$$" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "即筛选出K个不在滑窗内的词向量,直观地理解是希望中心词尽可能地不预测出这些采样出的词,筛选出某个词的概率由这个公式决定:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "$$\n", 302 | "\\mathrm{P(w_i)} = \\frac{f(w_i)^{\\frac{3}{4}}}{\\sum^{V}_{k=1}f(w_k)^{\\frac{3}{4}}}\n", 303 | "$$" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "其中,$f(w_i)$是这个单词在上下文中出现的频率。" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## 结果" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "通过这种词嵌入模型训练出的词向量能较好的表示两个相近意思的词的近似程度。" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": 
"python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.6.4" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 1 354 | } 355 | -------------------------------------------------------------------------------- /deprecated/main.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import tensorflow as tf 3 | import numpy as np 4 | import gym, time 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | UPDATE_GLOBAL_ITER = 10 9 | GAMMA = 0.9 10 | ENTROPY_BETA = 0.001 11 | LR_A = 0.001 # learning rate for actor 12 | LR_C = 0.001 # learning rate for critic 13 | 14 | env = gym.make('CartPole-v0') 15 | N_S = env.observation_space.shape[0] 16 | N_A = env.action_space.n 17 | 18 | 19 | class ACNet(object): 20 | sess = None 21 | 22 | def __init__(self, scope, opt_a=None, opt_c=None, global_net=None): 23 | if scope == 'global_net': # get global network 24 | with tf.variable_scope(scope): 25 | self.s = tf.placeholder(tf.float32, [None, N_S], 'S') 26 | self.a_params, self.c_params = self._build_net(scope)[-2:] 27 | else: 28 | with tf.variable_scope(scope): 29 | self.s = tf.placeholder(tf.float32, [None, N_S], 'S') 30 | self.a_his = tf.placeholder(tf.int32, [None, ], 'A') 31 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') 32 | 33 | self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope) 34 | 35 | td = tf.subtract(self.v_target, self.v, name='TD_error') 36 | with tf.name_scope('c_loss'): 37 | self.c_loss = tf.reduce_mean(tf.square(td)) 38 | 39 | with tf.name_scope('a_loss'): 40 | log_prob = tf.reduce_sum( 41 | tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), 42 | axis=1, keep_dims=True) 43 | exp_v = log_prob * tf.stop_gradient(td) 44 | entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5), 45 | axis=1, keep_dims=True) # encourage exploration 46 | self.exp_v = ENTROPY_BETA * entropy + exp_v 47 | self.a_loss = tf.reduce_mean(-self.exp_v) 48 | 49 | with tf.name_scope('local_grad'): 50 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 51 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 52 | 53 | self.global_step = tf.train.get_or_create_global_step() 54 | with tf.name_scope('sync'): 55 | with tf.name_scope('pull'): 56 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, global_net.a_params)] 57 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, global_net.c_params)] 58 | with tf.name_scope('push'): 59 | self.update_a_op = opt_a.apply_gradients(zip(self.a_grads, global_net.a_params), global_step=self.global_step) 60 | self.update_c_op = opt_c.apply_gradients(zip(self.c_grads, global_net.c_params)) 61 | 62 | def _build_net(self, scope): 63 | w_init = tf.random_normal_initializer(0., .1) 64 | with tf.variable_scope('actor'): 65 | l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') 66 | a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap') 67 | with tf.variable_scope('critic'): 68 | l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') 69 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 70 | a_params = 
tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 71 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 72 | return a_prob, v, a_params, c_params 73 | 74 | def choose_action(self, s): # run by a local 75 | prob_weights = self.sess.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) 76 | action = np.random.choice(range(prob_weights.shape[1]), 77 | p=prob_weights.ravel()) # select action w.r.t the actions prob 78 | return action 79 | 80 | def update_global(self, feed_dict): # run by a local 81 | self.sess.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net 82 | 83 | def pull_global(self): # run by a local 84 | self.sess.run([self.pull_a_params_op, self.pull_c_params_op]) 85 | 86 | 87 | def work(job_name, task_index, global_ep, lock, r_queue, global_running_r): 88 | # set work's ip:port 89 | cluster = tf.train.ClusterSpec({ 90 | "ps": ['localhost:2220', 'localhost:2221',], 91 | "worker": ['localhost:2222', 'localhost:2223', 'localhost:2224', 'localhost:2225',] 92 | }) 93 | server = tf.train.Server(cluster, job_name=job_name, task_index=task_index) 94 | if job_name == 'ps': 95 | print('Start Parameter Sever: ', task_index) 96 | server.join() 97 | else: 98 | t1 = time.time() 99 | env = gym.make('CartPole-v0').unwrapped 100 | print('Start Worker: ', task_index) 101 | with tf.device(tf.train.replica_device_setter( 102 | worker_device="/job:worker/task:%d" % task_index, 103 | cluster=cluster)): 104 | opt_a = tf.train.RMSPropOptimizer(LR_A, name='opt_a') 105 | opt_c = tf.train.RMSPropOptimizer(LR_C, name='opt_c') 106 | global_net = ACNet('global_net') 107 | 108 | local_net = ACNet('local_ac%d' % task_index, opt_a, opt_c, global_net) 109 | # set training steps 110 | hooks = [tf.train.StopAtStepHook(last_step=100000)] 111 | with tf.train.MonitoredTrainingSession(master=server.target, 112 | is_chief=True, 113 | hooks=hooks,) as sess: 114 | print('Start Worker Session: ', task_index) 115 | local_net.sess = sess 116 | total_step = 1 117 | buffer_s, buffer_a, buffer_r = [], [], [] 118 | while (not sess.should_stop()) and (global_ep.value < 1000): 119 | s = env.reset() 120 | ep_r = 0 121 | while True: 122 | # if task_index: 123 | # env.render() 124 | a = local_net.choose_action(s) 125 | s_, r, done, info = env.step(a) 126 | if done: r = -5. 
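                    # Reward shaping (a choice made by this script, not part of CartPole itself):
                    # the environment's native reward is +1 per surviving step, so the terminal
                    # step is overwritten with -5 to strongly penalise dropping the pole.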
127 | ep_r += r 128 | buffer_s.append(s) 129 | buffer_a.append(a) 130 | buffer_r.append(r) 131 | 132 | if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net 133 | if done: 134 | v_s_ = 0 # terminal 135 | else: 136 | v_s_ = sess.run(local_net.v, {local_net.s: s_[np.newaxis, :]})[0, 0] 137 | buffer_v_target = [] 138 | for r in buffer_r[::-1]: # reverse buffer r 139 | v_s_ = r + GAMMA * v_s_ 140 | buffer_v_target.append(v_s_) 141 | buffer_v_target.reverse() 142 | 143 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack( 144 | buffer_v_target) 145 | feed_dict = { 146 | local_net.s: buffer_s, 147 | local_net.a_his: buffer_a, 148 | local_net.v_target: buffer_v_target, 149 | } 150 | local_net.update_global(feed_dict) 151 | buffer_s, buffer_a, buffer_r = [], [], [] 152 | local_net.pull_global() 153 | s = s_ 154 | total_step += 1 155 | if done: 156 | if r_queue.empty(): # record running episode reward 157 | global_running_r.value = ep_r 158 | else: 159 | global_running_r.value = .99 * global_running_r.value + 0.01 * ep_r 160 | r_queue.put(global_running_r.value) 161 | 162 | print( 163 | "Task: %i" % task_index, 164 | "| Ep: %i" % global_ep.value, 165 | "| Ep_r: %i" % global_running_r.value, 166 | "| Global_step: %i" % sess.run(local_net.global_step), 167 | ) 168 | with lock: 169 | global_ep.value += 1 170 | break 171 | 172 | print('Worker Done: ', task_index, time.time()-t1) 173 | 174 | 175 | if __name__ == "__main__": 176 | # use multiprocessing to create a local cluster with 2 parameter servers and 2 workers 177 | global_ep = mp.Value('i', 0) 178 | lock = mp.Lock() 179 | r_queue = mp.Queue() 180 | global_running_r = mp.Value('d', 0) 181 | 182 | jobs = [ 183 | ('ps', 0), ('ps', 1), 184 | ('worker', 0), ('worker', 1), ('worker', 2), ('worker', 3) 185 | ] 186 | ps = [mp.Process(target=work, args=(j, i, global_ep, lock, r_queue, global_running_r), ) for j, i in jobs] 187 | [p.start() for p in ps] 188 | [p.join() for p in ps[2:]] 189 | 190 | ep_r = [] 191 | while not r_queue.empty(): 192 | ep_r.append(r_queue.get()) 193 | plt.plot(np.arange(len(ep_r)), ep_r) 194 | plt.title('Distributed training') 195 | plt.xlabel('Step') 196 | plt.ylabel('Total moving reward') 197 | plt.show() 198 | -------------------------------------------------------------------------------- /ann/Dense.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | 5 | from static import CKPT_DIR 6 | from utility import function 7 | from utility.logger import generate_model_logger 8 | 9 | 10 | class Dense(object): 11 | 12 | def __init__(self, x_space, y_space, hidden_units_list, **options): 13 | 14 | # Init x space, y space. 15 | self.x_space = x_space 16 | self.y_space = y_space 17 | 18 | # Init layer & neuron info. 19 | self.hidden_units_list = hidden_units_list 20 | self.hidden_layer_count = len(hidden_units_list) 21 | self.total_layer_count = self.hidden_layer_count + 1 22 | 23 | # Init weights, biases. 24 | self.weights, self.biases = {}, {} 25 | 26 | # Init a, z, outputs caches. 27 | self.z_outputs, self.z_inputs = {}, {} 28 | 29 | # Init deltas caches. 30 | self.deltas = {} 31 | 32 | self._validate_parameters() 33 | self._init_func_map() 34 | self._init_options(options) 35 | self._init_weights_and_biases() 36 | 37 | def _init_weights_and_biases(self): 38 | # Hidden Layer. 
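        # Shape convention used throughout this class: weights[l] has shape
        # (units_of_layer_l, units_of_previous_layer) and biases[l] has shape
        # (units_of_layer_l, 1), so the forward pass computes z = x . W^T + b^T.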
39 | for index, hidden_units in enumerate(self.hidden_units_list): 40 | # x_space is the shape of last layer, and the shape of weight of current layer. 41 | x_space = self.x_space if index == 0 else self.hidden_units_list[index - 1] 42 | # hidden_units is shape of current layer, also neuron count. 43 | weights, biases = np.random.normal(0, 0.01, (hidden_units, x_space)), np.zeros((hidden_units, 1)) 44 | self.weights[index], self.biases[index] = weights, biases 45 | # Output Layer. 46 | x_space = self.hidden_units_list[-1] 47 | weights, biases = np.random.normal(0, 0.01, (self.y_space, x_space)), np.zeros((self.y_space, 1)) 48 | self.weights[self.total_layer_count - 1], self.biases[self.total_layer_count - 1] = weights, biases 49 | 50 | def _validate_parameters(self): 51 | if self.hidden_layer_count == 0 or len(self.hidden_units_list) == 0: 52 | raise ValueError('Layer count or neuron count list cannot be zero.') 53 | if self.hidden_layer_count != len(self.hidden_units_list): 54 | raise ValueError('Layer count should be equal to length of neuron count list.') 55 | 56 | def _init_func_map(self): 57 | # Init Activation Func and Grad Map. 58 | self.activation_grad_map = { 59 | function.relu: np.vectorize(function.grad_relu), 60 | function.tanh: np.vectorize(function.grad_tanh), 61 | function.linear: np.vectorize(function.grad_linear), 62 | function.sigmoid: np.vectorize(function.grad_sigmoid), 63 | } 64 | self.grad_loss_map = { 65 | function.softmax_cross_entropy: function.grad_softmax_cross_entropy, 66 | function.mean_square_error: function.grad_mean_square_error 67 | } 68 | 69 | def _init_options(self, options): 70 | 71 | try: 72 | self.model_name = options['model_name'] 73 | except KeyError: 74 | self.model_name = 'model' 75 | finally: 76 | if not isinstance(self.model_name, str): 77 | raise ValueError('Model name must be a str.') 78 | 79 | try: 80 | self.mode = options['mode'] 81 | except KeyError: 82 | self.mode = 'train' 83 | 84 | # Init Activation Func and Grad Func. 85 | try: 86 | self.activation_funcs = options['activation_funcs'] 87 | except KeyError: 88 | self.activation_funcs = [function.tanh] * self.hidden_layer_count 89 | self.activation_funcs.append(function.linear) 90 | finally: 91 | if len(self.activation_funcs) != self.total_layer_count: 92 | raise ValueError('Activation func count should be equal to total layer count.') 93 | 94 | try: 95 | self.grad_activation_funcs = [self.activation_grad_map[act_func] for act_func in self.activation_funcs] 96 | self.activation_funcs = [np.vectorize(act_func) for act_func in self.activation_funcs] 97 | except KeyError: 98 | raise KeyError('Grad func not exists.') 99 | 100 | try: 101 | self.loss_func = options['loss_func'] 102 | except KeyError: 103 | self.loss_func = function.mean_square_error 104 | finally: 105 | self.grad_func = self.grad_loss_map[self.loss_func] 106 | # Enable softmax. 107 | if self.grad_func == self.grad_loss_map[function.softmax_cross_entropy]: 108 | self.enable_softmax = True 109 | else: 110 | self.enable_softmax = False 111 | 112 | # Init Batch Size. 113 | try: 114 | self.batch_size = options['batch_size'] 115 | except KeyError: 116 | self.batch_size = 16 117 | finally: 118 | if self.batch_size < 1: 119 | raise ValueError('Batch size must larger than 1.') 120 | 121 | # Init Learning Rate. 
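        # Note: the check below only rejects negative values, so a learning rate of
        # exactly 0.0 is accepted and simply leaves the weights unchanged.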
122 | try: 123 | self.learning_rate = options['learning_rate'] 124 | except KeyError: 125 | self.learning_rate = 0.003 126 | finally: 127 | if self.learning_rate < 0.0: 128 | raise ValueError('Learning rate must be positive.') 129 | 130 | try: 131 | self.max_epoch = options['max_epoch'] 132 | except KeyError: 133 | self.max_epoch = 3000 134 | finally: 135 | if self.max_epoch < 1: 136 | raise ValueError('Epoch must be larger than 1.') 137 | 138 | try: 139 | self.enable_logger = options['enable_logger'] 140 | except KeyError: 141 | self.enable_logger = True 142 | finally: 143 | if self.enable_logger: 144 | self.logger = generate_model_logger(self.model_name) 145 | 146 | self.history_loss = [] 147 | 148 | def _forward(self, input_batch): 149 | # Temporal result, a_batch. 150 | z_input = input_batch 151 | # Forward layer by layer. 152 | for layer_index in range(self.total_layer_count): 153 | # Get weights and biases. 154 | weights, biases = self.weights[layer_index], self.biases[layer_index] 155 | # Save result as grad w. 156 | self.z_inputs[layer_index] = z_input 157 | z_output = np.dot(z_input, weights.T) + biases.T 158 | # Save result of a for backward. 159 | self.z_outputs[layer_index] = z_output 160 | # z_input is also called a_output. 161 | z_input = self.activation_funcs[layer_index](z_output) 162 | return z_input 163 | 164 | def _backward(self, error): 165 | # error here is shape of (batch_size, y_space) 166 | for index in np.arange(0, self.total_layer_count)[::-1]: 167 | # dl/dw = dz/dw * da/dz * (dl/da) | x = x_batch. 168 | z_outputs = self.z_outputs[index] 169 | # Get grad of activation func. 170 | grad_activation_func = self.grad_activation_funcs[index] 171 | # Calculate da/dz. 172 | grad_z_batch = grad_activation_func(z_outputs) 173 | # Calculate dl/da * da/dz. 174 | delta = error * grad_z_batch 175 | # Save delta. 176 | self.deltas[index] = delta 177 | # Update error, dz/da 178 | error = np.dot(delta, self.weights[index]) 179 | 180 | def _update_weights_and_biases(self): 181 | for index in range(self.total_layer_count): 182 | # Get z_input and delta. 183 | z_input, delta = self.z_inputs[index], self.deltas[index] 184 | # Calculate grad weights, grad biases, dl/da * da/dz * dz/dw 185 | grad_weights = -np.dot(delta.T, z_input) 186 | grad_biases = -np.mean(delta, axis=0).reshape(self.biases[index].shape) 187 | # Update weights, biases. 188 | self.weights[index] -= self.learning_rate * grad_weights 189 | self.biases[index] -= self.learning_rate * grad_biases 190 | 191 | def train(self, x_data, y_data): 192 | iteration, epoch, x_data_count = 0, 0, len(x_data) 193 | while epoch < self.max_epoch: 194 | s_index, e_index, epoch_loss = 0, self.batch_size, [] 195 | while True: 196 | # Generate batch x, y 197 | x_batch, y_batch = x_data[s_index: e_index], y_data[s_index: e_index] 198 | # Calculate y_predict. 199 | y_predict = self._forward(x_batch) 200 | # Calculate loss. 201 | loss = self.loss_func(y_predict, y_batch) 202 | epoch_loss.append(loss) 203 | # Calculate error. 204 | error = self.grad_func(y_predict, y_batch) 205 | # Bp & Update. 206 | self._backward(error) 207 | self._update_weights_and_biases() 208 | # Update index. 209 | s_index += self.batch_size 210 | e_index = s_index + self.batch_size 211 | # Add iteration. 
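            # (also decide whether the epoch is finished; a trailing partial batch smaller
            # than batch_size is skipped, since the loop breaks once e_index passes the data)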
212 | iteration += 1 213 | if e_index > len(x_data): 214 | mean_epoch_loss = np.mean(epoch_loss) 215 | self.history_loss.append(mean_epoch_loss) 216 | break 217 | if epoch % 100 == 0: 218 | self.save() 219 | self.evaluate(x_data, y_data) 220 | self.logger.warning("Epoch: {:d} | loss: {:.6f}".format(epoch, mean_epoch_loss)) 221 | epoch += 1 222 | 223 | def predict(self, x_batch): 224 | if self.enable_softmax: 225 | result = function.softmax(self._forward(x_batch)) 226 | else: 227 | result = self._forward(x_batch) 228 | return result 229 | 230 | def evaluate(self, x_data, y_data): 231 | y_label, y_output = np.argmax(y_data, axis=1), np.argmax(self.predict(x_data), axis=1) 232 | self.logger.warning("Accuracy: {:.3f} ".format(np.sum(y_label == y_output) / len(x_data))) 233 | 234 | def save(self): 235 | save_dir = os.path.join(CKPT_DIR, self.model_name) 236 | if not os.path.exists(save_dir): 237 | os.makedirs(save_dir) 238 | with open(os.path.join(save_dir, 'weights.json'), 'w') as fp: 239 | weights = [weights.tolist() for weights in self.weights.values()] 240 | json.dump(weights, fp, indent=True) 241 | with open(os.path.join(save_dir, 'biases.json'), 'w') as fp: 242 | biases = [biases.tolist() for biases in self.biases.values()] 243 | json.dump(biases, fp, indent=True) 244 | self.logger.warning("Model saved.") 245 | 246 | def restore(self): 247 | save_dir = os.path.join(CKPT_DIR, self.model_name) 248 | try: 249 | with open(os.path.join(save_dir, 'weights.json'), 'r') as fp: 250 | weights = json.load(fp) 251 | for index in range(self.total_layer_count): 252 | self.weights[index] = np.array(weights[index]) 253 | except FileNotFoundError: 254 | raise FileNotFoundError('Weights not exists.') 255 | 256 | try: 257 | with open(os.path.join(save_dir, 'biases.json'), 'r') as fp: 258 | biases = json.load(fp) 259 | for index in range(self.total_layer_count): 260 | self.biases[index] = np.array(biases[index]) 261 | except FileNotFoundError: 262 | raise FileNotFoundError('biases not exists.') 263 | 264 | self.logger.warning("Model restored.") 265 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/DoubleDQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Double DQN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 背景" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Double DQN是DQN(Deep Q Network)的一种改进,旨在解决DQN训练过程中存在的过估计(Overestimating)问题。在训练过程中,与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个,从而一定程度上避免了过度估计,提高了训练DQN的稳定性和速度。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## DQN" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "即Deep Q Network,DQN有两个网络,分别是预测网络(Predict Q Network)和目标网络(Target Q Network),预测网络用来预测当前状态对应各个动作的Q值,目标网络用来预测下一个,或者下第几个状态各个动作的Q值,这个取决于训练过程采用时间差分(Temporal Difference)还是蒙特卡洛(MC)方法,以TD的训练过程为例,我们期望对采样过的每一个状态、动作、奖励元组最小化下式:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "$$\n", 43 | "\\left( Q \\left( s_j, a_j; \\theta \\right) - y_j \\right)^2\n", 44 | "$$" 45 | ] 46 | }, 47 | { 48 | 
"cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "其中Q对应的θ即预测网络,它接受当前状态,输出一个当前状态对应各个动作的Q值,然后选取当前动作对应的那个Q值。$y_j$是Ground Truth标签,它是由目标网络计算得出:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "$$\n", 59 | "y_j=\n", 60 | "\\begin{cases}\n", 61 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 62 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( s_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 63 | "\\end{cases}\n", 64 | "$$" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "其中Q对应的θ-即是目标网络,当采取这个动作后回合结束,则标签即是这次动作产生的奖励,如果回合未结束,则标签将由两部分构成,第一部分即是这次动作产生的奖励,另一部分则是由目标网络计算,即计算下一个状态各个动作对应的Q值,然后选取最大的那个Q值。" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "需要注意的是,目标网络的参数是设定是不可train的,在训练经过M次后,我们会将预测网络被更新的全部参数复制给目标网络,其中M次的M是一个可调的超参数,这样的一个直觉的好处就是避免了震荡。" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Experience Replay" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "在实作中,On-Policy的DQN表现非常不稳定,一种改进被称之为经验回放(Experience Replay)的技术通过缓存每一步状态、动作、奖励、下一状态元组,在一回合结束后批量训练多次,将On-Policy的过程转化为Off-Policy,提高了DQN的训练速度和稳定性,具体的实现非常直觉,即维护一个指定大小的缓存数组,每回合用新产生的N个状态、动作、奖励、下一状态元组随机替换掉缓存池中现有的N个,然后再回合结束后做数次训练。" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Double DQN" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "正如背景中提到的:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "> 与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个。\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Double DQN与DQN相同的是他们都有被称之为预测网络与目标网络的两个网络,只是在实作过程中,标签的计算过程做了修正:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "$$\n", 128 | "y_j=\n", 129 | "\\begin{cases}\n", 130 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 131 | "r_j + \\gamma Q \\left( s_{j+1}, \\max_{a^{\\prime}} Q \\left (s_{j+1}, a^{\\prime}; \\theta\\right) ; \\theta^{-} \\right)& \\text{otherwise}\n", 132 | "\\end{cases}\n", 133 | "$$\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "可以看出这个修正非常地直觉,在实验中,也确实要比原始的DQN训练稳定。" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Experiment" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 160 | " return f(*args, **kwds)\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# coding=utf-8\n", 166 | "\n", 167 | "import numpy as np\n", 168 | "import gym\n", 169 | "\n", 170 | "import sys\n", 171 | "sys.path.append('..')\n", 172 | "\n", 173 | "from base.model import *\n", 174 | "from utility.launcher import start_game\n", 175 | "\n", 176 | "\n", 177 | "class Agent(BaseRLModel):\n", 178 | 
"\n", 179 | " def __init__(self, a_space, s_space, **options):\n", 180 | " super(Agent, self).__init__(a_space, s_space, **options)\n", 181 | "\n", 182 | " self._init_input()\n", 183 | " self._init_nn()\n", 184 | " self._init_op()\n", 185 | " self._init_saver()\n", 186 | "\n", 187 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 188 | " self.buffer_count = 0\n", 189 | "\n", 190 | " self.update_target_net_step = 200\n", 191 | "\n", 192 | " self.session.run(tf.global_variables_initializer())\n", 193 | "\n", 194 | " def _init_input(self, *args):\n", 195 | " with tf.variable_scope('input'):\n", 196 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 197 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 198 | " self.q_n = tf.placeholder(tf.float32, [None, ])\n", 199 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 200 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 201 | "\n", 202 | " def _init_nn(self, *args):\n", 203 | " # w,b initializer\n", 204 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 205 | " b_initializer = tf.constant_initializer(0.1)\n", 206 | "\n", 207 | " with tf.variable_scope('predict_q_net'):\n", 208 | " phi_state = tf.layers.dense(self.s,\n", 209 | " 64,\n", 210 | " tf.nn.relu,\n", 211 | " kernel_initializer=w_initializer,\n", 212 | " bias_initializer=b_initializer)\n", 213 | "\n", 214 | " self.q_predict = tf.layers.dense(phi_state,\n", 215 | " self.a_space,\n", 216 | " kernel_initializer=w_initializer,\n", 217 | " bias_initializer=b_initializer)\n", 218 | "\n", 219 | " with tf.variable_scope('target_q_net'):\n", 220 | " phi_state_next = tf.layers.dense(self.s_n,\n", 221 | " 64,\n", 222 | " tf.nn.relu,\n", 223 | " kernel_initializer=w_initializer,\n", 224 | " bias_initializer=b_initializer,\n", 225 | " trainable=False)\n", 226 | "\n", 227 | " self.q_target = tf.layers.dense(phi_state_next,\n", 228 | " self.a_space,\n", 229 | " kernel_initializer=w_initializer,\n", 230 | " bias_initializer=b_initializer,\n", 231 | " trainable=False)\n", 232 | "\n", 233 | " def _init_op(self):\n", 234 | "\n", 235 | " with tf.variable_scope('q_predict'):\n", 236 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 237 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 238 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 239 | "\n", 240 | " with tf.variable_scope('loss'):\n", 241 | " self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval)\n", 242 | "\n", 243 | " with tf.variable_scope('train'):\n", 244 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 245 | "\n", 246 | " with tf.variable_scope('update_target_net'):\n", 247 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 248 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 249 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 250 | "\n", 251 | " def predict(self, s):\n", 252 | " if np.random.uniform() < self.epsilon or self.mode == 'test':\n", 253 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 254 | " else:\n", 255 | " a = np.random.randint(0, self.a_space)\n", 256 | " return a\n", 257 | "\n", 258 | " def snapshot(self, s, a, r, s_n):\n", 259 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 260 | " self.buffer_count += 
1\n", 261 | "\n", 262 | " def train(self):\n", 263 | "\n", 264 | " for train_step in range(self.train_steps):\n", 265 | " # Update target net if need.\n", 266 | " if self.training_step % self.update_target_net_step == 0:\n", 267 | " self.session.run(self.update_q_net)\n", 268 | " # Get batch.\n", 269 | " if self.buffer_count < self.buffer_size:\n", 270 | " batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :]\n", 271 | " else:\n", 272 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 273 | "\n", 274 | " s = batch[:, :self.s_space]\n", 275 | " s_n = batch[:, -self.s_space:]\n", 276 | " a = batch[:, self.s_space].reshape((-1))\n", 277 | " r = batch[:, self.s_space + 1]\n", 278 | "\n", 279 | " # 1. Calculate q_next_predict and q_next_target.\n", 280 | " q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], {\n", 281 | " self.s: s_n, self.s_n: s_n\n", 282 | " })\n", 283 | "\n", 284 | " # 2. Select a_indices in q_next_predict.\n", 285 | " a_indices = np.argmax(q_next_predict, axis=1)\n", 286 | "\n", 287 | " # 3. Select Q values with a_indices\n", 288 | " q_next = q_next_target[np.arange(0, self.batch_size), a_indices]\n", 289 | "\n", 290 | " # 4. Calculate q_real.\n", 291 | " q_real = r + self.gamma * q_next\n", 292 | "\n", 293 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 294 | " self.s: s, self.a: a, self.q_n: q_real\n", 295 | " })\n", 296 | "\n", 297 | " self.training_step += 1" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Running" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "if __name__ == '__main__':\n", 314 | "\n", 315 | " def main(_):\n", 316 | " # Make env.\n", 317 | " env = gym.make('CartPole-v0')\n", 318 | " env.seed(1)\n", 319 | " env = env.unwrapped\n", 320 | " # Init agent.\n", 321 | " agent = Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'PPO',\n", 323 | " KEY_TRAIN_EPISODE: 10000\n", 324 | " })\n", 325 | " start_game(env, agent)\n", 326 | "\n", 327 | "\n", 328 | " if __name__ == '__main__':\n", 329 | " tf.app.run()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 结尾" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "略了。" 344 | ] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.5.4" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /note/DoubleDQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Double DQN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 背景" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Double DQN是DQN(Deep 
Q Network)的一种改进,旨在解决DQN训练过程中存在的过估计(Overestimating)问题。在训练过程中,与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个,从而一定程度上避免了过度估计,提高了训练DQN的稳定性和速度。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## DQN" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "即Deep Q Network,DQN有两个网络,分别是预测网络(Predict Q Network)和目标网络(Target Q Network),预测网络用来预测当前状态对应各个动作的Q值,目标网络用来预测下一个,或者下第几个状态各个动作的Q值,这个取决于训练过程采用时间差分(Temporal Difference)还是蒙特卡洛(MC)方法,以TD的训练过程为例,我们期望对采样过的每一个状态、动作、奖励元组最小化下式:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "$$\n", 43 | "\\left( Q \\left( s_j, a_j; \\theta \\right) - y_j \\right)^2\n", 44 | "$$" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "其中Q对应的θ即预测网络,它接受当前状态,输出一个当前状态对应各个动作的Q值,然后选取当前动作对应的那个Q值。$y_j$是Ground Truth标签,它是由目标网络计算得出:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "$$\n", 59 | "y_j=\n", 60 | "\\begin{cases}\n", 61 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 62 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( s_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 63 | "\\end{cases}\n", 64 | "$$" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "其中Q对应的θ-即是目标网络,当采取这个动作后回合结束,则标签即是这次动作产生的奖励,如果回合未结束,则标签将由两部分构成,第一部分即是这次动作产生的奖励,另一部分则是由目标网络计算,即计算下一个状态各个动作对应的Q值,然后选取最大的那个Q值。" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "需要注意的是,目标网络的参数是设定是不可train的,在训练经过M次后,我们会将预测网络被更新的全部参数复制给目标网络,其中M次的M是一个可调的超参数,这样的一个直觉的好处就是避免了震荡。" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Experience Replay" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "在实作中,On-Policy的DQN表现非常不稳定,一种改进被称之为经验回放(Experience Replay)的技术通过缓存每一步状态、动作、奖励、下一状态元组,在一回合结束后批量训练多次,将On-Policy的过程转化为Off-Policy,提高了DQN的训练速度和稳定性,具体的实现非常直觉,即维护一个指定大小的缓存数组,每回合用新产生的N个状态、动作、奖励、下一状态元组随机替换掉缓存池中现有的N个,然后再回合结束后做数次训练。" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Double DQN" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "正如背景中提到的:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "> 与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个。\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Double DQN与DQN相同的是他们都有被称之为预测网络与目标网络的两个网络,只是在实作过程中,标签的计算过程做了修正:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "$$\n", 128 | "y_j=\n", 129 | "\\begin{cases}\n", 130 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 131 | "r_j + \\gamma Q \\left( s_{j+1}, \\max_{a^{\\prime}} Q \\left (s_{j+1}, a^{\\prime}; \\theta\\right) ; \\theta^{-} \\right)& \\text{otherwise}\n", 132 | "\\end{cases}\n", 133 | "$$\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "可以看出这个修正非常地直觉,在实验中,也确实要比原始的DQN训练稳定。" 
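"A tiny numerical illustration of the two targets (all Q-values below are made up, mirroring the variable names used in the implementation further down):\n",
"\n",
"```python\n",
"import numpy as np\n",
"\n",
"q_next_predict = np.array([1.0, 2.5, 2.4])   # predict net, next state\n",
"q_next_target  = np.array([1.1, 2.0, 3.0])   # target net, next state\n",
"r, gamma = 1.0, 0.9\n",
"\n",
"y_dqn    = r + gamma * q_next_target.max()       # vanilla DQN: 1 + 0.9 * 3.0 = 3.70\n",
"a_star   = int(q_next_predict.argmax())          # predict net picks action 1\n",
"y_double = r + gamma * q_next_target[a_star]     # Double DQN: 1 + 0.9 * 2.0 = 2.80\n",
"```\n",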
141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Experiment" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 160 | " return f(*args, **kwds)\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# coding=utf-8\n", 166 | "\n", 167 | "import numpy as np\n", 168 | "import gym\n", 169 | "\n", 170 | "import sys\n", 171 | "sys.path.append('..')\n", 172 | "\n", 173 | "from base.model import *\n", 174 | "from utility.launcher import start_game\n", 175 | "\n", 176 | "\n", 177 | "class Agent(BaseRLModel):\n", 178 | "\n", 179 | " def __init__(self, a_space, s_space, **options):\n", 180 | " super(Agent, self).__init__(a_space, s_space, **options)\n", 181 | "\n", 182 | " self._init_input()\n", 183 | " self._init_nn()\n", 184 | " self._init_op()\n", 185 | " self._init_saver()\n", 186 | "\n", 187 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 188 | " self.buffer_count = 0\n", 189 | "\n", 190 | " self.update_target_net_step = 200\n", 191 | "\n", 192 | " self.session.run(tf.global_variables_initializer())\n", 193 | "\n", 194 | " def _init_input(self, *args):\n", 195 | " with tf.variable_scope('input'):\n", 196 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 197 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 198 | " self.q_n = tf.placeholder(tf.float32, [None, ])\n", 199 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 200 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 201 | "\n", 202 | " def _init_nn(self, *args):\n", 203 | " # w,b initializer\n", 204 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 205 | " b_initializer = tf.constant_initializer(0.1)\n", 206 | "\n", 207 | " with tf.variable_scope('predict_q_net'):\n", 208 | " phi_state = tf.layers.dense(self.s,\n", 209 | " 64,\n", 210 | " tf.nn.relu,\n", 211 | " kernel_initializer=w_initializer,\n", 212 | " bias_initializer=b_initializer)\n", 213 | "\n", 214 | " self.q_predict = tf.layers.dense(phi_state,\n", 215 | " self.a_space,\n", 216 | " kernel_initializer=w_initializer,\n", 217 | " bias_initializer=b_initializer)\n", 218 | "\n", 219 | " with tf.variable_scope('target_q_net'):\n", 220 | " phi_state_next = tf.layers.dense(self.s_n,\n", 221 | " 64,\n", 222 | " tf.nn.relu,\n", 223 | " kernel_initializer=w_initializer,\n", 224 | " bias_initializer=b_initializer,\n", 225 | " trainable=False)\n", 226 | "\n", 227 | " self.q_target = tf.layers.dense(phi_state_next,\n", 228 | " self.a_space,\n", 229 | " kernel_initializer=w_initializer,\n", 230 | " bias_initializer=b_initializer,\n", 231 | " trainable=False)\n", 232 | "\n", 233 | " def _init_op(self):\n", 234 | "\n", 235 | " with tf.variable_scope('q_predict'):\n", 236 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 237 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 238 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 239 | "\n", 240 | " with tf.variable_scope('loss'):\n", 241 | " self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval)\n", 242 | "\n", 243 | " with 
tf.variable_scope('train'):\n", 244 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 245 | "\n", 246 | " with tf.variable_scope('update_target_net'):\n", 247 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 248 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 249 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 250 | "\n", 251 | " def predict(self, s):\n", 252 | " if np.random.uniform() < self.epsilon or self.mode == 'test':\n", 253 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 254 | " else:\n", 255 | " a = np.random.randint(0, self.a_space)\n", 256 | " return a\n", 257 | "\n", 258 | " def snapshot(self, s, a, r, s_n):\n", 259 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 260 | " self.buffer_count += 1\n", 261 | "\n", 262 | " def train(self):\n", 263 | "\n", 264 | " for train_step in range(self.train_steps):\n", 265 | " # Update target net if need.\n", 266 | " if self.training_step % self.update_target_net_step == 0:\n", 267 | " self.session.run(self.update_q_net)\n", 268 | " # Get batch.\n", 269 | " if self.buffer_count < self.buffer_size:\n", 270 | " batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :]\n", 271 | " else:\n", 272 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 273 | "\n", 274 | " s = batch[:, :self.s_space]\n", 275 | " s_n = batch[:, -self.s_space:]\n", 276 | " a = batch[:, self.s_space].reshape((-1))\n", 277 | " r = batch[:, self.s_space + 1]\n", 278 | "\n", 279 | " # 1. Calculate q_next_predict and q_next_target.\n", 280 | " q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], {\n", 281 | " self.s: s_n, self.s_n: s_n\n", 282 | " })\n", 283 | "\n", 284 | " # 2. Select a_indices in q_next_predict.\n", 285 | " a_indices = np.argmax(q_next_predict, axis=1)\n", 286 | "\n", 287 | " # 3. Select Q values with a_indices\n", 288 | " q_next = q_next_target[np.arange(0, self.batch_size), a_indices]\n", 289 | "\n", 290 | " # 4. 
Calculate q_real.\n", 291 | " q_real = r + self.gamma * q_next\n", 292 | "\n", 293 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 294 | " self.s: s, self.a: a, self.q_n: q_real\n", 295 | " })\n", 296 | "\n", 297 | " self.training_step += 1" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Running" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "if __name__ == '__main__':\n", 314 | "\n", 315 | " def main(_):\n", 316 | " # Make env.\n", 317 | " env = gym.make('CartPole-v0')\n", 318 | " env.seed(1)\n", 319 | " env = env.unwrapped\n", 320 | " # Init agent.\n", 321 | " agent = Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'PPO',\n", 323 | " KEY_TRAIN_EPISODE: 10000\n", 324 | " })\n", 325 | " start_game(env, agent)\n", 326 | "\n", 327 | "\n", 328 | " if __name__ == '__main__':\n", 329 | " tf.app.run()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 结尾" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "相对于DQN,运气到了可能3000步在小车倒立杆收敛,Double DQN如果运气到了可能只需要1500步。" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.5.4" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /note/DQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个智能体(agent),尽可能不断地左右移动小车,使得小车上的杆不倒,我们首先定义CartPole游戏:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "CartPole游戏即是强化学习模型的enviorment,它与agent交互,实时更新state,内部定义了reward function,其中state有以下定义:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "state每一个维度分别代表了:\n", 31 | "\n", 32 | "- 小车位置,它的取值范围是-2.4到2.4\n", 33 | "- 小车速度,它的取值范围是负无穷到正无穷\n", 34 | "- 杆的角度,它的取值范围是-41.8°到41.8°\n", 35 | "- 杆的角速,它的取值范围是负无穷到正无穷" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "action是一个2维向量,每一个维度分别代表向左和向右移动。" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "$$\n", 50 | "action \\in \\mathbb{R}^2\n", 51 | "$$" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# DQN" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "我们将设计一个网络,作为状态-动作值函数(state-action value function),其输入是state,输出是对应各个action的value,并TD(Temporal 
Difference)进行迭代训练直至收敛。我们将定义两个这样的网络,分别记作$\\theta$和$\\theta^-$,分别代表估计网络与目标网络。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "我们希望最小化:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "$$\n", 80 | "\\left( y_j - Q \\left( \\phi_j, a_j; \\theta \\right) \\right)^2\n", 81 | "$$" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "其中,$a_j$具有以下形式:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "$$\n", 96 | "a_j = \\mathrm{argmax}_{a} Q \\left( \\phi(s_j), a; \\theta\\right)\n", 97 | "$$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "其中,$y_j$具有以下形式:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "$$\n", 112 | "f(x)=\n", 113 | "\\begin{cases}\n", 114 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 115 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( \\phi_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 116 | "\\end{cases}$$\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "在最小化TD-Error时,我们将固定目标网络,只对估计网络做梯度反向传播,每次到达一定迭代次数后,将估计网络的权重复制到目标网络。在这个过程中,需要用到经验回放(Experience Replay)技术,即将每一次迭代观测到的$s_t, r_t, a_t, s_{t+1}$作为一个元组缓存,然后在这些缓存中随机抽取元组做批次梯度下降。" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# 代码实现" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 2, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 144 | " return f(*args, **kwds)\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# coding=utf-8\n", 150 | "\n", 151 | "import tensorflow as tf\n", 152 | "import numpy as np\n", 153 | "import gym\n", 154 | "import sys\n", 155 | "\n", 156 | "sys.path.append('..')\n", 157 | "\n", 158 | "from base.model import *\n", 159 | "\n", 160 | "%matplotlib inline" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "class Agent(BaseRLModel):\n", 170 | "\n", 171 | " def __init__(self, session, env, a_space, s_space, **options):\n", 172 | " super(Agent, self).__init__(session, env, a_space, s_space, **options)\n", 173 | "\n", 174 | " self._init_input()\n", 175 | " self._init_nn()\n", 176 | " self._init_op()\n", 177 | " self._init_saver()\n", 178 | "\n", 179 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 180 | " self.buffer_count = 0\n", 181 | "\n", 182 | " self.total_train_step = 0\n", 183 | "\n", 184 | " self.update_target_net_step = 200\n", 185 | "\n", 186 | " self.session.run(tf.global_variables_initializer())\n", 187 | "\n", 188 | " def _init_input(self, *args):\n", 189 | " with tf.variable_scope('input'):\n", 190 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 191 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 192 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 193 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 194 | "\n", 195 | " def _init_nn(self, *args):\n", 196 | " with 
tf.variable_scope('actor_net'):\n", 197 | " # w,b initializer\n", 198 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 199 | " b_initializer = tf.constant_initializer(0.1)\n", 200 | "\n", 201 | " with tf.variable_scope('predict_q_net'):\n", 202 | " phi_state = tf.layers.dense(self.s,\n", 203 | " 32,\n", 204 | " tf.nn.relu,\n", 205 | " kernel_initializer=w_initializer,\n", 206 | " bias_initializer=b_initializer)\n", 207 | "\n", 208 | " self.q_predict = tf.layers.dense(phi_state,\n", 209 | " self.a_space,\n", 210 | " kernel_initializer=w_initializer,\n", 211 | " bias_initializer=b_initializer)\n", 212 | "\n", 213 | " with tf.variable_scope('target_q_net'):\n", 214 | " phi_state_next = tf.layers.dense(self.s_n,\n", 215 | " 32,\n", 216 | " tf.nn.relu,\n", 217 | " kernel_initializer=w_initializer,\n", 218 | " bias_initializer=b_initializer)\n", 219 | "\n", 220 | " self.q_target = tf.layers.dense(phi_state_next,\n", 221 | " self.a_space,\n", 222 | " kernel_initializer=w_initializer,\n", 223 | " bias_initializer=b_initializer)\n", 224 | "\n", 225 | " def _init_op(self):\n", 226 | " with tf.variable_scope('q_real'):\n", 227 | " # size of q_value_real is [BATCH_SIZE, 1]\n", 228 | " max_q_value = tf.reduce_max(self.q_target, axis=1)\n", 229 | " q_next = self.r + self.gamma * max_q_value\n", 230 | " self.q_next = tf.stop_gradient(q_next)\n", 231 | "\n", 232 | " with tf.variable_scope('q_predict'):\n", 233 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 234 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 235 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 236 | "\n", 237 | " with tf.variable_scope('loss'):\n", 238 | " self.loss_func = tf.reduce_mean(tf.squared_difference(self.q_next, self.q_eval, name='mse'))\n", 239 | "\n", 240 | " with tf.variable_scope('train'):\n", 241 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 242 | "\n", 243 | " with tf.variable_scope('update_target_net'):\n", 244 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 245 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 246 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 247 | "\n", 248 | " def predict(self, s):\n", 249 | " if np.random.uniform() < self.epsilon:\n", 250 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 251 | " else:\n", 252 | " a = np.random.randint(0, self.a_space)\n", 253 | " return a\n", 254 | "\n", 255 | " def snapshot(self, s, a, r, s_n):\n", 256 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 257 | " self.buffer_count += 1\n", 258 | "\n", 259 | " def train(self):\n", 260 | " if self.total_train_step % self.update_target_net_step == 0:\n", 261 | " self.session.run(self.update_q_net)\n", 262 | "\n", 263 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 264 | "\n", 265 | " s = batch[:, :self.s_space]\n", 266 | " s_n = batch[:, -self.s_space:]\n", 267 | " a = batch[:, self.s_space].reshape((-1))\n", 268 | " r = batch[:, self.s_space + 1]\n", 269 | "\n", 270 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 271 | " self.s: s, self.a: a, self.r: r, self.s_n: s_n\n", 272 | " })\n", 273 | "\n", 274 | " def run(self):\n", 275 | " if self.mode == 'train':\n", 276 | " for episode in range(self.train_episodes):\n", 277 
| " s, r_episode = self.env.reset(), 0\n", 278 | " while True:\n", 279 | " # if episode > 400:\n", 280 | " # self.env.render()\n", 281 | " a = self.predict(s)\n", 282 | " s_n, r, done, _ = self.env.step(a)\n", 283 | " if done:\n", 284 | " r = -5\n", 285 | " r_episode += r\n", 286 | " self.snapshot(s, a, r_episode, s_n)\n", 287 | " s = s_n\n", 288 | " if done:\n", 289 | " break\n", 290 | " if self.buffer_count > self.buffer_size:\n", 291 | " self.train()\n", 292 | " if episode % 200 == 0:\n", 293 | " self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))\n", 294 | " self.save()\n", 295 | " else:\n", 296 | " for episode in range(self.eval_episodes):\n", 297 | " s, r_episode = self.env.reset()\n", 298 | " while True:\n", 299 | " a = self.predict(s)\n", 300 | " s_n, r, done, _ = self.env.step(a)\n", 301 | " r_episode += r\n", 302 | " s = s_n\n", 303 | " if done:\n", 304 | " break" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 4, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "def main(_):\n", 314 | " # Make env.\n", 315 | " env = gym.make('CartPole-v0')\n", 316 | " env.seed(1)\n", 317 | " env = env.unwrapped\n", 318 | " # Init session.\n", 319 | " session = tf.Session()\n", 320 | " # Init agent.\n", 321 | " agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'DQN',\n", 323 | " KEY_TRAIN_EPISODE: 3000\n", 324 | " })\n", 325 | " agent.run()\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n" 338 | ] 339 | }, 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "Episode: 0 | Rewards: 3.0\n", 345 | "Episode: 200 | Rewards: 4.0\n", 346 | "Episode: 400 | Rewards: 4.0\n", 347 | "Episode: 600 | Rewards: 4.0\n", 348 | "Episode: 800 | Rewards: 3.0\n", 349 | "Episode: 1000 | Rewards: 3.0\n", 350 | "Episode: 1200 | Rewards: 36.0\n", 351 | "Episode: 1400 | Rewards: 50.0\n", 352 | "Episode: 1600 | Rewards: 31.0\n", 353 | "Episode: 1800 | Rewards: 187.0\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "main(_)" 359 | ] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.5.4" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 1 383 | } 384 | -------------------------------------------------------------------------------- /note/PPO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Proximal Policy Optimization (PPO)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 背景" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Proximal Policy Optimization,简称PPO,即近端策略优化,是对Policy Graident,即策略梯度的一种改进算法。PPO的核心精神在于,通过一种被称之为Importce Sampling的方法,将Policy 
training process of Policy Gradient into an off-policy one, i.e. to move from online to offline learning, which is in some sense analogous to Experience Replay in value-based methods. In experiments this change brings a clear improvement in both training speed and performance over Policy Gradient." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Policy Gradient" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Policy Gradient is a policy-based reinforcement learning algorithm. Unlike value-based methods such as DQN, Double-DQN and Dueling-DQN, which learn indirectly by estimating the action-value function, Policy Gradient samples states, actions and rewards and directly maximizes the expected reward. Both PPO and PG aim to maximize the expected reward; when sampling is sufficient, the expected reward can be approximated by the average reward over N episodes:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "$$\n", 45 | "\\bar{R}_{\\theta} = \\sum_{\\tau} R(\\tau) P(\\tau \\lvert \\theta) \\approx \\frac{1}{N} \\sum^{N}_{n=1} R(\\tau^{n})\n", 46 | "$$" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "The total reward of the n-th episode, $R(\\tau^n)$, is defined as:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "$$\n", 61 | "R(\\tau) = \\sum^{T}_{t=1} r_t\n", 62 | "$$" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "The derivation of $\\nabla \\bar{R}_{\\theta}$ was covered in detail in the earlier note on Policy Gradient, so it is skipped here; the resulting formula is:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} R(\\tau^n) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "In essence this is the cross entropy between the sampled actions of the N episodes and the actions output by the network, weighted by $R(\\tau^n)$; the reward supplies the direction for gradient descent. With $\\nabla \\bar{R}_{\\theta}$ derived, the network could already be trained by backpropagation, but in practice $R(\\tau^n)$ is usually adjusted to the specific problem." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Actor-Critic Model" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "Adjusting $R(\\tau^n)$ is usually necessary, meaningful and intuitive. Take CartPole-v0 and MountainCar-v0 as examples: the action taken in each state contributes differently to the episode's total reward. In CartPole, the actions taken in the first few states largely determine whether the pole falls quickly, so intuitively they matter more; in MountainCar, the actions taken just before the car climbs the hill determine whether it makes it up at all, so intuitively those matter more." 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "This leads to the first improvement: for every state-action pair we replace $R(\\tau^n)$ with the discounted reward-to-go:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "$$\n", 114 | "R(\\tau^n) \\rightarrow \\sum^{T_n}_{t^{\\prime}=t} \\gamma^{t^{\\prime}-t} r^{n}_{t^{\\prime}}\n", 115 | "$$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "The gradient formula is then rewritten as:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "$$\n", 130 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\left( \\sum^{T_n}_{t^{\\prime}=t} \\gamma^{t^{\\prime}-t} r^{n}_{t^{\\prime}} \\right) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 131 | "$$" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "There is still an overestimation problem. In practice state-action sampling is usually insufficient, so some actions or states are almost never sampled; during gradient descent the probabilities of the actions taken in those states may then be amplified or suppressed far too much. Because the output layer is a softmax, these probabilities trade off against one another, which is clearly not what we want. Hence the second improvement: introduce a Baseline, which may be a constant hyperparameter to tune, or a Critic, which is a network to be trained." 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "If a Critic is introduced, the model is called an Actor-Critic model, and the gradient of the average reward over N episodes becomes:" 146 | ] 147 | }, 148 | {
149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "During training the two networks are updated in turn within the same step. Models of this kind are now widely used and have shown good results experimentally." 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Importance Sampling" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "As mentioned above, a core improvement of PPO is turning the on-policy training of Policy Gradient into off-policy training, i.e. from online to offline learning. The mathematical tool behind this conversion is Importance Sampling. If X is a continuous random variable with probability density $p(x)$, the expectation of $f(x)$ is computed as:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "$$\n", 183 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int f(x)p(x)dx\n", 184 | "$$" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "If we have another probability density $q(x)$ for the same random variable X, then the following relation holds:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "$$\n", 199 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int f(x) \\cdot p(x)dx = \\int f(x) \\frac{p(x)}{q(x)} \\cdot q(x) dx = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 200 | "$$" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "In the rightmost term, $\\frac{p(x)}{q(x)}$ is called the importance weight. Mapped onto our problem, $f(x)$ is $A^{\\theta}(a_t \\lvert s_t)$, and $\\frac{p(x)}{q(x)}$ is the ratio of the probabilities that the new and the old policy assign, in the current state, to the action taken. More concretely, in CartPole the actions are discrete and the network outputs a discrete probability distribution from which the action is sampled; the new and the old policy each assign that action a probability in the current state, and $\\frac{p(x)}{q(x)}$ is the ratio of those two probabilities." 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "With this trick, given sufficient sampling, we may treat the two sides as equal:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "$$\n", 222 | "E_{x \\sim p} \\left[ f(x) \\right] = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 223 | "$$" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## Proximal Policy Optimization" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "Finally we arrive at PPO. Importance Sampling provides the basis for turning on-policy training into off-policy training: we can sample extensively with the old policy $q(x)$ and then improve the new policy $p(x)$, repeating the update N times per episode instead of just once. This greatly reduces the time the original PG algorithm spends sampling state-action-reward tuples online while preserving the quality of training, and the gradient of the average reward over N episodes is rewritten as:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "$$\n", 245 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 246 | "$$" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "In practice, the ratio $\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}$ is additionally clipped:" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "$$\n", 261 | "clip(\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}, 1 - \\epsilon, 1 + \\epsilon)\n", 262 | "$$" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | 
"相当于一个正则化的操作,其中$\\epsilon$是一个可调整的超参数,至此,PPO也就介绍完了。" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Experiment" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 5, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# coding=utf-8\n", 286 | "\n", 287 | "import tensorflow as tf\n", 288 | "import numpy as np\n", 289 | "import gym\n", 290 | "import sys\n", 291 | "\n", 292 | "sys.path.append('..')\n", 293 | "\n", 294 | "from base.model import BaseRLModel\n", 295 | "\n", 296 | "class Agent(BaseRLModel):\n", 297 | "\n", 298 | " def __init__(self, session, env, a_space, s_space, **options):\n", 299 | " super(Agent, self).__init__(session, env, a_space, s_space, **options)\n", 300 | "\n", 301 | " self._init_input()\n", 302 | " self._init_nn()\n", 303 | " self._init_op()\n", 304 | " self._init_saver()\n", 305 | "\n", 306 | " self.a_buffer = []\n", 307 | " self.s_buffer = []\n", 308 | " self.r_buffer = []\n", 309 | " self.a_p_r_buffer = []\n", 310 | "\n", 311 | " self.session.run(tf.global_variables_initializer())\n", 312 | "\n", 313 | " def _init_input(self, *args):\n", 314 | " with tf.variable_scope('input'):\n", 315 | " self.s = tf.placeholder(tf.float32, [None, self.s_space], name='s')\n", 316 | " self.a = tf.placeholder(tf.int32, [None, ], name='a')\n", 317 | " self.r = tf.placeholder(tf.float32, [None, ], name='r')\n", 318 | " self.adv = tf.placeholder(tf.float32, [None, ], name='adv')\n", 319 | " self.a_p_r = tf.placeholder(tf.float32, [None, ], name='a_p_r')\n", 320 | "\n", 321 | " def _init_nn(self, *args):\n", 322 | " self.advantage, self.value = self._init_critic_net('critic_net')\n", 323 | " self.a_prob_eval, self.a_logits_eval = self._init_actor_net('eval_actor_net')\n", 324 | " self.a_prob_target, self.a_logits_target = self._init_actor_net('target_actor_net', trainable=False)\n", 325 | "\n", 326 | " def _init_op(self):\n", 327 | " with tf.variable_scope('critic_loss_func'):\n", 328 | " # loss func.\n", 329 | " self.c_loss_func = tf.losses.mean_squared_error(labels=self.r, predictions=self.value)\n", 330 | " with tf.variable_scope('critic_optimizer'):\n", 331 | " # critic optimizer.\n", 332 | " self.c_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.c_loss_func)\n", 333 | " with tf.variable_scope('update_target_actor_net'):\n", 334 | " # Get eval w, b.\n", 335 | " params_e = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_actor_net')\n", 336 | " params_t = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor_net')\n", 337 | " self.update_target_a_op = [tf.assign(t, e) for t, e in zip(params_t, params_e)]\n", 338 | " with tf.variable_scope('actor_loss_func'):\n", 339 | " # one hot a.\n", 340 | " a_one_hot = tf.one_hot(self.a, self.a_space)\n", 341 | " # cross entropy.\n", 342 | " cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits_eval)\n", 343 | " # loss func.\n", 344 | " self.a_loss_func = tf.reduce_mean(cross_entropy * self.adv * self.a_p_r)\n", 345 | " with tf.variable_scope('actor_optimizer'):\n", 346 | " self.a_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.a_loss_func)\n", 347 | "\n", 348 | " def _init_actor_net(self, scope, trainable=True):\n", 349 | " with tf.variable_scope(scope):\n", 350 | " # Kernel initializer.\n", 351 | " w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 352 | " # First dense.\n", 353 | " f_dense = tf.layers.dense(self.s, 32, 
tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 354 | "            # Second dense.\n", 355 | "            s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 356 | "            # Action logits.\n", 357 | "            a_logits = tf.layers.dense(s_dense, self.a_space, trainable=trainable, kernel_initializer=w_initializer)\n", 358 | "            # Action prob.\n", 359 | "            a_prob = tf.nn.softmax(a_logits)\n", 360 | "            return a_prob, a_logits\n", 361 | "\n", 362 | "    def _init_critic_net(self, scope):\n", 363 | "        with tf.variable_scope(scope):\n", 364 | "            # Kernel initializer.\n", 365 | "            w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 366 | "            # First dense.\n", 367 | "            f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer)\n", 368 | "            # Value.\n", 369 | "            value = tf.layers.dense(f_dense, 1)\n", 370 | "            value = tf.reshape(value, [-1, ])\n", 371 | "            # Advantage.\n", 372 | "            advantage = self.r - value\n", 373 | "            return advantage, value\n", 374 | "\n", 375 | "    def predict(self, s):\n", 376 | "        a_prob_eval, a_prob_target = self.session.run([self.a_prob_eval, self.a_prob_target], {self.s: [s]})\n", 377 | "        # Sample an action from the eval policy's output distribution.\n", 378 | "        a = np.random.choice(range(a_prob_eval.shape[1]), p=a_prob_eval.ravel())\n", 379 | "        # Probability ratio between eval (new) and target (old) policy for the chosen action.\n", 380 | "        self.a_p_r_buffer.append(a_prob_eval[0, a] / a_prob_target[0, a])\n", 381 | "        return a\n", 382 | "\n", 383 | "    def snapshot(self, s, a, r, _):\n", 384 | "        self.a_buffer.append(a)\n", 385 | "        self.s_buffer.append(s)\n", 386 | "        self.r_buffer.append(r)\n", 387 | "\n", 388 | "    def train(self):\n", 389 | "        # Copy the step rewards of this episode.\n", 390 | "        r_buffer = list(self.r_buffer)\n", 391 | "        # Init r_tau.\n", 392 | "        r_tau = 0\n", 393 | "        # Calculate discounted rewards-to-go r_tau.\n", 394 | "        for index in reversed(range(0, len(r_buffer))):\n", 395 | "            r_tau = r_tau * self.gamma + r_buffer[index]\n", 396 | "            self.r_buffer[index] = r_tau\n", 397 | "        # Calculate adv.\n", 398 | "        adv_buffer = self.session.run(self.advantage, {self.s: self.s_buffer, self.r: self.r_buffer})\n", 399 | "        # Minimize loss.\n", 400 | "        self.session.run([self.a_optimizer, self.c_optimizer], {\n", 401 | "            self.adv: adv_buffer,\n", 402 | "            self.s: self.s_buffer,\n", 403 | "            self.a: self.a_buffer,\n", 404 | "            self.r: self.r_buffer,\n", 405 | "            self.a_p_r: self.a_p_r_buffer,\n", 406 | "        })\n", 407 | "        self.s_buffer = []\n", 408 | "        self.a_buffer = []\n", 409 | "        self.r_buffer = []\n", 410 | "        self.a_p_r_buffer = []\n", 411 | "\n", 412 | "    def run(self):\n", 413 | "        if self.mode == 'train':\n", 414 | "            for episode in range(self.train_episodes):\n", 415 | "                s, r_episode = self.env.reset(), 0\n", 416 | "                while True:\n", 417 | "                    if episode > 200:\n", 418 | "                        self.env.render()\n", 419 | "                    a = self.predict(s)\n", 420 | "                    s_n, r, done, _ = self.env.step(a)\n", 421 | "                    if done:\n", 422 | "                        r = -5\n", 423 | "                    r_episode += r\n", 424 | "                    self.snapshot(s, a, r, s_n)\n", 425 | "                    s = s_n\n", 426 | "                    if done:\n", 427 | "                        break\n", 428 | "                self.train()\n", 429 | "                if episode % 25 == 0:\n", 430 | "                    self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))\n", 431 | "                    self.save()\n", 432 | "        else:\n", 433 | "            for episode in range(self.eval_episodes):\n", 434 | "                s, r_episode = self.env.reset(), 0\n", 435 | "                while True:\n", 436 | "                    a = self.predict(s)\n", 437 | "                    s_n, r, done, _ = self.env.step(a)\n", 438 | "                    r_episode += r\n", 439 | "                    s = s_n\n", 440 | "                    if done:\n", 441 | "                        break" 442 | ] 443 | }, 444 | { 445 | 
"cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "## Running" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# Make env.\n", 458 | "env = gym.make('CartPole-v0')\n", 459 | "env.seed(1)\n", 460 | "env = env.unwrapped\n", 461 | "# Init session.\n", 462 | "session = tf.Session()\n", 463 | "# Init agent.\n", 464 | "agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{\n", 465 | " 'model_name': 'PolicyGradient',\n", 466 | "})\n", 467 | "agent.run()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## 结尾" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "就目前观察,PPO在小车倒立杆问题上的收敛速度几倍于PG与一票基于值迭代的方法,让我非常惊讶。" 482 | ] 483 | } 484 | ], 485 | "metadata": { 486 | "kernelspec": { 487 | "display_name": "Python 3", 488 | "language": "python", 489 | "name": "python3" 490 | }, 491 | "language_info": { 492 | "codemirror_mode": { 493 | "name": "ipython", 494 | "version": 3 495 | }, 496 | "file_extension": ".py", 497 | "mimetype": "text/x-python", 498 | "name": "python", 499 | "nbconvert_exporter": "python", 500 | "pygments_lexer": "ipython3", 501 | "version": "3.5.4" 502 | } 503 | }, 504 | "nbformat": 4, 505 | "nbformat_minor": 1 506 | } 507 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/PPO-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Proximal Policy Optimization (PPO)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 背景" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Proximal Policy Optimization,简称PPO,即近端策略优化,是对Policy Graident,即策略梯度的一种改进算法。PPO的核心精神在于,通过一种被称之为Importce Sampling的方法,将Policy Gradient中On-policy的训练过程转化为Off-policy,即从在线学习转化为离线学习,某种意义上与基于值迭代算法中的Experience Replay有异曲同工之处。通过这个改进,训练速度与效果在实验上相较于Policy Gradient具有明显提升。" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Policy Gradient" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Policy Gradient是一种基于策略迭代的强化学习算法,不同于基于值迭代的DQN、Double-DQN、Duling-DQN通过间接地估计动作-状态值函数来学习的过程,Policy Gradient直接地通过采样状态、动作、奖励,然后期望直接最大化奖励的期望。PPO与PG都希望最大化奖励的期望,当采样足够充分时,奖励的期望可以近似为N回合的奖励的平均值:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "$$\n", 45 | "\\bar{R}_{\\theta} = \\sum_{\\tau} R(\\tau) P(\\tau \\lvert \\theta) \\approx \\frac{1}{N} \\sum^{N}_{n=1} R(\\tau^{n})\n", 46 | "$$" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "上式中的第n回合的奖励值之和$R(\\tau^n)$被定义为如下形式:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "$$\n", 61 | "R(\\tau) = \\sum^{T}_{t=1} r_t\n", 62 | "$$" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "在前篇专门介绍Policy Gradient文章中,已经详细地推导了关于$\\nabla \\bar{R}_{\\theta}$的计算方法,所以在这里的具体推导过程将略过,最后关于$\\nabla \\bar{R}_{\\theta}$的计算公式将有如下形式:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} 
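
The markdown in note/PPO.ipynb derives the clipped ratio, but the Experiment cell above trains on an unclipped ratio. The following is a minimal, self-contained sketch (not part of the repository) of the clipped surrogate term described by the formula; the names ratio, adv and the value epsilon = 0.2 are illustrative assumptions, not taken from the notebook.

# coding=utf-8
import numpy as np
import tensorflow as tf

# Illustrative clip-range hyperparameter (the notebook leaves epsilon unspecified).
epsilon = 0.2

# ratio stands for p_theta(a_t | s_t) / p_theta'(a_t | s_t); adv for the advantage estimates.
ratio = tf.placeholder(tf.float32, [None, ], name='ratio')
adv = tf.placeholder(tf.float32, [None, ], name='adv')

# PPO-Clip surrogate: pessimistic minimum of the unclipped and clipped terms,
# negated so that minimizing the loss maximizes the surrogate objective.
surrogate = tf.minimum(ratio * adv, tf.clip_by_value(ratio, 1.0 - epsilon, 1.0 + epsilon) * adv)
loss = -tf.reduce_mean(surrogate)

with tf.Session() as session:
    print(session.run(loss, {ratio: np.array([0.7, 1.0, 1.5], dtype=np.float32),
                             adv: np.array([1.0, -0.5, 2.0], dtype=np.float32)}))

In a full agent this loss would replace the cross-entropy-times-ratio loss used in the notebook's actor_loss_func scope, with ratio computed from the eval and target actor networks at update time.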