├── ann ├── __init__.py ├── RNN.py └── Dense.py ├── base ├── __init__.py └── model.py ├── playground ├── IRL.py ├── LinearRegression.py ├── SVM.py ├── A3C.py ├── NeuralNetwork.py ├── LogisticRegression.py ├── MLP.py ├── RegressionTree.py ├── PolicyGradient.py ├── DQN.py ├── PPO.py ├── DoubleDQN.py └── TensorFlowServing.py ├── deprecated ├── __init__.py └── main.py ├── utility ├── __init__.py ├── function.py ├── logger.py └── launcher.py ├── .gitignore ├── static ├── __init__.py └── checkpoints │ └── iris │ ├── biases.json │ └── weights.json ├── note ├── .ipynb_checkpoints │ ├── GQN-checkpoint.ipynb │ ├── PolicyGradient-checkpoint.ipynb │ ├── DQN-checkpoint.ipynb │ ├── GloVe-checkpoint.ipynb │ ├── A3C-checkpoint.ipynb │ ├── Word2Vec-checkpoint.ipynb │ ├── DoubleDQN-checkpoint.ipynb │ └── PPO-checkpoint.ipynb ├── GloVe.ipynb ├── Word2Vec.ipynb ├── A3C.ipynb ├── DoubleDQN.ipynb ├── DQN.ipynb └── PPO.ipynb ├── main.py ├── LICENSE └── README.md /ann/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /playground/IRL.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /deprecated/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .DS_Store 3 | *.pyc 4 | 5 | .idea/ 6 | 7 | static/* 8 | 9 | -------------------------------------------------------------------------------- /static/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') 4 | 5 | LOGS_DIR = os.path.join(os.path.dirname(__file__), 'logs') 6 | 7 | CACHES_DIR = os.path.join(os.path.dirname(__file__), 'caches') 8 | 9 | IMAGES_DIR = os.path.join(os.path.dirname(__file__), 'images') 10 | 11 | SUMMARIES_DIR = os.path.join(os.path.dirname(__file__), 'summaries') 12 | 13 | CHECKPOINTS_DIR = os.path.join(os.path.dirname(__file__), 'checkpoints') 14 | -------------------------------------------------------------------------------- /static/checkpoints/iris/biases.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | [ 4 | -0.31252127786192074 5 | ], 6 | [ 7 | -0.28839935674279304 8 | ], 9 | [ 10 | -0.032112358085811865 11 | ], 12 | [ 13 | -0.0131792178653517 14 | ], 15 | [ 16 | -0.030514840147082778 17 | ], 18 | [ 19 | -0.25601258705142743 20 | ], 21 | [ 22 | -0.0008761398625049 23 | ], 24 | [ 25 | -0.38658973158121007 26 | ], 27 | [ 28 | 0.21007817654928057 29 | ], 30 | [ 31 | 0.2266149898603481 32 | ] 33 | ], 34 | [ 35 | [ 36 | -0.006395863587906123 37 | ], 38 | [ 39 | 0.8470152609903324 40 | ], 41 | [ 42 | 0.13011377406245406 43 | ] 44 | ] 45 | ] -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/GQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# GQN" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.5.4" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 1 39 | } 40 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.datasets import load_iris 6 | from utility import function 7 | from ann.Dense import Dense 8 | 9 | np.random.seed(42) 10 | 11 | iris = load_iris() 12 | 13 | scaler = StandardScaler() 14 | scaler.fit(iris.data) 15 | 16 | x_data = scaler.transform(iris.data) 17 | y_data = np.zeros((150, 3)) 18 | y_data[np.arange(150), iris.target] = 1 19 | 20 | # activation_funcs = [function.tanh] * 1 21 | activation_funcs = [function.relu] * 1 22 | # activation_funcs = [function.sigmoid] * 1 23 | activation_funcs.append(function.linear) 24 | 25 | dense = Dense(x_space=4, y_space=3, hidden_units_list=[10], **{ 26 | "loss_func": function.mean_square_error, 27 | "activation_funcs": activation_funcs, 28 | "learning_rate": 0.01, 29 | "enable_logger": True, 30 | "model_name": 'iris', 31 | "batch_size": 30, 32 | 'model': 'train', 33 | }) 34 | 35 | dense.train(x_data, y_data) 36 | # dense.restore() 37 | dense.evaluate(x_data, y_data) 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Shuyu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /utility/function.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def relu(x): 5 | return min(x, 1e2) if x > 0 else 0.0 6 | 7 | 8 | def grad_relu(x): 9 | return 1.0 if x > 0 else 0.0 10 | 11 | 12 | def sigmoid(x): 13 | return 1.0 / (1.0 + np.power(np.e, min(-x, 1e2))) 14 | 15 | 16 | def grad_sigmoid(x): 17 | return sigmoid(x) * (1 - sigmoid(x)) 18 | 19 | 20 | def tanh(x): 21 | return np.tanh(x) 22 | 23 | 24 | def grad_tanh(x): 25 | return 1 - np.power(np.tanh(x), 2) 26 | 27 | 28 | def linear(x): 29 | return x 30 | 31 | 32 | def grad_linear(x): 33 | return 1.0 34 | 35 | 36 | def softmax(x): 37 | x_copy = x.copy() 38 | a = np.exp(x_copy - np.max(x_copy, axis=1, keepdims=True)) 39 | z = np.sum(a, axis=1, keepdims=True) 40 | return a / z 41 | 42 | 43 | def mean_square_error(y, label): 44 | return np.mean(np.sqrt(np.sum(np.power(y - label, 2)))) 45 | 46 | 47 | def grad_mean_square_error(y, label): 48 | return label - y 49 | 50 | 51 | def softmax_cross_entropy(y, label): 52 | return np.mean(np.sum(label * -np.log(softmax(y) + 1e-100))) 53 | 54 | 55 | def grad_softmax_cross_entropy(y, label): 56 | return label - y 57 | 58 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/PolicyGradient-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个Agent,它接受一个4维向量state,分别代表:小车的位置、小车的速度、杆的角度、杆的角速度,输出一个2维向量a,代表向左和向右移动。小车每一次向左或向右移动都会加1分,但是如果杆的角度大于±12°、小车的位置大于±2.4、行动次数大于200次,游戏将会结束。我们希望在游戏结束时得分尽可能大。" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [] 30 | } 31 | ], 32 | "metadata": { 33 | "kernelspec": { 34 | "display_name": "Python 3", 35 | "language": "python", 36 | "name": "python3" 37 | }, 38 | "language_info": { 39 | "codemirror_mode": { 40 | "name": "ipython", 41 | "version": 3 42 | }, 43 | "file_extension": ".py", 44 | "mimetype": "text/x-python", 45 | "name": "python", 46 | "nbconvert_exporter": "python", 47 | "pygments_lexer": "ipython3", 48 | "version": "3.5.4" 49 | } 50 | }, 51 | "nbformat": 4, 52 | "nbformat_minor": 1 53 | } 54 | -------------------------------------------------------------------------------- /playground/LinearRegression.py: -------------------------------------------------------------------------------- 1 | from mpl_toolkits.mplot3d import Axes3D 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | data_count = 100 6 | 7 | x_data = np.linspace(-20, 20, data_count) 8 | y_data = np.multiply(2, x_data) + 3 + np.random.normal(loc=0, scale=1.0, size=(data_count,)) 9 | 10 | x_data = x_data.reshape((-1, 1)) 11 | y_data = y_data.reshape((-1, 1)) 12 | 13 | w = 10 14 | b = 20 15 | y_predict = np.dot(x_data, w) + b 16 | 17 | w_sample = np.linspace(-10, 10, data_count).reshape((-1, 1)) 18 | b_sample = np.linspace(-10, 10, data_count).reshape((-1, 1)) 19 | 20 | x_data = x_data.reshape((-1, 1)) 21 | y_data = y_data.reshape((-1, 1)) 22 | 23 | loss = np.square(np.dot(w_sample, x_data.T) + b_sample - y_data) / data_count 24 | 25 | w_cache, b_cache, l_cache, = [], [], [] 26 | 27 | for iteration in range(2000): 28 | y_predict = w * x_data + b 29 | diff = y_predict - y_data 30 | grad_w = np.mean(diff * x_data) 31 | grad_b = np.mean(diff) 32 | w -= 0.003 * grad_w 33 | b -= 0.003 * grad_b 34 | w_cache.append(w) 35 | b_cache.append(b) 36 | l_cache.append(np.mean(diff)) 37 | 38 | w_cache = np.array(w_cache).reshape((-1,)) 39 | b_cache = np.array(w_cache).reshape((-1,)) 40 | l_cache = np.array(w_cache).reshape((-1,)) 41 | 42 | 43 | figure = plt.figure(figsize=(16, 9)) 44 | figure = Axes3D(figure) 45 | figure.set_xlabel('w') 46 | figure.set_ylabel('b') 47 | figure.plot_surface(w_sample.T, b_sample, loss, cmap='rainbow') 48 | figure.scatter3D(w_cache, b_cache, l_cache, cmap='rainbow') 49 | 50 | y_predict = w * x_data + b 51 | 52 | plt.figure(figsize=(16, 9)) 53 | plt.scatter(x_data, y_data, s=10, color='g') 54 | plt.plot(x_data, y_predict) 55 | plt.title('y=2x+3') 56 | plt.xlabel('x') 57 | plt.ylabel('y') 58 | plt.show() 59 | -------------------------------------------------------------------------------- /playground/SVM.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | data_count = 100 5 | 6 | x1_positive = np.linspace(-10, 10, data_count) 7 | x2_positive = 0.3 * x1_positive + 10 + np.random.randint(-5, 5, data_count) 8 | y_positive = np.array([1] * data_count) 9 | 10 | x1_negative = np.linspace(-10, 10, data_count) 11 | x2_negative = 0.3 * x1_negative - 10 + np.random.randint(-5, 5, data_count) 12 | y_negative = np.array([-1] * data_count) 13 | 14 | x1 = np.concatenate([x1_positive, x1_negative]) 15 | x2 = np.concatenate([x2_positive, x2_negative]) 16 | 17 | y_label = np.concatenate([y_positive, y_negative]) 18 | 19 | w1 = np.random.normal(0, 0.002) 20 | w2 = np.random.normal(0, 0.002) 21 | b = np.random.normal(0, 0.002) 22 | 23 | training_steps = 1000 24 | 25 | eta = 0.001 26 | 27 | for step in range(training_steps): 28 | # grad_w1 = np.mean((w1 * x1 + w2 * x2 + b - y_label) * x1) 29 | # grad_w2 = np.mean((w1 * x1 + w2 * x2 + b - y_label) * x2) 30 | # grad_b = np.mean(w1 * x1 + w2 * x2 + b) 31 | 32 | hinge_judge_term = y_label * (w1 * x1 + w2 * x2 + b) 33 | 34 | mask_no_grad = hinge_judge_term > 1 35 | 36 | grad_before_mean_w1 = -y_label * x1 37 | grad_before_mean_w1[mask_no_grad] = 0 38 | grad_w1 = np.mean(grad_before_mean_w1) 39 | 40 | grad_before_mean_w2 = -y_label * x2 41 | grad_before_mean_w2[mask_no_grad] = 0 42 | grad_w2 = np.mean(grad_before_mean_w2) 43 | 44 | grad_before_mean_b = -y_label * 1 45 | grad_before_mean_b[mask_no_grad] = 0 46 | grad_b = np.mean(grad_before_mean_b) 47 | 48 | w1 -= eta * grad_w1 49 | w2 -= eta * grad_w2 50 | b -= eta * grad_b 51 | 52 | plt.scatter(x1_positive, x2_positive, c='r') 53 | plt.scatter(x1_negative, x2_negative, c='b') 54 | plt.plot(x1, -(w1 * x1 + b) / w2, c='g') 55 | plt.show() -------------------------------------------------------------------------------- /utility/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | from datetime import datetime 5 | from static import LOGS_DIR 6 | from time import time 7 | 8 | DATETIME_NOW = datetime.now().strftime("%Y%m%d%H%M%S") 9 | 10 | 11 | def get_logger(model_name, mode, filename): 12 | # Make path. 13 | dir_path = os.path.join(LOGS_DIR, model_name, mode) 14 | log_path = os.path.join(dir_path, '{}-{}.log'.format(DATETIME_NOW, filename)) 15 | # Check path. 16 | if not os.path.exists(dir_path): 17 | os.makedirs(dir_path) 18 | # Get logger. 19 | logger_name = model_name + '-' + filename 20 | logger = logging.getLogger(logger_name) 21 | logger.setLevel(logging.DEBUG) 22 | logger.propagate = False 23 | # Get logger stream handler. 24 | # log_sh = logging.StreamHandler(sys.stdout) 25 | log_sh = logging.StreamHandler() 26 | # log_sh.setFormatter(logging.Formatter('[{}] {}'.format('%(asctime)s', '%(message)s'))) 27 | log_sh.setLevel(logging.WARNING) 28 | # Get logger file handler. 29 | log_fh = logging.FileHandler(log_path) 30 | log_fh.setLevel(logging.DEBUG) 31 | log_fh.setFormatter(logging.Formatter('[{}] {}'.format('%(asctime)s', '%(message)s'))) 32 | # Add handler. 33 | logger.addHandler(log_sh) 34 | logger.addHandler(log_fh) 35 | return logger 36 | 37 | 38 | class TimeInspector(object): 39 | 40 | time_marks = [] 41 | 42 | @classmethod 43 | def set_time_mark(cls): 44 | _time = time() 45 | cls.time_marks.append(_time) 46 | return _time 47 | 48 | @classmethod 49 | def pop_time_mark(cls): 50 | cls.time_marks.pop() 51 | 52 | @classmethod 53 | def log_cost_time(cls, info): 54 | cost_time = time() - cls.time_marks.pop() 55 | logging.warning('Time cost: {0:.2f} | {1}'.format(cost_time, info)) 56 | -------------------------------------------------------------------------------- /playground/A3C.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import multiprocessing as mp 4 | import tensorflow as tf 5 | import logging 6 | import gym 7 | 8 | from base.model import * 9 | from playground import PPO 10 | from utility.launcher import start_game 11 | 12 | 13 | def start_a3c(cluster, role, task_index): 14 | server = tf.train.Server(cluster, job_name=role, task_index=task_index) 15 | if role == 'ps': 16 | logging.warning('Parameter server started.') 17 | server.join() 18 | else: 19 | worker_device = "/job:worker/task:{}".format(task_index) 20 | logging.warning('Worker: {}, server stated.'.format(worker_device)) 21 | with tf.device(tf.train.replica_device_setter(cluster=cluster)): 22 | # Make env. 23 | env = gym.make('CartPole-v0') 24 | env.seed(1) 25 | env = env.unwrapped 26 | # Init session. 27 | session = tf.Session(server.target) 28 | # session = tf.Session() 29 | # Init agent. 30 | agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], **{ 31 | KEY_SESSION: session, 32 | KEY_MODEL_NAME: 'PPO', 33 | KEY_TRAIN_EPISODE: 1000 34 | }) 35 | start_game(env, agent, task_index) 36 | 37 | 38 | def main(): 39 | 40 | cluster = tf.train.ClusterSpec({ 41 | 'worker': [ 42 | 'localhost:8001', 43 | 'localhost:8002', 44 | 'localhost:8003', 45 | ], 46 | 'ps': [ 47 | 'localhost:8000' 48 | ] 49 | }) 50 | 51 | role_task_index_map = [ 52 | ('ps', 0), 53 | ('worker', 0), 54 | ('worker', 1), 55 | ('worker', 2), 56 | ] 57 | 58 | pool = mp.Pool(processes=4) 59 | 60 | for role, task_index in role_task_index_map: 61 | pool.apply_async(start_a3c, args=(cluster, role, task_index, )) 62 | pool.close() 63 | pool.join() 64 | 65 | 66 | if __name__ == '__main__': 67 | main() 68 | -------------------------------------------------------------------------------- /utility/launcher.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | 3 | 4 | def start_game(env, agent, process=None): 5 | # Train. 6 | if agent.mode == 'train': 7 | for episode in range(agent.train_episodes): 8 | s, r_episode, now = env.reset(), 0, time() 9 | while True: 10 | a = agent.predict(s) 11 | s_n, r, done, _ = env.step(a) 12 | r = r if not done else -10 13 | r_episode += r 14 | agent.snapshot(s, a, r, s_n) 15 | s = s_n 16 | if done: 17 | # Logs. 18 | if process is None: 19 | agent.logger.warning('Episode: {} | Times: {} | Rewards: {}'.format(episode, 20 | time() - now, 21 | r_episode)) 22 | else: 23 | agent.logger.warning('Process: {} | Episode: {} | Times: {} | Rewards: {}'.format(process, 24 | episode, 25 | time() - now, 26 | r_episode)) 27 | break 28 | agent.train() 29 | if episode % 50 == 0: 30 | agent.save() 31 | elif agent.mode == 'test': 32 | agent.restore() 33 | # Reset env. 34 | s, r_episode, now = env.reset(), 0, time() 35 | while True: 36 | env.render() 37 | a = agent.predict(s) 38 | s_n, r, done, _, = env.step(a) 39 | r_episode += r 40 | s = s_n 41 | if done: 42 | agent.logger.warning('Test mode, rewards: {}'.format(r_episode)) 43 | break 44 | -------------------------------------------------------------------------------- /static/checkpoints/iris/weights.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | [ 4 | -0.18678438298795857, 5 | -0.07914966000312011, 6 | 0.6342529879617897, 7 | 0.4529811249267001 8 | ], 9 | [ 10 | -0.17595482455364025, 11 | -0.08025541057908858, 12 | 0.5922657398985512, 13 | 0.4201468573646936 14 | ], 15 | [ 16 | 0.022154357970908763, 17 | -0.017635602933466894, 18 | -0.5700575957392919, 19 | -0.6128746987080891 20 | ], 21 | [ 22 | 0.03851755585545247, 23 | 0.018353598959502188, 24 | -0.2508443055444202, 25 | -0.07711620382957872 26 | ], 27 | [ 28 | -0.0056793667464683385, 29 | 0.04912562717610973, 30 | -0.3871691049577893, 31 | -0.15609163649258434 32 | ], 33 | [ 34 | -0.19662158407431712, 35 | -0.05892142132038428, 36 | 0.5860437253337684, 37 | 0.40890938162697044 38 | ], 39 | [ 40 | 0.004943411808912222, 41 | -0.07798519264342238, 42 | -0.22017001687542007, 43 | -0.48731233342831287 44 | ], 45 | [ 46 | 0.0093202618749045, 47 | -0.32682710266355375, 48 | 0.9928554361485695, 49 | 0.3044439064210716 50 | ], 51 | [ 52 | -0.5775015666759874, 53 | 0.26320637439433137, 54 | 0.3924564664686868, 55 | -0.14772811294676533 56 | ], 57 | [ 58 | 0.6372991366359084, 59 | -0.4860761904537719, 60 | -0.36112421494322827, 61 | -0.5626834691985895 62 | ] 63 | ], 64 | [ 65 | [ 66 | 0.05898039141717556, 67 | 0.04981103294277943, 68 | 0.38030504722256914, 69 | 0.3810373423764331, 70 | 0.544658904810149, 71 | -0.13771286331281743, 72 | -0.1008109944339574, 73 | 0.01788971466924072, 74 | 0.04826178711206095, 75 | -0.007446264711328485 76 | ], 77 | [ 78 | -0.42355832492827655, 79 | -0.4480373364602654, 80 | -0.5635856073580628, 81 | -0.08327523793774468, 82 | -0.21014308350300684, 83 | 0.07039977607174346, 84 | -0.31335144648137475, 85 | -0.4500461353189752, 86 | 0.31269293745199345, 87 | 0.3466045025577133 88 | ], 89 | [ 90 | 0.1681510259423427, 91 | 0.10270294053056472, 92 | 0.12078844037424748, 93 | -0.19092135131724694, 94 | -0.2582668810741086, 95 | 0.6052927859552184, 96 | 0.39822198216164345, 97 | 0.4380923861287904, 98 | -0.3326940292288691, 99 | -0.3009245703032898 100 | ] 101 | ] 102 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](/LICENSE) 2 | [![Platform](https://img.shields.io/badge/Platform-Tensorflow-orange.svg)](https://www.tensorflow.org/) 3 | [![Python](https://img.shields.io/badge/Python-3.5-green.svg)]() 4 | 5 | # Learning Notes of DRL & DL 6 | 7 | A repo of Learning notes of DRL & DL, theory, codes, models and notes maybe. 8 | 9 | # Content 10 | 11 | ## Notes 12 | 13 | ### Deep Learning Basic 14 | 15 | - [LinearRegression](/note/LinearRegression.ipynb) 16 | - [LogisticRegression](/note/LogisticRegression.ipynb) 17 | - [RegressionTree](/note/RegressionTree.ipynb) 18 | - [Support Vector Machine](/note/SVM.ipynb) 19 | - [NeuralNetwork](/note/NeuralNetwork.ipynb) 20 | 21 | ### Natural Language Processing 22 | - [Word2Vec](/note/Word2Vec.ipynb) 23 | - [GloVe](/note/GloVe.ipynb) 24 | 25 | ### Deep Reinforcement Learning 26 | 27 | - [PolicyGradient](/note/PolicyGradient.ipynb) 28 | - [DQN](/note/DQN.ipynb) 29 | - [DoubleDQN](/note/DoubleDQN.ipynb) 30 | - [PPO](/note/PPO.ipynb) 31 | - [A3C / DPPO](/note/A3C.ipynb) 32 | 33 | ### Deep Learning Engineering 34 | 35 | - [TensorFlow Serving](/note/TensorFlowServing.ipynb) 36 | 37 | ### Docker 38 | 39 | - [Docker Notes](/note/Docker.ipynb) 40 | 41 | ## Codes 42 | 43 | - [Artifical Neuron Network (ANN)](/ann/Dense.py) 44 | 45 | 46 | # Requirements 47 | - numpy 48 | - scipy 49 | - sklearn 50 | - matplotlib 51 | - tensorflow==1.8 52 | 53 | # Instructions for codes 54 | 55 | ### [Artifical Neuron Network (ANN)](/ann/Dense.py) 56 | 57 | 1. Load your data, for example, iris data set. 58 | ``` 59 | from sklearn.datasets import load_iris 60 | iris = load_iris() 61 | ``` 62 | 2. Standardize your data. 63 | ``` 64 | scaler = StandardScaler() 65 | scaler.fit(iris.data) 66 | 67 | x_data = scaler.transform(iris.data) 68 | y_data = np.zeros((150, 3)) 69 | y_data[np.arange(150), iris.target] = 1 70 | ``` 71 | 3. Initialize activations, which are configurable. 72 | ``` 73 | activation_funcs = [function.relu] * 1 74 | # activation_funcs = [function.tanh] * 1 75 | # activation_funcs = [function.sigmoid] * 1 76 | activation_funcs.append(function.linear) 77 | ``` 78 | 4. Initialize model, option parameters are configurable. 79 | ``` 80 | dense = Dense(x_space=4, y_space=3, neuron_count_list=[10], **{ 81 | "loss_func": function.softmax_cross_entropy, 82 | "activation_funcs": activation_funcs, 83 | "learning_rate": 0.01, 84 | "enable_logger": True, 85 | "model_name": 'iris', 86 | "batch_size": 30, 87 | 'model': 'train' 88 | ) 89 | ``` 90 | 5. Train or Restore & Evaluate. 91 | ``` 92 | dense.train(x_data, y_data) 93 | # dense.restore() 94 | dense.evaluate(x_data, y_data) 95 | ``` 96 | -------------------------------------------------------------------------------- /playground/NeuralNetwork.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | from sklearn.preprocessing import StandardScaler 5 | from utility import function 6 | from ann.Dense import Dense 7 | 8 | np.random.seed(135) 9 | 10 | data_count = 25 11 | 12 | x1_points = np.linspace(0, 10, data_count).reshape((-1, 1)) 13 | x2_points = np.multiply(2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 14 | 15 | x1 = np.concatenate((x1_points, x2_points), axis=1) 16 | y1 = np.array([[1, 0, 0, 0]] * data_count) 17 | 18 | x1_points = np.linspace(1, 10, data_count).reshape((-1, 1)) 19 | x2_points = np.multiply(-2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 20 | 21 | x2 = np.concatenate((x1_points, x2_points), axis=1) 22 | y2 = np.array([[0, 1, 0, 0]] * data_count) 23 | 24 | x1_points = np.linspace(-1, -10, data_count).reshape((-1, 1)) 25 | x2_points = np.multiply(2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 26 | 27 | x3 = np.concatenate((x1_points, x2_points), axis=1) 28 | y3 = np.array([[0, 0, 1, 0]] * data_count) 29 | 30 | x1_points = np.linspace(-1, -10, data_count).reshape((-1, 1)) 31 | x2_points = np.multiply(-2, x1_points) + np.random.randint(-10, 10, size=(data_count,)).reshape((-1, 1)) 32 | 33 | x4 = np.concatenate((x1_points, x2_points), axis=1) 34 | y4 = np.array([[0, 0, 0, 1]] * data_count) 35 | 36 | x_data = np.concatenate((x1, x2, x3, x4)) 37 | y_data = np.concatenate((y1, y2, y3, y4)) 38 | 39 | x_train = StandardScaler().fit_transform(x_data) 40 | y_train = y_data 41 | 42 | activation_funcs = [function.relu] * 2 43 | # activation_funcs = [function.sigmoid] * 1 44 | activation_funcs.append(function.linear) 45 | 46 | dense = Dense(x_space=2, y_space=4, hidden_units_list=[6, 6], **{ 47 | "loss_func": function.softmax_cross_entropy, 48 | "activation_funcs": activation_funcs, 49 | "learning_rate": 0.003, 50 | "enable_logger": True, 51 | "model_name": 'base', 52 | "batch_size": 100, 53 | "max_epoch": 1000, 54 | 'model': 'train', 55 | }) 56 | 57 | dense.train(x_data, y_data) 58 | # dense.restore() 59 | dense.evaluate(x_data, y_data) 60 | 61 | x1_test = np.linspace(-20, 20, 300) 62 | x2_test = np.linspace(-30, 30, 300) 63 | 64 | x1_mesh, x2_mesh = np.meshgrid(x1_test, x2_test) 65 | 66 | x_test = np.array([x1_mesh.ravel(), x2_mesh.ravel()]).T 67 | y_test = np.argmax(dense.predict(x_test), axis=1) 68 | 69 | plt.pcolormesh(x1_mesh, x2_mesh, y_test.reshape(x1_mesh.shape)) 70 | plt.scatter(x1[:, 0], x1[:, 1], marker='x') 71 | plt.scatter(x2[:, 0], x2[:, 1], marker='o') 72 | plt.scatter(x3[:, 0], x3[:, 1], marker='*') 73 | plt.scatter(x4[:, 0], x4[:, 1], marker='p') 74 | plt.show() 75 | -------------------------------------------------------------------------------- /playground/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | from mpl_toolkits.mplot3d import Axes3D 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | 5 | from utility import function 6 | 7 | data_count = 100 8 | 9 | x1_points = np.linspace(-5, 5, data_count).reshape((-1, 1)) 10 | x2_points = np.multiply(5, x1_points) + np.random.randint(-5, 50, size=(data_count,)).reshape((-1, 1)) 11 | 12 | x_positive_data = np.concatenate((x1_points, x2_points), axis=1) 13 | y_positive_data = np.array([1] * data_count) 14 | 15 | x1_points = np.linspace(-5, 5, data_count).reshape((-1, 1)) 16 | x2_points = np.multiply(5, x1_points) - np.random.randint(-5, 50, size=(data_count,)).reshape((-1, 1)) 17 | 18 | x_negative_data = np.concatenate((x1_points, x2_points), axis=1) 19 | y_negative_data = np.array([0] * data_count) 20 | 21 | x_data = np.concatenate((x_positive_data, x_negative_data)) 22 | y_data = np.concatenate((y_positive_data, y_negative_data)) 23 | 24 | sigmoid = np.vectorize(function.sigmoid) 25 | grad_sigmoid = np.vectorize(function.grad_sigmoid) 26 | 27 | w = np.array([5, 5]) 28 | 29 | loss_cache = [] 30 | 31 | for iteration in range(1000): 32 | y_product = np.dot(x_data, w.T) 33 | # 计算预测标签值 (200, 2) * (2, 1) -> (200, 1) 34 | y_positive_predict = sigmoid(y_product) 35 | y_negative_predict = 1 - y_positive_predict 36 | y_negative_predict[y_negative_predict < 1e-4] = 1e-4 37 | # 计算交叉熵 38 | cross_entropy = -np.mean(y_data * np.log(y_positive_predict) + (1 - y_data) * np.log(y_negative_predict)) 39 | # 计算梯度 40 | # grad_w = -np.mean(y_data / y_positive_predict * grad_sigmoid(y_product) * x_data.T, axis=1) 41 | grad_w = -np.mean((y_data - y_positive_predict) * x_data.T, axis=1) 42 | # 更新梯度 43 | w = w - 0.03 * grad_w 44 | # 缓存交叉熵 45 | loss_cache.append(cross_entropy) 46 | 47 | y = - np.multiply(x_data[: data_count, 0], w[0]) / w[1] 48 | # 49 | # w1_sample = np.linspace(-10, 10, 2 * data_count).reshape((-1, 1)) 50 | # w2_sample = np.linspace(-10, 10, 2 * data_count).reshape((-1, 1)) 51 | # 52 | # w_sample = np.concatenate((w1_sample, w2_sample), axis=1) 53 | # 54 | # # (200, 2) * (2, 200) -> (200 * 200) 55 | # loss = y_data * np.log(sigmoid(np.dot(x_data, w_sample.T))) 56 | # 57 | # figure = plt.figure(figsize=(16, 6)) 58 | # axes = Axes3D(figure) 59 | # axes.set_xlabel('w') 60 | # axes.set_ylabel('b') 61 | # axes.plot_surface(w1_sample.T, w2_sample, loss, cmap='rainbow') 62 | 63 | plt.figure(figsize=(16, 9)) 64 | plt.title('CrossEntropy') 65 | plt.plot(loss_cache) 66 | plt.show() 67 | 68 | plt.figure(figsize=(16, 9)) 69 | plt.plot(x_data[: data_count, 0], y) 70 | plt.scatter(x_data[:data_count, 0], x_data[:data_count, 1], s=50, color='g', marker='o') 71 | plt.scatter(x_data[data_count:, 0], x_data[data_count:, 1], s=50, color='r', marker='x') 72 | plt.xlabel('x') 73 | plt.ylabel('y') 74 | plt.show() 75 | -------------------------------------------------------------------------------- /ann/RNN.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from utility import function 4 | 5 | 6 | class RNN(object): 7 | 8 | def __init__(self, hidden_size, seq_length, x_space, y_space, **options): 9 | 10 | self.x_space = x_space 11 | self.y_space = y_space 12 | 13 | self.seq_length = seq_length 14 | 15 | self.hidden_size = hidden_size 16 | 17 | self.x_weights = np.zeros((hidden_size, x_space)) 18 | self.s_weights = np.zeros((hidden_size, y_space)) 19 | self.u_weights = np.zeros((y_space, hidden_size)) 20 | 21 | self.z_inputs = {} 22 | self.h_inputs = {} 23 | self.p_outputs = {} 24 | self.deltas = {} 25 | 26 | self._init_options(options) 27 | self._init_weights_and_biases() 28 | 29 | def _init_options(self, options): 30 | 31 | try: 32 | self.batch_size = options['batch_size'] 33 | except KeyError: 34 | self.batch_size = 16 35 | finally: 36 | if self.batch_size < 1: 37 | raise ValueError('Batch size must larger than 1.') 38 | 39 | def _init_weights_and_biases(self): 40 | self.x_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 41 | self.s_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 42 | self.u_weights[:, ] = np.random.normal(loc=0.0, scale=0.001) 43 | 44 | def _forward(self, input_batch): 45 | # Initialize s_t 46 | s_t = np.zeros((input_batch.shape[0], self.y_space)) 47 | # Forward pass. 48 | for seq_index in range(self.seq_length): 49 | # Get x_t. 50 | x_t = input_batch[:, seq_index, :] 51 | # Save dz/dw 52 | self.z_inputs[seq_index] = x_t 53 | # (batch_size, x_space) * (x_space, hidden_size) -> (batch_size, hidden_size) 54 | z_t = np.dot(x_t, self.x_weights.T) 55 | # Save dh/ds 56 | self.h_inputs[seq_index] = s_t 57 | # (batch_size, y_space) * (y_space, hidden_size) -> (batch_size, hidden_size) 58 | h_t = np.dot(s_t, self.s_weights.T) 59 | # (batch_size, hidden_size) * (hidden_size, y_space) -> (batch_size, y_space) 60 | phi_t = np.dot((z_t + h_t), self.u_weights.T) 61 | # Save da/dp 62 | self.p_outputs = phi_t 63 | # Get s_t 64 | s_t = function.tanh(phi_t) 65 | return s_t 66 | 67 | def _backward(self, error): 68 | for seq_index in range(self.seq_length)[::-1]: 69 | z_input = self.z_inputs[seq_index] 70 | h_input = self.h_inputs[seq_index] 71 | # da/dp 72 | p_output = self.p_outputs[seq_index] 73 | # dp/dz 74 | p_input = self.p_inputs[seq_index] 75 | # TODO - Implements 76 | 77 | def train(self, x_data, y_data): 78 | pass 79 | 80 | def predict(self): 81 | pass 82 | 83 | def evaluate(self): 84 | pass 85 | -------------------------------------------------------------------------------- /playground/MLP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import matplotlib.pyplot as plt 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.datasets import load_iris 6 | from base.model import * 7 | 8 | 9 | class Agent(BaseSLModel): 10 | 11 | def __init__(self, x_space, y_space, x_train, y_train, x_test, y_test, **options): 12 | super(Agent, self).__init__(x_space, y_space, x_train, y_train, x_test, y_test, **options) 13 | 14 | self._init_options(options) 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | self._init_summary_writer() 20 | 21 | self.session.run(tf.global_variables_initializer()) 22 | 23 | def _init_input(self, *args): 24 | self.x_input = tf.placeholder(tf.float32, [None, self.x_space]) 25 | self.y_input = tf.placeholder(tf.float32, [None, self.y_space]) 26 | 27 | def _init_nn(self, *args): 28 | with tf.variable_scope('MLP'): 29 | f_dense = tf.layers.dense(self.x_input, 32, tf.nn.relu) 30 | s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu) 31 | y_predict = tf.layers.dense(s_dense, self.y_space) 32 | self.y_predict = y_predict 33 | 34 | def _init_op(self): 35 | with tf.variable_scope('loss_func'): 36 | # self.loss_func = tf.reduce_mean(tf.square(self.y_input - self.y_predict) * tf.abs(self.y_predict)) 37 | # self.loss_func = tf.reduce_mean(tf.square(self.y_input - self.y_predict) * tf.square(self.y_input)) 38 | self.loss_func = tf.losses.mean_squared_error(self.y_input, self.y_predict) 39 | tf.summary.scalar('mse', self.loss_func) 40 | with tf.variable_scope('optimizer'): 41 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 42 | 43 | def train(self): 44 | # Get data size. 45 | data_size = len(self.x_train) 46 | for train_step in range(30000): 47 | # Get mini batch. 48 | # indices = np.random.choice(data_size, size=self.batch_size) 49 | # x_batch = self.x_train[indices] 50 | # y_batch = self.y_train[indices] 51 | x_batch = self.x_train 52 | y_batch = self.y_train 53 | # Train op. 54 | ops = [self.optimizer, self.loss_func] 55 | if train_step % 500 == 0: 56 | ops.append(self.merged_summary_op) 57 | # Train. 58 | results = self.session.run(ops, { 59 | self.x_input: x_batch, 60 | self.y_input: y_batch, 61 | }) 62 | # Add summary. 63 | if train_step % 500 == 0: 64 | self.summary_writer.add_summary(results[-1], global_step=self.training_step) 65 | # Log loss. 66 | if train_step % 10 == 0: 67 | self.save() 68 | self.logger.warning('Step: {0}, Training loss: {1:.10f}'.format(train_step, results[1])) 69 | self.evaluate() 70 | self.training_step += 1 71 | 72 | def predict(self, s): 73 | y_predict = self.session.run(self.y_predict, {self.x_input: s}) 74 | return y_predict 75 | 76 | def evaluate(self): 77 | y_predict, loss = self.session.run([self.y_predict, self.loss_func], { 78 | self.x_input: self.x_test, 79 | self.y_input: self.y_test 80 | }) 81 | 82 | self.logger.warning('Step: {0}, Testing loss: {1:.10f}'.format(self.training_step, loss)) 83 | 84 | 85 | if __name__ == '__main__': 86 | 87 | x_train = np.linspace(-np.pi, np.pi, num=200).reshape((-1, 1)) + np.random.normal() 88 | y_train = np.sin(x_train) 89 | 90 | x_test = np.linspace(-np.pi, np.pi, num=50).reshape((-1, 1)) 91 | y_test = np.sin(x_test) 92 | 93 | agent = Agent(x_train[0].shape[0], 94 | 1, 95 | x_train, 96 | y_train, 97 | x_test, 98 | y_test) 99 | 100 | agent.train() 101 | -------------------------------------------------------------------------------- /playground/RegressionTree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Node(object): 5 | def __init__(self, i, j, c1, c2, l_node=None, r_node=None): 6 | self.i = i 7 | self.j = j 8 | self.c1 = c1 9 | self.c2 = c2 10 | self.offset = 0 11 | self.l_node = l_node 12 | self.r_node = r_node 13 | 14 | 15 | class RegressionTree(object): 16 | 17 | def __init__(self): 18 | self._tree = None 19 | self.x_data = None 20 | self.y_data = None 21 | self.num_nodes = 0 22 | 23 | def fit(self, x, y, max_depth=3): 24 | self.x_data = x 25 | self.y_data = y 26 | # Calculate nodes. 27 | self.num_nodes = 2 ** max_depth - 1 28 | # Init root node. 29 | root_node = self.make_node(x, y) 30 | 31 | def _fit(_x, _y, _node): 32 | 33 | if self.num_nodes <= 0: 34 | return 35 | 36 | # Make R. 37 | x_r1, y_r1 = _x[:_node.i], _y[:_node.i] 38 | x_r2, y_r2 = _x[_node.i:], _y[_node.i:] 39 | 40 | # Make left node. 41 | l_node = self.make_node(x_r1, y_r1) 42 | _node.l_node = l_node 43 | 44 | self.num_nodes -= 1 45 | 46 | if _node.l_node: 47 | # Update offset. 48 | l_node.offset = _node.offset 49 | _fit(x_r1, y_r1, _node.l_node) 50 | 51 | # Make right node. 52 | r_node = self.make_node(x_r2, y_r2) 53 | _node.r_node = r_node 54 | 55 | self.num_nodes -= 1 56 | 57 | if _node.r_node: 58 | # Update offset. 59 | r_node.offset = _node.i + _node.offset 60 | _fit(x_r2, y_r2, _node.r_node) 61 | 62 | _fit(x, y, root_node) 63 | 64 | self._tree = root_node 65 | 66 | def predict(self, x): 67 | 68 | node = self._tree 69 | 70 | def _predict(_x, _node): 71 | 72 | val = self.x_data[_node.i + _node.offset, _node.j] 73 | 74 | if _x[_node.j] < val: 75 | if _node.l_node: 76 | return _predict(_x, _node.l_node) 77 | else: 78 | return _node.c1 79 | else: 80 | if _node.r_node: 81 | return _predict(_x, _node.r_node) 82 | else: 83 | return _node.c2 84 | 85 | return _predict(x, node) 86 | 87 | @staticmethod 88 | def make_node(x, y): 89 | # Get shape. 90 | rows, cols = x.shape 91 | if rows <= 1: 92 | return None 93 | # Init params. 94 | best_i, best_j = 1, 1 95 | best_c1, best_c2 = 0, 0 96 | best_loss = np.inf 97 | # Find best split. 98 | for i in range(1, rows): 99 | for j in range(0, cols): 100 | # Calculate c1, c2, loss. 101 | c1 = np.mean(y[:i]) 102 | c2 = np.mean(y[i:]) 103 | loss = np.mean(y[:i] - c1) + np.mean(y[i:] - c2) 104 | # Update best if need. 105 | if loss < best_loss: 106 | best_loss = loss 107 | best_i = i 108 | best_j = j 109 | best_c1 = c1 110 | best_c2 = c2 111 | node = Node(best_i, best_j, best_c1, best_c2) 112 | return node 113 | 114 | 115 | data_x = np.linspace(-10, 10, 20).reshape((-1, 1)) 116 | # data_y = np.linspace(-20, 20, 100) + np.random.normal(loc=0, scale=3.5, size=(100, )) 117 | data_y = 2 * data_x 118 | 119 | # data_x = np.array([-4, -3, -2, -1, 0, 1, 2, 3, 4]).reshape((-1, 1)) 120 | # data_y = np.array([-8, -6, -4, -2, 0, 2, 4, 6, 8]) 121 | 122 | 123 | t = RegressionTree() 124 | t.fit(data_x, data_y, max_depth=3) 125 | # print(t.predict([-4])) 126 | # print(t.predict([1])) 127 | # print(t.predict([2])) 128 | # print(t.predict([3])) 129 | # print(t.predict([4])) 130 | # print(t.predict([20])) 131 | print(t.predict([4.])) 132 | # print([t.predict(data_x[i, :].reshape((-1, ))) for i in range(0, 100)]) 133 | -------------------------------------------------------------------------------- /playground/PolicyGradient.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | self._init_summary_writer() 20 | 21 | self.a_buffer, self.s_buffer, self.r_buffer = [], [], [] 22 | 23 | self.session.run(tf.global_variables_initializer()) 24 | 25 | def _init_input(self, *args): 26 | with tf.variable_scope('input'): 27 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 28 | self.a = tf.placeholder(tf.int32, [None, ]) 29 | self.r = tf.placeholder(tf.float32, [None, ]) 30 | # Add summary. 31 | tf.summary.histogram('rewards', self.r) 32 | 33 | def _init_nn(self, *args): 34 | with tf.variable_scope('actor_net'): 35 | # Kernel initializer. 36 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 37 | # First dense. 38 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer) 39 | # Second dense. 40 | s_dense = tf.layers.dense(f_dense, 64, tf.nn.relu, kernel_initializer=w_initializer) 41 | # Action logits. 42 | self.a_logits = tf.layers.dense(s_dense, self.a_space, kernel_initializer=w_initializer) 43 | # Action prob.Î 44 | self.a_prob = tf.nn.softmax(self.a_logits) 45 | 46 | def _init_op(self): 47 | with tf.variable_scope('loss_func'): 48 | # one hot a. 49 | a_one_hot = tf.one_hot(self.a, self.a_space) 50 | # cross entropy. 51 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits) 52 | # loss func. 53 | self.loss_func = tf.reduce_mean(cross_entropy * self.r) 54 | # add summary. 55 | tf.summary.scalar('r_cross_entropy', self.loss_func) 56 | with tf.variable_scope('optimizer'): 57 | self.global_step = tf.Variable(initial_value=0) 58 | self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 59 | 60 | def predict(self, s): 61 | a_prob = self.session.run(self.a_prob, {self.s: [s]}) 62 | if self.mode == 'train': 63 | return np.random.choice(range(a_prob.shape[1]), p=a_prob.ravel()) 64 | else: 65 | return np.argmax(a_prob) 66 | 67 | def snapshot(self, s, a, r, _): 68 | self.a_buffer.append(a) 69 | self.s_buffer.append(s) 70 | self.r_buffer.append(r) 71 | 72 | def train(self): 73 | # Copy r_buffer 74 | r_buffer = self.r_buffer 75 | # Init r_tau 76 | r_tau = 0 77 | # Calculate r_tau 78 | for index in reversed(range(0, len(r_buffer))): 79 | r_tau = r_tau * self.gamma + r_buffer[index] 80 | self.r_buffer[index] = r_tau 81 | # Make ops. 82 | ops = [self.optimizer, self.loss_func] 83 | if self.training_step % 5 == 0: 84 | ops.append(self.merged_summary_op) 85 | # Minimize loss. 86 | results = self.session.run(ops, { 87 | self.s: self.s_buffer, 88 | self.a: self.a_buffer, 89 | self.r: self.r_buffer 90 | }) 91 | 92 | if self.training_step % 10 == 0: 93 | self.summary_writer.add_summary(results[-1], global_step=self.training_step) 94 | 95 | self.training_step += 1 96 | 97 | self.s_buffer, self.a_buffer, self.r_buffer = [], [], [] 98 | 99 | 100 | def main(_): 101 | # Make env. 102 | env = gym.make('CartPole-v0') 103 | env.seed(1) 104 | env = env.unwrapped 105 | # Init agent. 106 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 107 | KEY_MODEL_NAME: 'PolicyGradient', 108 | KEY_TRAIN_EPISODE: 10000 109 | }) 110 | start_game(env, agent) 111 | 112 | 113 | if __name__ == '__main__': 114 | tf.app.run() 115 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/DQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个智能体(agent),尽可能不断地左右移动小车,使得小车上的杆不倒,我们首先定义CartPole游戏:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "CartPole游戏即是强化学习模型的enviorment,它与agent交互,实时更新state,内部定义了reward function,其中state有以下定义:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "state每一个维度分别代表了:\n", 31 | "\n", 32 | "- 小车位置,它的取值范围是-2.4到2.4\n", 33 | "- 小车速度,它的取值范围是负无穷到正无穷\n", 34 | "- 杆的角度,它的取值范围是-41.8°到41.8°\n", 35 | "- 杆的角速,它的取值范围是负无穷到正无穷" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "action是一个2维向量,每一个维度分别代表向左和向右移动。" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "$$\n", 50 | "action \\in \\mathbb{R}^2\n", 51 | "$$" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# DQN" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "我们将设计一个网络,作为状态-动作值函数(state-action value function),其输入是state,输出是对应各个action的value,并TD(Temporal Difference)进行迭代训练直至收敛。我们将定义两个这样的网络,分别记作$\\theta$和$\\theta^-$,分别代表估计网络与目标网络。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "我们希望最小化:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "$$\n", 80 | "\\left( y_j - Q \\left( \\phi_j, a_j; \\theta \\right) \\right)^2\n", 81 | "$$" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "其中,$a_j$具有以下形式:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "$$\n", 96 | "a_j = \\mathrm{argmax}_{a} Q \\left( \\phi(s_j), a; \\theta\\right)\n", 97 | "$$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "其中,$y_j$具有以下形式:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "$$\n", 112 | "f(x)=\n", 113 | "\\begin{cases}\n", 114 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 115 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( \\phi_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 116 | "\\end{cases}$$\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "在最小化TD-Error时,我们将固定目标网络,只对估计网络做梯度反向传播,每次到达一定迭代次数后,将估计网络的权重复制到目标网络。在这个过程中,需要用到经验回放(Experience Replay)技术,即将每一次迭代观测到的$s_t, r_t, a_t, s_{t+1}$作为一个元组缓存,然后在这些缓存中随机抽取元组做批次梯度下降。" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# 代码实现" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 1, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 144 | " return f(*args, **kwds)\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# coding=utf-8\n", 150 | "\n", 151 | "import tensorflow as tf\n", 152 | "import numpy as np\n", 153 | "import gym\n" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "Python 3", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.5.4" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 1 185 | } 186 | -------------------------------------------------------------------------------- /playground/DQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | tf.set_random_seed(7) 10 | 11 | 12 | class Agent(BaseRLModel): 13 | 14 | def __init__(self, a_space, s_space, **options): 15 | super(Agent, self).__init__(a_space, s_space, **options) 16 | 17 | self._init_input() 18 | self._init_nn() 19 | self._init_op() 20 | self._init_saver() 21 | 22 | self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space)) 23 | self.buffer_count = 0 24 | 25 | self.update_target_net_step = 200 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s_n = tf.placeholder(tf.float32, [None, self.s_space]) 30 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 31 | self.r = tf.placeholder(tf.float32, [None, ]) 32 | self.a = tf.placeholder(tf.int32, [None, ]) 33 | 34 | def _init_nn(self, *args): 35 | # w,b initializer 36 | w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.00003) 37 | b_initializer = tf.constant_initializer(0.1) 38 | 39 | with tf.variable_scope('predict_q_net'): 40 | phi_state = tf.layers.dense(self.s, 41 | 64, 42 | tf.nn.relu, 43 | kernel_initializer=w_initializer, 44 | bias_initializer=b_initializer) 45 | 46 | self.q_predict = tf.layers.dense(phi_state, 47 | self.a_space, 48 | kernel_initializer=w_initializer, 49 | bias_initializer=b_initializer) 50 | 51 | with tf.variable_scope('target_q_net'): 52 | phi_state_next = tf.layers.dense(self.s_n, 53 | 64, 54 | tf.nn.relu, 55 | kernel_initializer=w_initializer, 56 | bias_initializer=b_initializer) 57 | 58 | self.q_target = tf.layers.dense(phi_state_next, 59 | self.a_space, 60 | kernel_initializer=w_initializer, 61 | bias_initializer=b_initializer) 62 | 63 | def _init_op(self): 64 | with tf.variable_scope('q_real'): 65 | # size of q_value_real is [BATCH_SIZE, 1] 66 | max_q_value = tf.reduce_max(self.q_target, axis=1) 67 | q_next = self.r + self.gamma * max_q_value 68 | self.q_next = tf.stop_gradient(q_next) 69 | 70 | with tf.variable_scope('q_predict'): 71 | # size of q_value_predict is [BATCH_SIZE, 1] 72 | action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) 73 | self.q_eval = tf.gather_nd(self.q_predict, action_indices) 74 | 75 | with tf.variable_scope('loss'): 76 | self.loss_func = tf.reduce_mean(tf.squared_difference(self.q_next, self.q_eval, name='mse')) 77 | 78 | with tf.variable_scope('train'): 79 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 80 | 81 | with tf.variable_scope('update_target_net'): 82 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net') 83 | p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net') 84 | self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)] 85 | 86 | self.session.run(tf.global_variables_initializer()) 87 | 88 | def predict(self, s): 89 | if np.random.uniform() < self.epsilon: 90 | a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]})) 91 | else: 92 | a = np.random.randint(0, self.a_space) 93 | return a 94 | 95 | def snapshot(self, s, a, r, s_n): 96 | self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n)) 97 | self.buffer_count += 1 98 | 99 | def train(self): 100 | for train_step in range(self.train_steps): 101 | # Update target net if need. 102 | if self.training_step % self.update_target_net_step == 0: 103 | self.session.run(self.update_q_net) 104 | # Get batch. 105 | if self.buffer_count < self.batch_size: 106 | batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :] 107 | else: 108 | batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :] 109 | 110 | s = batch[:, :self.s_space] 111 | s_n = batch[:, -self.s_space:] 112 | a = batch[:, self.s_space].reshape((-1)) 113 | r = batch[:, self.s_space + 1] 114 | 115 | _, cost = self.session.run([self.train_op, self.loss_func], { 116 | self.s: s, self.a: a, self.r: r * 5, self.s_n: s_n 117 | }) 118 | 119 | self.training_step += 1 120 | 121 | 122 | def main(_): 123 | # Make env. 124 | env = gym.make('CartPole-v0') 125 | env.seed(1) 126 | env = env.unwrapped 127 | # Init session. 128 | # Init agent. 129 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 130 | KEY_MODEL_NAME: 'DQN', 131 | KEY_TRAIN_EPISODE: 500 132 | }) 133 | start_game(env, agent) 134 | 135 | 136 | if __name__ == '__main__': 137 | tf.app.run() 138 | -------------------------------------------------------------------------------- /playground/PPO.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | 20 | self.a_buffer = [] 21 | self.s_buffer = [] 22 | self.r_buffer = [] 23 | self.a_p_r_buffer = [] 24 | 25 | self.session.run(tf.global_variables_initializer()) 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s = tf.placeholder(tf.float32, [None, self.s_space], name='s') 30 | self.a = tf.placeholder(tf.int32, [None, ], name='a') 31 | self.r = tf.placeholder(tf.float32, [None, ], name='r') 32 | self.adv = tf.placeholder(tf.float32, [None, ], name='adv') 33 | self.a_p_r = tf.placeholder(tf.float32, [None, ], name='a_p_r') 34 | 35 | def _init_nn(self, *args): 36 | self.advantage, self.value = self._init_critic_net('critic_net') 37 | self.a_prob_eval, self.a_logits_eval = self._init_actor_net('eval_actor_net') 38 | self.a_prob_target, self.a_logits_target = self._init_actor_net('target_actor_net', trainable=False) 39 | 40 | def _init_op(self): 41 | with tf.variable_scope('critic_loss_func'): 42 | # loss func. 43 | self.c_loss_func = tf.losses.mean_squared_error(labels=self.r, predictions=self.value) 44 | with tf.variable_scope('critic_optimizer'): 45 | # critic optimizer. 46 | self.c_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.c_loss_func) 47 | with tf.variable_scope('update_target_actor_net'): 48 | # Get eval w, b. 49 | params_e = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_actor_net') 50 | params_t = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor_net') 51 | self.update_target_a_op = [tf.assign(t, e) for t, e in zip(params_t, params_e)] 52 | with tf.variable_scope('actor_loss_func'): 53 | # one hot a. 54 | a_one_hot = tf.one_hot(self.a, self.a_space) 55 | # Clip a_p_r. 56 | a_p_r = tf.clip_by_value(self.a_p_r, 1 - self.epsilon, 1 + self.epsilon) 57 | # cross entropy. 58 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits_eval) 59 | # loss func. 60 | self.a_loss_func = tf.reduce_mean(cross_entropy * a_p_r * self.adv) 61 | with tf.variable_scope('actor_optimizer'): 62 | self.a_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.a_loss_func) 63 | 64 | def _init_actor_net(self, scope, trainable=True): 65 | with tf.variable_scope(scope): 66 | # Kernel initializer. 67 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 68 | # First dense. 69 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer) 70 | # Second dense. 71 | s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer) 72 | # Action logits. 73 | a_logits = tf.layers.dense(s_dense, self.a_space, trainable=trainable, kernel_initializer=w_initializer) 74 | # Action prob. 75 | a_prob = tf.nn.softmax(a_logits) 76 | return a_prob, a_logits 77 | 78 | def _init_critic_net(self, scope): 79 | with tf.variable_scope(scope): 80 | # Kernel initializer. 81 | w_initializer = tf.random_normal_initializer(0.0, 0.01) 82 | # First dense. 83 | f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer) 84 | # Value. 85 | value = tf.layers.dense(f_dense, 1) 86 | value = tf.reshape(value, [-1, ]) 87 | # Advantage. 88 | advantage = self.r - value 89 | return advantage, value 90 | 91 | def predict(self, s): 92 | # Calculate a eval prob. 93 | a_prob_eval, a_prob_target = self.session.run([self.a_prob_eval, self.a_prob_target], {self.s: [s]}) 94 | # Calculate action prob ratio between eval and target. 95 | a_p_r = np.max(a_prob_eval) / np.max(a_prob_target) 96 | self.a_p_r_buffer.append(a_p_r) 97 | return np.random.choice(range(a_prob_eval.shape[1]), p=a_prob_eval.ravel()) 98 | 99 | def snapshot(self, s, a, r, _): 100 | self.a_buffer.append(a) 101 | self.s_buffer.append(s) 102 | self.r_buffer.append(r) 103 | 104 | def train(self): 105 | self.session.run(self.update_target_a_op) 106 | # Copy r_buffer 107 | r_buffer = self.r_buffer 108 | # Init r_tau 109 | r_tau = 0 110 | # Calculate r_tau 111 | for index in reversed(range(0, len(r_buffer))): 112 | r_tau = r_tau * self.gamma + r_buffer[index] 113 | self.r_buffer[index] = r_tau 114 | # Calculate adv. 115 | adv_buffer = self.session.run(self.advantage, {self.s: self.s_buffer, self.r: self.r_buffer}) 116 | # Minimize loss. 117 | self.session.run([self.a_optimizer, self.c_optimizer], { 118 | self.adv: adv_buffer, 119 | self.s: self.s_buffer, 120 | self.a: self.a_buffer, 121 | self.r: self.r_buffer, 122 | self.a_p_r: self.a_p_r_buffer, 123 | }) 124 | self.s_buffer = [] 125 | self.a_buffer = [] 126 | self.r_buffer = [] 127 | self.a_p_r_buffer = [] 128 | 129 | 130 | def main(_): 131 | # Make env. 132 | env = gym.make('CartPole-v0') 133 | env.seed(1) 134 | env = env.unwrapped 135 | # Init agent. 136 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 137 | # KEY_MODE: 'test', 138 | KEY_MODEL_NAME: 'PPO', 139 | KEY_TRAIN_EPISODE: 10000 140 | }) 141 | start_game(env, agent) 142 | 143 | 144 | if __name__ == '__main__': 145 | tf.app.run() 146 | -------------------------------------------------------------------------------- /base/model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import tensorflow as tf 4 | 5 | from abc import abstractmethod 6 | from utility.logger import * 7 | from static import * 8 | 9 | KEY_TRAIN_EPISODE = 'train_episodes' 10 | KEY_LEARNING_RATE = 'learning_rate' 11 | KEY_ENABLE_EAGER = 'enable_eager' 12 | KEY_SAVE_EPISODE = 'save_episode' 13 | KEY_EVAL_EPISODE = 'eval_episode' 14 | KEY_BUFFER_SIZE = 'buffer_size' 15 | KEY_TRAIN_STEPS = 'train_steps' 16 | KEY_MODEL_NAME = 'model_name' 17 | KEY_BATCH_SIZE = 'batch_size' 18 | KEY_SEQ_LENGTH = 'seq_length' 19 | KEY_SAVE_DIR = 'save_dir' 20 | KEY_SESSION = 'session' 21 | KET_EPSILON = 'epsilon' 22 | KEY_GAMMA = 'gamma' 23 | KEY_MODE = 'mode' 24 | KEY_TAU = 'tau' 25 | 26 | 27 | class BaseModel(object): 28 | 29 | def __init__(self, **options): 30 | # Init vars. 31 | self.mode = 'train' 32 | self.save_dir = None 33 | self.training_step = 0 34 | self.checkpoint_path = None 35 | # Init parameters. 36 | self._init_options(options) 37 | self._init_logger() 38 | 39 | def _init_logger(self): 40 | self.logger = get_logger(self.model_name, self.mode, 'algo') 41 | 42 | def _init_saver(self): 43 | save_dir = os.path.join(CHECKPOINTS_DIR, self.model_name) 44 | if not os.path.exists(save_dir): 45 | os.makedirs(save_dir) 46 | self.checkpoint_path = os.path.join(CHECKPOINTS_DIR, self.model_name, save_dir, 'ckpt') 47 | self.saver = tf.train.Saver() 48 | 49 | def _init_summary_writer(self): 50 | self.summary_path = os.path.join(SUMMARIES_DIR, self.model_name, DATETIME_NOW) 51 | self.summary_writer = tf.summary.FileWriter(self.summary_path, graph=self.session.graph) 52 | self.merged_summary_op = tf.summary.merge_all() 53 | 54 | def _init_options(self, options): 55 | 56 | try: 57 | self.enable_eager = options[KEY_ENABLE_EAGER] 58 | except KeyError: 59 | self.enable_eager = False 60 | 61 | try: 62 | self.session = options[KEY_SESSION] 63 | except KeyError: 64 | if not self.enable_eager: 65 | self.session = tf.Session() 66 | 67 | try: 68 | self.model_name = options[KEY_MODEL_NAME] 69 | except KeyError: 70 | self.model_name = 'model' 71 | 72 | try: 73 | self.mode = options[KEY_MODE] 74 | except KeyError: 75 | self.mode = 'train' 76 | 77 | try: 78 | self.learning_rate = options[KEY_LEARNING_RATE] 79 | except KeyError: 80 | self.learning_rate = 0.001 81 | 82 | try: 83 | self.batch_size = options[KEY_BATCH_SIZE] 84 | except KeyError: 85 | self.batch_size = 128 86 | 87 | try: 88 | self.seq_length = options[KEY_SEQ_LENGTH] 89 | except KeyError: 90 | self.seq_length = 5 91 | 92 | def save(self): 93 | # Save checkpoint. 94 | self.saver.save(self.session, self.checkpoint_path) 95 | self.logger.warning("Saver reach checkpoint.") 96 | 97 | def restore(self): 98 | self.saver.restore(self.session, self.checkpoint_path) 99 | 100 | 101 | class BaseRLModel(BaseModel): 102 | 103 | def __init__(self, a_space, s_space, **options): 104 | super(BaseRLModel, self).__init__(**options) 105 | # Init spaces. 106 | self.a_space, self.s_space = a_space, s_space 107 | # Init buffer count. 108 | self.buffer_count = 0 109 | 110 | def _init_options(self, options): 111 | super(BaseRLModel, self)._init_options(options) 112 | 113 | try: 114 | self.train_episodes = options[KEY_TRAIN_EPISODE] 115 | except KeyError: 116 | self.train_episodes = 1000 117 | 118 | try: 119 | self.train_steps = options[KEY_TRAIN_STEPS] 120 | except KeyError: 121 | self.train_steps = 1000 122 | 123 | try: 124 | self.eval_episodes = options[KEY_EVAL_EPISODE] 125 | except KeyError: 126 | self.eval_episodes = 1 127 | 128 | try: 129 | self.gamma = options[KEY_GAMMA] 130 | except KeyError: 131 | self.gamma = 0.95 132 | 133 | try: 134 | self.tau = options[KEY_TAU] 135 | except KeyError: 136 | self.tau = 0.01 137 | 138 | try: 139 | self.epsilon = options[KET_EPSILON] 140 | except KeyError: 141 | self.epsilon = 0.9 142 | 143 | try: 144 | self.buffer_size = options[KEY_BUFFER_SIZE] 145 | except KeyError: 146 | self.buffer_size = 500 147 | 148 | try: 149 | self.save_episode = options[KEY_SAVE_EPISODE] 150 | except KeyError: 151 | self.save_episode = 50 152 | 153 | @abstractmethod 154 | def _init_input(self, *args): 155 | pass 156 | 157 | @abstractmethod 158 | def _init_nn(self, *args): 159 | pass 160 | 161 | @abstractmethod 162 | def _init_op(self): 163 | pass 164 | 165 | @abstractmethod 166 | def train(self): 167 | pass 168 | 169 | @abstractmethod 170 | def predict(self, s): 171 | pass 172 | 173 | @abstractmethod 174 | def snapshot(self, s, a, r, s_n): 175 | pass 176 | 177 | 178 | class BaseSLModel(BaseModel): 179 | 180 | def __init__(self, x_space, y_space, x_train, y_train, x_test, y_test, **options): 181 | super(BaseSLModel, self).__init__(**options) 182 | self.x_train, self.y_train = x_train, y_train 183 | self.x_test, self.y_test = x_test, y_test 184 | self.x_space, self.y_space = x_space, y_space 185 | 186 | def _init_options(self, options): 187 | super(BaseSLModel, self)._init_options(options) 188 | 189 | @abstractmethod 190 | def _init_input(self, *args): 191 | pass 192 | 193 | @abstractmethod 194 | def _init_nn(self, *args): 195 | pass 196 | 197 | @abstractmethod 198 | def _init_op(self): 199 | pass 200 | 201 | @abstractmethod 202 | def train(self, *args): 203 | pass 204 | 205 | @abstractmethod 206 | def predict(self, s): 207 | pass 208 | 209 | @abstractmethod 210 | def evaluate(self, *args): 211 | pass 212 | 213 | -------------------------------------------------------------------------------- /playground/DoubleDQN.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import gym 5 | 6 | from base.model import * 7 | from utility.launcher import start_game 8 | 9 | 10 | class Agent(BaseRLModel): 11 | 12 | def __init__(self, a_space, s_space, **options): 13 | super(Agent, self).__init__(a_space, s_space, **options) 14 | 15 | self._init_input() 16 | self._init_nn() 17 | self._init_op() 18 | self._init_saver() 19 | 20 | self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space)) 21 | self.buffer_count = 0 22 | 23 | self.update_target_net_step = 200 24 | 25 | self.session.run(tf.global_variables_initializer()) 26 | 27 | def _init_input(self, *args): 28 | with tf.variable_scope('input'): 29 | self.s_n = tf.placeholder(tf.float32, [None, self.s_space]) 30 | self.s = tf.placeholder(tf.float32, [None, self.s_space]) 31 | self.q_n = tf.placeholder(tf.float32, [None, ]) 32 | self.r = tf.placeholder(tf.float32, [None, ]) 33 | self.a = tf.placeholder(tf.int32, [None, ]) 34 | 35 | def _init_nn(self, *args): 36 | # w,b initializer 37 | w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3) 38 | b_initializer = tf.constant_initializer(0.1) 39 | 40 | with tf.variable_scope('predict_q_net'): 41 | phi_state = tf.layers.dense(self.s, 42 | 64, 43 | tf.nn.relu, 44 | kernel_initializer=w_initializer, 45 | bias_initializer=b_initializer) 46 | 47 | self.q_predict = tf.layers.dense(phi_state, 48 | self.a_space, 49 | kernel_initializer=w_initializer, 50 | bias_initializer=b_initializer) 51 | 52 | with tf.variable_scope('target_q_net'): 53 | phi_state_next = tf.layers.dense(self.s_n, 54 | 64, 55 | tf.nn.relu, 56 | kernel_initializer=w_initializer, 57 | bias_initializer=b_initializer, 58 | trainable=False) 59 | 60 | self.q_target = tf.layers.dense(phi_state_next, 61 | self.a_space, 62 | kernel_initializer=w_initializer, 63 | bias_initializer=b_initializer, 64 | trainable=False) 65 | 66 | def _init_op(self): 67 | 68 | with tf.variable_scope('q_predict'): 69 | # size of q_value_predict is [BATCH_SIZE, 1] 70 | action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1) 71 | self.q_eval = tf.gather_nd(self.q_predict, action_indices) 72 | 73 | with tf.variable_scope('loss'): 74 | self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval) 75 | 76 | with tf.variable_scope('train'): 77 | self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func) 78 | 79 | with tf.variable_scope('update_target_net'): 80 | t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net') 81 | p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net') 82 | self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)] 83 | 84 | def predict(self, s): 85 | if np.random.uniform() < self.epsilon or self.mode == 'test': 86 | a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]})) 87 | else: 88 | a = np.random.randint(0, self.a_space) 89 | return a 90 | 91 | def snapshot(self, s, a, r, s_n): 92 | self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n)) 93 | self.buffer_count += 1 94 | 95 | def train(self): 96 | 97 | for train_step in range(self.train_steps): 98 | # Update target net if need. 99 | if self.training_step % self.update_target_net_step == 0: 100 | self.session.run(self.update_q_net) 101 | # Get batch. 102 | if self.buffer_count < self.buffer_size: 103 | batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :] 104 | else: 105 | batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :] 106 | 107 | s = batch[:, :self.s_space] 108 | s_n = batch[:, -self.s_space:] 109 | a = batch[:, self.s_space].reshape((-1)) 110 | r = batch[:, self.s_space + 1] 111 | 112 | # 1. Calculate q_next_predict and q_next_target. 113 | q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], { 114 | self.s: s_n, self.s_n: s_n 115 | }) 116 | 117 | # 2. Select a_indices in q_next_predict. 118 | a_indices = np.argmax(q_next_predict, axis=1) 119 | 120 | # 3. Select Q values with a_indices 121 | q_next = q_next_target[np.arange(0, self.batch_size), a_indices] 122 | 123 | # 4. Calculate q_real. 124 | q_real = r + self.gamma * q_next 125 | 126 | _, cost = self.session.run([self.train_op, self.loss_func], { 127 | self.s: s, self.a: a, self.q_n: q_real 128 | }) 129 | 130 | self.training_step += 1 131 | 132 | 133 | if __name__ == '__main__': 134 | 135 | def main(_): 136 | # Make env. 137 | env = gym.make('CartPole-v0') 138 | env.seed(1) 139 | env = env.unwrapped 140 | # Init agent. 141 | agent = Agent(env.action_space.n, env.observation_space.shape[0], **{ 142 | KEY_MODEL_NAME: 'PPO', 143 | KEY_TRAIN_EPISODE: 10000 144 | }) 145 | start_game(env, agent) 146 | 147 | 148 | if __name__ == '__main__': 149 | tf.app.run() 150 | -------------------------------------------------------------------------------- /playground/TensorFlowServing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | import requests 6 | import logging 7 | import shutil 8 | import os 9 | 10 | from static import CHECKPOINTS_DIR 11 | 12 | from grpc.beta import implementations 13 | from tensorflow_serving.apis import predict_pb2 14 | from tensorflow_serving.apis import prediction_service_pb2 15 | 16 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 17 | 18 | data_save_dir = os.path.join(CHECKPOINTS_DIR, 'TensorFlowServing') 19 | graph_save_dir = os.path.join(CHECKPOINTS_DIR, 'TensorFlowServing', 'graph') 20 | 21 | # x_test = np.linspace(2 * -np.pi, 2 * np.pi, num=100).reshape((-1, 1)) 22 | # y_test = np.sin(x_test) 23 | 24 | # x_train = x_test + np.random.normal(0.3, 0.003) 25 | # y_train = np.sin(x_train) + np.random.normal(0.0, 0.00003) 26 | 27 | # x_train = x_train.astype(np.float32) 28 | # y_train = y_train.astype(np.float32) 29 | 30 | x_train = np.load(os.path.join(data_save_dir, 'x_train.npy')).astype(np.float32) 31 | y_train = np.load(os.path.join(data_save_dir, 'y_train.npy')).astype(np.float32) 32 | 33 | np.save(os.path.join(data_save_dir, 'x_train.npy'), x_train) 34 | np.save(os.path.join(data_save_dir, 'y_train.npy'), y_train) 35 | 36 | 37 | def train(): 38 | 39 | session = tf.Session() 40 | 41 | x_input = tf.placeholder(tf.float32, [None, 1], name='x_input') 42 | y_input = tf.placeholder(tf.float32, [None, 1], name='y_input') 43 | 44 | fc1 = tf.layers.dense(x_input, 10, tf.nn.relu) 45 | fc2 = tf.layers.dense(fc1, 10, tf.nn.relu) 46 | 47 | y_predict = tf.layers.dense(fc2, 1) 48 | 49 | loss_func = tf.losses.mean_squared_error(labels=y_input, predictions=y_predict) 50 | 51 | optimizer = tf.train.AdamOptimizer().minimize(loss_func) 52 | 53 | session.run(tf.global_variables_initializer()) 54 | 55 | signature = tf.saved_model.signature_def_utils.build_signature_def( 56 | inputs={ 57 | 'x_input': tf.saved_model.utils.build_tensor_info(x_input), 58 | 'y_input': tf.saved_model.utils.build_tensor_info(y_input) 59 | }, 60 | outputs={ 61 | 'y_predict': tf.saved_model.utils.build_tensor_info(y_predict), 62 | 'loss_func': tf.saved_model.utils.build_tensor_info(loss_func) 63 | }, 64 | method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME 65 | ) 66 | 67 | for step in range(2000): 68 | session.run(optimizer, { 69 | x_input: x_train, 70 | y_input: y_train 71 | }) 72 | if (step + 1) % 500 == 0: 73 | if os.path.exists(graph_save_dir): 74 | shutil.rmtree(graph_save_dir) 75 | builder = tf.saved_model.builder.SavedModelBuilder(graph_save_dir) 76 | builder.add_meta_graph_and_variables(session, 77 | [tf.saved_model.tag_constants.SERVING], 78 | {tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature}) 79 | # builder.add_meta_graph([tf.saved_model.tag_constants.SERVING], {'signature': signature}) 80 | builder.save() 81 | 82 | loss = session.run(loss_func, { 83 | x_input: x_train, 84 | y_input: y_train 85 | }) 86 | 87 | logging.warning('Loss: {}'.format(loss)) 88 | # builder.add_meta_graph_and_variables(session, [tf.saved_model.tag_constants.TRAINING], {'signature': signature}) 89 | # builder.add_meta_graph([tf.saved_model.tag_constants.SERVING], {'signature': signature}) 90 | # builder.save() 91 | 92 | 93 | def test(): 94 | # Session. 95 | session = tf.Session() 96 | # Load meta graph. 97 | meta_graph_def = tf.saved_model.loader.load(session, [tf.saved_model.tag_constants.SERVING], graph_save_dir) # type: tf.MetaGraphDef 98 | # Get signature. 99 | signature_def = meta_graph_def.signature_def 100 | signature = signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] 101 | # Get input tensor. 102 | x_input_tensor = signature.inputs['x_input'].name 103 | y_input_tensor = signature.inputs['y_input'].name 104 | # Get output tensor. 105 | y_predict_tensor = signature.outputs['y_predict'].name 106 | # Get loss func. 107 | loss_op = signature.outputs['loss_func'].name 108 | 109 | _, loss = session.run([y_predict_tensor, loss_op], { 110 | x_input_tensor: x_train, 111 | y_input_tensor: y_train, 112 | }) 113 | 114 | logging.warning('Loss: {}'.format(loss)) 115 | 116 | 117 | def inference_v1(): 118 | # Init channel. 119 | channel = implementations.insecure_channel('localhost', 9000) 120 | # Init stub. 121 | stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) 122 | # Init request. 123 | request = predict_pb2.PredictRequest() 124 | request.model_spec.name = 'test' 125 | request.model_spec.signature_name = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY 126 | request.inputs['x_input'].CopyFrom( 127 | tf.contrib.util.make_tensor_proto(x_train, shape=x_train.shape) 128 | ) 129 | request.inputs['y_input'].CopyFrom( 130 | tf.contrib.util.make_tensor_proto(y_train, shape=y_train.shape) 131 | ) 132 | # Predict. 133 | future = stub.Predict.future(request, 2.0) 134 | result = future.result().outputs['loss_func'].float_val 135 | logging.warning('Loss: {}'.format(result)) 136 | 137 | 138 | def inference_v2(): 139 | # Init url. 140 | url = "http://localhost:9001/v1/models/test:predict" 141 | # url = 'http://172.16.11.43:10000/tool_list/test' 142 | # Init body. 143 | import json 144 | body = { 145 | # 'signature_name': tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, 146 | 'instances': [ 147 | { 148 | 'x_input': json.dumps(x_train.tolist(), ensure_ascii=True) 149 | } 150 | ] 151 | } 152 | # Post. 153 | # response = requests.post(url, data=body) 154 | response = requests.post(url, json=json.dumps(body)) 155 | logging.warning('{}'.format(response.text)) 156 | return response 157 | 158 | 159 | train() 160 | # test() 161 | # inference_v1() 162 | # inference_v2() 163 | 164 | 165 | # plt.plot(y_train) 166 | # plt.plot(y_test) 167 | # plt.show() 168 | 169 | -------------------------------------------------------------------------------- /note/GloVe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GloVe" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 问题设定" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "GloVe是Global Vectors for Word Representation的缩写。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "对于One-hot词向量:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "$$\n", 36 | "\\begin{aligned}\n", 37 | "I &= [1, 0, 0] \\\\\n", 38 | "Like &= [0, 1, 0] \\\\\n", 39 | "Apple &= [0, 0, 1] \n", 40 | "\\end{aligned}\n", 41 | "$$" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "无法通过两向量夹角余弦值计算其相似度,word2vec是一种嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度,但是word2vec仅仅考虑了两个词在一段上下文的相关度,而GloVe考虑了两个词向量在全文中的相关度。" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 共现矩阵(Co-occurrence Probabilities Matrix)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "训练GloVe模型前,首先需要构建一个共现矩阵,设词表大小为V,共现矩阵将是一个V行V列的方阵,而第i行第j列的表示了以第i个中心词$w_i$,第j个背景词$w_j$出现的次数。" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "假设我们有上下文:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "an\\ apple\\ a\\ day\\ keeps\\ an\\ apple\\ a\\ day\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "我们设定滑窗大小m等于2,我们将会有如下中心词-背景词对:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "中心词 | 背景词 |\n", 93 | ":---: | :---: |\n", 94 | "an | apple, a |\n", 95 | "apple | an, a, day |\n", 96 | "a | an, apple, day, keeps |\n", 97 | "day | apple, a, keeps, an |\n", 98 | "keeps | a, day, an, apple |\n", 99 | "an | day, keeps, apple, a |\n", 100 | "apple | keeps, an, a, day |\n", 101 | "a | an, apple, day |\n", 102 | "day | apple, a |" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "然后遍历中心词-背景词对,更新共现矩阵,以上图为例,最后共现矩阵的结果将有如下形式:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "| | An | apple | a | day | keeps |\n", 117 | "| - | - | - | - | - | - |\n", 118 | "| An | 0 | 2 | 2 | 1 | 1 |\n", 119 | "| apple | 2 | 0 | 2 | 2 | 1 |\n", 120 | "| a | 2 | 2 | 0 | 2 | 1 |\n", 121 | "| day | 0 | 2 | 2 | 0 | 1 |\n", 122 | "| keeps | 1 | 1 | 1 | 1 | 0 |" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "共现矩阵揭示了某种规律,定义共现矩阵的第i行的和为:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "$$\n", 137 | "X_i = \\sum^{V}_{j=1}X_{i, j}\n", 138 | "$$" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "之后我们有条件概率,即第j列对应的词出现在第i行上下文中的条件概率:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\mathbb{P}_{i, j} = \\frac{X_{i, j}}{X_i}\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "而对于某个词$w_k$,他在第i行或者第j行上下文出现的条件概率的比值:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "$$\n", 169 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}}\n", 170 | "$$" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "这个值是可以直接观测并计算到的,并将会有如下规律:\n", 178 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$相关,那么这个比值将会趋近于1\n", 179 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$不相关,那么这个比值将会很小\n", 180 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$相关,那么这个比值将会很大\n", 181 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$不相关,那么这个比值将会趋近于1" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 损失函数" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "我们希望设计一个损失函数,希望对词表内每两个词对,$w_i$与$w_j$,尽可能与$w_k$在共现矩阵中对于第i, j上下文中,出现的条件概率比值相近:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "$$\n", 203 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}} = \\frac{\\exp (v^T_i v_k) }{\\exp (v^T_j v_k)}\n", 204 | "$$" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "两边取对数,对于分子分母:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "$$\n", 219 | "\\log \\frac{X_{i, k}}{X_i} = \\log X_{i, k} - \\log X_i = v^T_i v_k \\\\\n", 220 | "\\log \\frac{X_{j, k}}{X_j} = \\log X_{j, k} - \\log X_j = v^T_j v_k\n", 221 | "$$" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "可以看到问题得到了简化,我们希望左式的分子尽可能等于右式的分子,分母亦然,则问题被简化为:对于词表内任意一组词对i, j,我们希望最小化下式:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "$$\n", 236 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} \\left( v^T_i v_j - \\log X_i - \\log(X_{i, j})\\right )^2\n", 237 | "$$" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "其中偏置项$b_i, b_j$将会替换$\\log X_i$:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "但是并不是每一个词对都是平权的,需要考虑词频来设定每一个词对的权重:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "$$\n", 259 | "f(X_{i, j}) = \n", 260 | "\\begin{cases}\n", 261 | "(X_{i, j} \\ /\\ C)^{0.75}& \\text{ X > c }\\\\\n", 262 | "1& \\text{ X < 0}\n", 263 | "\\end{cases}\n", 264 | "$$" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "最后我们希望最小化:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "$$\n", 279 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} f(X_{i, j}) \\left( v^T_i v_j + b_i + b_j - \\log(X_{i, j})\\right )^2\n", 280 | "$$" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "最后使用中心词向量$v_j$与背景词向量$v_i$的和作为中心词向量的表示。" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.4" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/GloVe-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GloVe" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 问题设定" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "GloVe是Global Vectors for Word Representation的缩写。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "对于One-hot词向量:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "$$\n", 36 | "\\begin{aligned}\n", 37 | "I &= [1, 0, 0] \\\\\n", 38 | "Like &= [0, 1, 0] \\\\\n", 39 | "Apple &= [0, 0, 1] \n", 40 | "\\end{aligned}\n", 41 | "$$" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "无法通过两向量夹角余弦值计算其相似度,word2vec是一种嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度,但是word2vec仅仅考虑了两个词在一段上下文的相关度,而GloVe考虑了两个词向量在全文中的相关度。" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 共现矩阵(Co-occurrence Probabilities Matrix)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "训练GloVe模型前,首先需要构建一个共现矩阵,设词表大小为V,共现矩阵将是一个V行V列的方阵,而第i行第j列的表示了以第i个中心词$w_i$,第j个背景词$w_j$出现的次数。" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "假设我们有上下文:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "an\\ apple\\ a\\ day\\ keeps\\ an\\ apple\\ a\\ day\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "我们设定滑窗大小m等于2,我们将会有如下中心词-背景词对:" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "中心词 | 背景词 |\n", 93 | ":---: | :---: |\n", 94 | "an | apple, a |\n", 95 | "apple | an, a, day |\n", 96 | "a | an, apple, day, keeps |\n", 97 | "day | apple, a, keeps, an |\n", 98 | "keeps | a, day, an, apple |\n", 99 | "an | day, keeps, apple, a |\n", 100 | "apple | keeps, an, a, day |\n", 101 | "a | an, apple, day |\n", 102 | "day | apple, a |" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "然后遍历中心词-背景词对,更新共现矩阵,以上图为例,最后共现矩阵的结果将有如下形式:" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "| | An | apple | a | day | keeps |\n", 117 | "| - | - | - | - | - | - |\n", 118 | "| An | 0 | 2 | 2 | 1 | 1 |\n", 119 | "| apple | 2 | 0 | 2 | 2 | 1 |\n", 120 | "| a | 2 | 2 | 0 | 2 | 1 |\n", 121 | "| day | 0 | 2 | 2 | 0 | 1 |\n", 122 | "| keeps | 1 | 1 | 1 | 1 | 0 |" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "共现矩阵揭示了某种规律,定义共现矩阵的第i行的和为:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "$$\n", 137 | "X_i = \\sum^{V}_{j=1}X_{i, j}\n", 138 | "$$" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "之后我们有条件概率,即第j列对应的词出现在第i行上下文中的条件概率:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\mathbb{P}_{i, j} = \\frac{X_{i, j}}{X_i}\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "而对于某个词$w_k$,他在第i行或者第j行上下文出现的条件概率的比值:" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "$$\n", 169 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}}\n", 170 | "$$" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "这个值是可以直接观测并计算到的,并将会有如下规律:\n", 178 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$相关,那么这个比值将会趋近于1\n", 179 | "- 如果$w_j$与$w_k$相关,且$w_i$与$w_k$不相关,那么这个比值将会很小\n", 180 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$相关,那么这个比值将会很大\n", 181 | "- 如果$w_j$与$w_k$不相关,且$w_i$与$w_k$不相关,那么这个比值将会趋近于1" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 损失函数" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "我们希望设计一个损失函数,希望对词表内每两个词对,$w_i$与$w_j$,尽可能与$w_k$在共现矩阵中对于第i, j上下文中,出现的条件概率比值相近:" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "$$\n", 203 | "\\frac{\\mathbb{P}_{i, k}}{\\mathbb{P}_{j, k}} = \\frac{\\exp (v^T_i v_k) }{\\exp (v^T_j v_k)}\n", 204 | "$$" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "两边取对数,对于分子分母:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "$$\n", 219 | "\\log \\frac{X_{i, k}}{X_i} = \\log X_{i, k} - \\log X_i = v^T_i v_k \\\\\n", 220 | "\\log \\frac{X_{j, k}}{X_j} = \\log X_{j, k} - \\log X_j = v^T_j v_k\n", 221 | "$$" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "可以看到问题得到了简化,我们希望左式的分子尽可能等于右式的分子,分母亦然,则问题被简化为:对于词表内任意一组词对i, j,我们希望最小化下式:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "$$\n", 236 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} \\left( v^T_i v_j - \\log X_i - \\log(X_{i, j})\\right )^2\n", 237 | "$$" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "其中偏置项$b_i, b_j$将会替换$\\log X_i$:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "但是并不是每一个词对都是平权的,需要考虑词频来设定每一个词对的权重:" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "$$\n", 259 | "f(X_{i, j}) = \n", 260 | "\\begin{cases}\n", 261 | "(X_{i, j} \\ /\\ C)^{0.75}& \\text{ X > c }\\\\\n", 262 | "1& \\text{ X < 0}\n", 263 | "\\end{cases}\n", 264 | "$$" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "最后我们希望最小化:" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "$$\n", 279 | "\\sum^{V}_{i=1} \\sum^{V}_{j=1} f(X_{i, j}) \\left( v^T_i v_j + b_i + b_j - \\log(X_{i, j})\\right )^2\n", 280 | "$$" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "最后使用中心词向量$v_j$与背景词向量$v_i$的和作为中心词向量的表示。" 288 | ] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.4" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /note/Word2Vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Word2Vec" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 问题设定" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "对于One-hot的词向量:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "$$\n", 31 | "\\begin{aligned}\n", 32 | "I &= [1, 0, 0] \\\\\n", 33 | "Like &= [0, 1, 0] \\\\\n", 34 | "Apple &= [0, 0, 1] \n", 35 | "\\end{aligned}\n", 36 | "$$" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "无法通过两向量夹角余弦值计算其相似度,word2vec提供了Skip-Gram(跳字模型)与CBOW(连续词袋模型)两个词嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Skip-Gram" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "即跳字模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个中心词,预测滑窗内$m - 1$个背景词。即如果上下文是:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "$$\n", 65 | "I\\ eat\\ apple\\ every\\ day\n", 66 | "$$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "对每一个词进行One-hot编码:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "$$\n", 81 | "\\begin{aligned}\n", 82 | "I &= [1, 0, 0, 0, 0] \\\\ \n", 83 | "eat &= [0, 1, 0, 0, 0] \\\\\n", 84 | "apple &= [0, 0, 1, 0, 0] \\\\\n", 85 | "every &= [0, 0, 0, 1, 0] \\\\\n", 86 | "day &= [0, 0, 0, 0, 1]\n", 87 | "\\end{aligned}\n", 88 | "$$" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "设定滑窗大小为$2$,如果选择中心词$apple$,那么将会有以下训练数据:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "$$\n", 103 | "\\begin{aligned}\n", 104 | "x &= [0, 0, 1, 0, 0] \\\\ \n", 105 | "y &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]\n", 106 | "\\end{aligned}\n", 107 | "$$" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "设计一个只有1个输入层、1个隐藏层、1个输出层的神经网络,其中输出层的神经元个数等于输入层即等于One-hot编码的维度,而隐含层的神经元个数通常远小于输出层,比如One-hot维度如果是10000,隐含层可以只有300个神经元:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "我们通过最大化似然函数:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "$$\n", 129 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right)\n", 130 | "$$" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "即对于上下文内所有的词,给定中心词$w^i$,预测滑窗内其他词,越准确越好。对上式取对数并展开:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{aligned}\n", 146 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) &= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) \\\\\n", 147 | "&= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\left( \\frac{\\exp(\\mathrm{u^T_{i+j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{N}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}} \\right) \\\\\n", 148 | "\\end{aligned}\n", 149 | "$$" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "其中,$\\mathrm{v_i}$即是隐藏层的权重,也是隐藏层的输入$z_i$,也是第i个词的词向量,$\\mathrm{u_{i+j}}$是输出层的权重,也是第i+j个词的词向量的另一个表达。最大化上式的最大似然函数,即最小化下式交叉熵:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "$$\n", 164 | "- \\sum^{N}_{i=1} \\mathrm{y_i} \\cdot \\log \\mathrm{p_i}\n", 165 | "$$" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "其中$\\mathrm{y_i}$与$\\mathrm{p_i}$是维度为词表长度的向量,分别代表观测值与计算值,对$\\mathrm{v_i}$求梯度有:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "$$\n", 180 | "\\begin{aligned}\n", 181 | "\\frac {\\partial \\log \\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right)} {\\mathrm{v_i}} &= \\frac {\\partial \\log \\left( \\exp(\\mathrm{u^T_{j} \\cdot v_{i}} ) \\right) - \\log \\left ( \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})} \\right)}{\\partial \\mathrm{v_{i}}} \\\\\n", 182 | "&= \\mathrm{u_{j}} - \\frac{1}{\\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\left[ \\sum^{V}_{k=1} \\exp(\\mathrm{u^T_k v_i) \\cdot \\mathrm{u_k}} \\right] \\\\\n", 183 | "&= \\mathrm{u_{j}} - \\sum^{V}_{k=1} \\frac{ \\exp(\\mathrm{u^T_k v_i}) }{ \\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\cdot \\mathrm{u_k}\n", 184 | "\\end{aligned}\n", 185 | "$$" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "然后使用梯度下降更新$\\mathrm{v_i}$,此处的$\\mathrm{v_i}$是向量,在网络中,即是输入层的第i个神经元到隐含层的权重。" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## CBOW" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "即Continuous Bag of Words,连续词袋模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个背景词,$m - 1$个中心词,与Skip-Gram相反,设定滑窗大小为$2$,如果选择中心词$\\ I,\\ eat,\\ every,\\ day$,那么将会有以下训练数据:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "$$\n", 214 | "\\begin{aligned}\n", 215 | "x &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1] \\\\ \n", 216 | "y &= [0, 0, 1, 0, 0] \\\\ \n", 217 | "\\end{aligned}\n", 218 | "$$" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "而对于概率:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "$$\n", 233 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^{i-m}, \\cdots, w^i, \\cdots, w^{i+m} \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m)}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m})}\n", 234 | "$$" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "与Skip-Gram的不同之处在于将中心词求和后平均,之后的梯度计算与更新和Skip-Gram相同,这里就不展开了。" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## 负采样" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "可以直观地从上面的梯度更新公式中看到,每一次更新都伴随着巨量的计算开销,这个计算开销主要是因为Softmax函数的分母。可以使用负采样替换Softmax,减少计算开销。" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "相对于原条件概率:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "$$\n", 270 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}}\n", 271 | "$$" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "将被改写为:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "$$\n", 286 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\log \\frac{1}{1 + \\exp(- \\mathrm{u^T_j v_i})} + \\sum^{K}_{k=1} \\log \\left( 1 - \\frac{1}{1 + \\exp(- \\mathrm{u^T_k v_i})} \\right)\n", 287 | "$$" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "即筛选出K个不在滑窗内的词向量,直观地理解是希望中心词尽可能地不预测出这些采样出的词,筛选出某个词的概率由这个公式决定:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "$$\n", 302 | "\\mathrm{P(w_i)} = \\frac{f(w_i)^{\\frac{3}{4}}}{\\sum^{V}_{k=1}f(w_k)^{\\frac{3}{4}}}\n", 303 | "$$" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "其中,$f(w_i)$是这个单词在上下文中出现的频率。" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## 结果" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "通过这种词嵌入模型训练出的词向量能较好的表示两个相近意思的词的近似程度。" 325 | ] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Python 3", 331 | "language": "python", 332 | "name": "python3" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.5.4" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 1 349 | } 350 | -------------------------------------------------------------------------------- /note/A3C.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A3C" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A3C是Asynchronous Advantage Actor-Critic Model的简称,即异步优势演员-评论家模型,A3C并不是一种像Policy Gradient或DQN这样具体的算法,而是一种解决问题的思想,它的核心精神是,在强化学习的训练过程中,我们可以并行地训练多个Agent,在训练的过程中,各个Agent是参数共享的。更具体一些,我们可能有N个Agent并行地在环境采样1回合后计算1次梯度,我们还有1个或多个Agent在这个过程中什么都不做,当N个Agent中的1个或者M个或者N个采样完成,并反向传播或者基于时间的反向传播计算完1回合的梯度后,这些Agent会将梯度异步地分发给那1个或者多个什么都不做的Agent,然后这些什么都不做的Agent执行一次参数更新,再将更新后的参数分发给这个分发梯度的Agent,然后一直重复这个过程。当然分发梯度和分发参数这个过程是否是异步或者同步,也是可以大做文章的。" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# A3C with TensorFlow" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "那么具体要怎么操作呢?这里以TensorFlow为框架实现了一个DPPO,即Distributed Proximal Policy Optimization,分布式近端策略优化模型。这个和A3C有什么关系呢?在上文提到,A3C并不是一个具体的算法,它的核心精神是一套异步训练模型、同步或者异步更新参数的思想。或者换句话说,不管是DQN、Policy Gradient、PPO、ACER,还是基于它们的一系列改进,我们都可以用A3C的思想去改进它们。好在TensorFlow已经为我们做了大部分的底层工作,我们只需要几十行代码,就可以把一个单进程的训练过程改进为分布式训练过程。" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Distributed TensorFlow" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "为了避免让文章沦为文档翻译,所以这里仅仅对分布式TensorFlow做非常简短的说明,详细的文档可以参考:\n", 43 | "> [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "首先,需要构造集群描述对象让集群待命,可以通过如下方法构造集群描述对象:\n", 51 | "```\n", 52 | "cluster = tf.train.ClusterSpec({\n", 53 | " 'worker': [\n", 54 | " 'localhost:8001',\n", 55 | " 'localhost:8002',\n", 56 | " 'localhost:8003',\n", 57 | " ],\n", 58 | " 'ps': [\n", 59 | " 'localhost:8000'\n", 60 | " ]\n", 61 | "})\n", 62 | "```\n", 63 | "可以看出,集群描述对象是一个键为job_name(任务名),值为ip:port的字典,至于 job_name 的定义稍后会做解释。然后通过如下语句启动集群中的一个节点,并让节点待命:\n", 64 | "```\n", 65 | "server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 66 | "if role == 'ps':\n", 67 | " logging.warning('Parameter server started.')\n", 68 | " server.join()\n", 69 | "else:\n", 70 | " pass\n", 71 | " # do some sth later.\n", 72 | "```\n", 73 | "至此,一个节点就被启动并待命了,可以看到一个节点会被抽象为一个server对象,其中job_name对应了节点的任务名,ps是Parameter Server,即参数服务器,worker即计算服务器,它们的用途会在下文提到。根据集群中的每个节点是否会完整地构建自己的计算图,TensorFlow提供了两种方案,分别是 In-graph replication 和 Between-graph replication,每个节点是否会构建自己的计算图,也决定了每个节点的工作方式。" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### In-graph replication" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "在这种方案中,集群中的每一个节点不会完整地构建自己的计算图,每一个节点仅仅是单纯地利用自己的算力通过以下的语句执行任务:\n", 88 | "```\n", 89 | "with tf.device(\"/job:ps/task:0\"):\n", 90 | " # Define vars.\n", 91 | " \n", 92 | "with tf.device(\"/job:worker/task:0\"):\n", 93 | " # Do computations.\n", 94 | "```\n", 95 | "通常,只需要提前启动集群,然后构造一个Session,然后根据节点分配计算图中的各个结点,然后进行训练就可以了,非常地直觉。这样做有一个缺点是数据会在各个结点分发,如果数据非常大,这样是得不偿失的。在下文实现的DPPO中,我们将不会采用这套方案。" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Between-graph replication" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "与In-graph replication不同的是,集群中的每一个节点会完整地构建自己的计算图,可以说这种方案就是为了A3C而设计的,在这种方案中,我们会有一个或者多个参数节点(Parameters Server),多个计算节点(Worker Server),每个计算节点完成梯度计算后,会异步地将梯度分发到参数节点,然后参数节点会同步或者异步地用梯度更新参数,然后分发最新的参数到一个或者多个计算节点。" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Parameters Server & Worker Server" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "ps,即参数节点,在Between-graph replication的方案中,它通常什么都不做,节点启动后即调用`join()`待命,worker,即计算节点,在Between-graph replication方案中,这些节点定义了完整的计算图并执行这些计算,在计算节点完成一次梯度计算后,梯度会被异步分发给参数节点,参数节点更新参数后,分发参数给计算节点。这个过程可以既可以是异步的也可以是同步的,在Between-graph replication方案中,默认是异步的。" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# PPO" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "在前一篇文章中已经实现了一个PPO,学习笔记:\n", 138 | "> [PPO Note](https://github.com/Ceruleanacg/Learning-Notes/blob/master/note/PPO.ipynb) " 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "源码:\n", 146 | "> [PPO Code](https://github.com/Ceruleanacg/Learning-Notes/blob/master/playground/PPO.py)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# DPPO in Action" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "首先实现一个方法,它用来启动集群的各个节点,并根据节点类型待命或者定义并执行计算图:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 1, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 173 | " return f(*args, **kwds)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# coding=utf-8\n", 179 | "\n", 180 | "import sys\n", 181 | "sys.path.append('..')\n", 182 | "\n", 183 | "import multiprocessing as mp\n", 184 | "import tensorflow as tf\n", 185 | "import logging\n", 186 | "import gym\n", 187 | "\n", 188 | "from base.model import *\n", 189 | "from playground import PPO\n", 190 | "from utility.launcher import start_game\n", 191 | "\n", 192 | "\n", 193 | "def start_a3c(cluster, role, task_index):\n", 194 | " # 根据集群描述对象启动节点\n", 195 | " server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 196 | " if role == 'ps':\n", 197 | " # 如果是参数节点,则join待命\n", 198 | " logging.warning('Parameter server started.')\n", 199 | " server.join()\n", 200 | " else:\n", 201 | " # 如果是计算节点,定义计算图,计算梯度\n", 202 | " worker_device = \"/job:worker/task:{}\".format(task_index)\n", 203 | " logging.warning('Worker: {}, server stated.'.format(worker_device))\n", 204 | " # 根据集群描述对象分配节点\n", 205 | " with tf.device(tf.train.replica_device_setter(cluster=cluster)):\n", 206 | " # Make env.\n", 207 | " env = gym.make('CartPole-v0')\n", 208 | " env.seed(1)\n", 209 | " env = env.unwrapped\n", 210 | " # Init session.\n", 211 | " session = tf.Session(server.target)\n", 212 | " # session = tf.Session()\n", 213 | " # Init agent.\n", 214 | " agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 215 | " KEY_SESSION: session,\n", 216 | " KEY_MODEL_NAME: 'PPO',\n", 217 | " KEY_TRAIN_EPISODE: 1000\n", 218 | " })\n", 219 | " start_game(env, agent, task_index)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "然后定义集群描述对象:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 2, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "cluster = tf.train.ClusterSpec({\n", 236 | " 'worker': [\n", 237 | " 'localhost:8001',\n", 238 | " 'localhost:8002',\n", 239 | " 'localhost:8003',\n", 240 | " ],\n", 241 | " 'ps': [\n", 242 | " 'localhost:8000'\n", 243 | " ]\n", 244 | " })\n", 245 | "\n", 246 | "role_task_index_map = [\n", 247 | " ('ps', 0),\n", 248 | " ('worker', 0),\n", 249 | " ('worker', 1),\n", 250 | " ('worker', 2),\n", 251 | "]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "启动A3C并训练:" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "pool = mp.Pool(processes=4)\n", 268 | "\n", 269 | "for role, task_index in role_task_index_map:\n", 270 | " pool.apply_async(start_a3c, args=(cluster, role, task_index, ))\n", 271 | "pool.close()\n", 272 | "pool.join()" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.5.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/A3C-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# A3C" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "A3C是Asynchronous Advantage Actor-Critic Model的简称,即异步优势演员-评论家模型,A3C并不是一种像Policy Gradient或DQN这样具体的算法,而是一种解决问题的思想,它的核心精神是,在强化学习的训练过程中,我们可以并行地训练多个Agent,在训练的过程中,各个Agent是参数共享的。更具体一些,我们可能有N个Agent并行地在环境采样1回合后计算1次梯度,我们还有1个或多个Agent在这个过程中什么都不做,当N个Agent中的1个或者M个或者N个采样完成,并反向传播或者基于时间的反向传播计算完1回合的梯度后,这些Agent会将梯度异步地分发给那1个或者多个什么都不做的Agent,然后这些什么都不做的Agent执行一次参数更新,再将更新后的参数分发给这个分发梯度的Agent,然后一直重复这个过程。当然分发梯度和分发参数这个过程是否是异步或者同步,也是可以大做文章的。" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# A3C with TensorFlow" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "那么具体要怎么操作呢?这里以TensorFlow为框架实现了一个DPPO,即Distributed Proximal Policy Optimization,分布式近端策略优化模型。这个和A3C有什么关系呢?在上文提到,A3C并不是一个具体的算法,它的核心精神是一套异步训练模型、同步或者异步更新参数的思想。或者换句话说,不管是DQN、Policy Gradient、PPO、ACER,还是基于它们的一系列改进,我们都可以用A3C的思想去改进它们。好在TensorFlow已经为我们做了大部分的底层工作,我们只需要几十行代码,就可以把一个单进程的训练过程改进为分布式训练过程。" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Distributed TensorFlow" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "为了避免让文章沦为文档翻译,所以这里仅仅对分布式TensorFlow做非常简短的说明,详细的文档可以参考:\n", 43 | "> [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed) " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "首先,需要构造集群描述对象让集群待命,可以通过如下方法构造集群描述对象:\n", 51 | "```\n", 52 | "cluster = tf.train.ClusterSpec({\n", 53 | " 'worker': [\n", 54 | " 'localhost:8001',\n", 55 | " 'localhost:8002',\n", 56 | " 'localhost:8003',\n", 57 | " ],\n", 58 | " 'ps': [\n", 59 | " 'localhost:8000'\n", 60 | " ]\n", 61 | "})\n", 62 | "```\n", 63 | "可以看出,集群描述对象是一个键为job_name(任务名),值为ip:port的字典,至于 job_name 的定义稍后会做解释。然后通过如下语句启动集群中的一个节点,并让节点待命:\n", 64 | "```\n", 65 | "server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 66 | "if role == 'ps':\n", 67 | " logging.warning('Parameter server started.')\n", 68 | " server.join()\n", 69 | "else:\n", 70 | " pass\n", 71 | " # do some sth later.\n", 72 | "```\n", 73 | "至此,一个节点就被启动并待命了,可以看到一个节点会被抽象为一个server对象,其中job_name对应了节点的任务名,ps是Parameter Server,即参数服务器,worker即计算服务器,它们的用途会在下文提到。根据集群中的每个节点是否会完整地构建自己的计算图,TensorFlow提供了两种方案,分别是 In-graph replication 和 Between-graph replication,每个节点是否会构建自己的计算图,也决定了每个节点的工作方式。" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### In-graph replication" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "在这种方案中,集群中的每一个节点不会完整地构建自己的计算图,每一个节点仅仅是单纯地利用自己的算力通过以下的语句执行任务:\n", 88 | "```\n", 89 | "with tf.device(\"/job:ps/task:0\"):\n", 90 | " # Define vars.\n", 91 | " \n", 92 | "with tf.device(\"/job:worker/task:0\"):\n", 93 | " # Do computations.\n", 94 | "```\n", 95 | "通常,只需要提前启动集群,然后构造一个Session,然后根据节点分配计算图中的各个结点,然后进行训练就可以了,非常地直觉。这样做有一个缺点是数据会在各个结点分发,如果数据非常大,这样是得不偿失的。在下文实现的DPPO中,我们将不会采用这套方案。" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Between-graph replication" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "与In-graph replication不同的是,集群中的每一个节点会完整地构建自己的计算图,可以说这种方案就是为了A3C而设计的,在这种方案中,我们会有一个或者多个参数节点(Parameters Server),多个计算节点(Worker Server),每个计算节点完成梯度计算后,会异步地将梯度分发到参数节点,然后参数节点会同步或者异步地用梯度更新参数,然后分发最新的参数到一个或者多个计算节点。" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### Parameters Server & Worker Server" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "ps,即参数节点,在Between-graph replication的方案中,它通常什么都不做,节点启动后即调用`join()`待命,worker,即计算节点,在Between-graph replication方案中,这些节点定义了完整的计算图并执行这些计算,在计算节点完成一次梯度计算后,梯度会被异步分发给参数节点,参数节点更新参数后,分发参数给计算节点。这个过程可以既可以是异步的也可以是同步的,在Between-graph replication方案中,默认是异步的。" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# PPO" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "在前一篇文章中已经实现了一个PPO,学习笔记:\n", 138 | "> [PPO Note](https://github.com/Ceruleanacg/Learning-Notes/blob/master/note/PPO.ipynb) " 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "源码:\n", 146 | "> [PPO Code](https://github.com/Ceruleanacg/Learning-Notes/blob/master/playground/PPO.py)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# DPPO in Action" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "首先实现一个方法,它用来启动集群的各个节点,并根据节点类型待命或者定义并执行计算图:" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 1, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 173 | " return f(*args, **kwds)\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# coding=utf-8\n", 179 | "\n", 180 | "import sys\n", 181 | "sys.path.append('..')\n", 182 | "\n", 183 | "import multiprocessing as mp\n", 184 | "import tensorflow as tf\n", 185 | "import logging\n", 186 | "import gym\n", 187 | "\n", 188 | "from base.model import *\n", 189 | "from playground import PPO\n", 190 | "from utility.launcher import start_game\n", 191 | "\n", 192 | "\n", 193 | "def start_a3c(cluster, role, task_index):\n", 194 | " # 根据集群描述对象启动节点\n", 195 | " server = tf.train.Server(cluster, job_name=role, task_index=task_index)\n", 196 | " if role == 'ps':\n", 197 | " # 如果是参数节点,则join待命\n", 198 | " logging.warning('Parameter server started.')\n", 199 | " server.join()\n", 200 | " else:\n", 201 | " # 如果是计算节点,定义计算图,计算梯度\n", 202 | " worker_device = \"/job:worker/task:{}\".format(task_index)\n", 203 | " logging.warning('Worker: {}, server stated.'.format(worker_device))\n", 204 | " # 根据集群描述对象分配节点\n", 205 | " with tf.device(tf.train.replica_device_setter(cluster=cluster)):\n", 206 | " # Make env.\n", 207 | " env = gym.make('CartPole-v0')\n", 208 | " env.seed(1)\n", 209 | " env = env.unwrapped\n", 210 | " # Init session.\n", 211 | " session = tf.Session(server.target)\n", 212 | " # session = tf.Session()\n", 213 | " # Init agent.\n", 214 | " agent = PPO.Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 215 | " KEY_SESSION: session,\n", 216 | " KEY_MODEL_NAME: 'PPO',\n", 217 | " KEY_TRAIN_EPISODE: 1000\n", 218 | " })\n", 219 | " start_game(env, agent, task_index)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "然后定义集群描述对象:" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 2, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "cluster = tf.train.ClusterSpec({\n", 236 | " 'worker': [\n", 237 | " 'localhost:8001',\n", 238 | " 'localhost:8002',\n", 239 | " 'localhost:8003',\n", 240 | " ],\n", 241 | " 'ps': [\n", 242 | " 'localhost:8000'\n", 243 | " ]\n", 244 | " })\n", 245 | "\n", 246 | "role_task_index_map = [\n", 247 | " ('ps', 0),\n", 248 | " ('worker', 0),\n", 249 | " ('worker', 1),\n", 250 | " ('worker', 2),\n", 251 | "]" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "启动A3C并训练:" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "pool = mp.Pool(processes=4)\n", 268 | "\n", 269 | "for role, task_index in role_task_index_map:\n", 270 | " pool.apply_async(start_a3c, args=(cluster, role, task_index, ))\n", 271 | "pool.close()\n", 272 | "pool.join()" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.5.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/Word2Vec-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Word2Vec" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 问题设定" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "对于One-hot的词向量:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "$$\n", 31 | "\\begin{aligned}\n", 32 | "I &= [1, 0, 0] \\\\\n", 33 | "Like &= [0, 1, 0] \\\\\n", 34 | "Apple &= [0, 0, 1] \n", 35 | "\\end{aligned}\n", 36 | "$$" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "无法通过两向量夹角余弦值计算其相似度,word2vec提供了Skip-Gram(跳字模型)与CBOW(连续词袋模型)两个词嵌入模型,通过这种模型训练出的词向量可以较好的表示出词之间的相似度。" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Skip-Gram" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "即跳字模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个中心词,预测滑窗内$m - 1$个背景词。即如果上下文是:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "$$\n", 65 | "I\\ eat\\ apple\\ every\\ day\n", 66 | "$$" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "对每一个词进行One-hot编码:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "$$\n", 81 | "\\begin{aligned}\n", 82 | "I &= [1, 0, 0, 0, 0] \\\\ \n", 83 | "eat &= [0, 1, 0, 0, 0] \\\\\n", 84 | "apple &= [0, 0, 1, 0, 0] \\\\\n", 85 | "every &= [0, 0, 0, 1, 0] \\\\\n", 86 | "day &= [0, 0, 0, 0, 1]\n", 87 | "\\end{aligned}\n", 88 | "$$" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "设定滑窗大小为$2$,如果选择中心词$apple$,那么将会有以下训练数据:" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "$$\n", 103 | "\\begin{aligned}\n", 104 | "x &= [0, 0, 1, 0, 0] \\\\ \n", 105 | "y &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]\n", 106 | "\\end{aligned}\n", 107 | "$$" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "设计一个只有1个输入层、1个隐藏层、1个输出层的神经网络,其中输出层的神经元个数等于输入层即等于One-hot编码的维度,而隐含层的神经元个数通常远小于输出层,比如One-hot维度如果是10000,隐含层可以只有300个神经元:" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "我们通过最大化似然函数:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "$$\n", 129 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right)\n", 130 | "$$" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "即对于上下文内所有的词,给定中心词$w^i$,预测滑窗内其他词,越准确越好。对上式取对数并展开:" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "$$\n", 145 | "\\begin{aligned}\n", 146 | "\\prod^{N}_{i=1} \\prod_{-m <= j <= m} \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) &= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\mathbb{P} \\left( w^{i+j} \\ \\lvert \\ w^i \\right) \\\\\n", 147 | "&= \\sum^{N}_{i=1} \\sum_{-m <= j <= m} \\log \\left( \\frac{\\exp(\\mathrm{u^T_{i+j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{N}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}} \\right) \\\\\n", 148 | "\\end{aligned}\n", 149 | "$$" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "其中,$\\mathrm{v_i}$即是隐藏层的权重,也是隐藏层的输入$z_i$,也是第i个词的词向量,$\\mathrm{u_{i+j}}$是输出层的权重,也是第i+j个词的词向量的另一个表达。最大化上式的最大似然函数,即最小化下式交叉熵:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "$$\n", 164 | "- \\sum^{N}_{i=1} \\mathrm{y_i} \\cdot \\log \\mathrm{p_i}\n", 165 | "$$" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "其中$\\mathrm{y_i}$与$\\mathrm{p_i}$是维度为词表长度的向量,分别代表观测值与计算值,对$\\mathrm{v_i}$求梯度有:" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "$$\n", 180 | "\\begin{aligned}\n", 181 | "\\frac {\\partial \\log \\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right)} {\\mathrm{v_i}} &= \\frac {\\partial \\log \\left( \\exp(\\mathrm{u^T_{j} \\cdot v_{i}} ) \\right) - \\log \\left ( \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})} \\right)}{\\partial \\mathrm{v_{i}}} \\\\\n", 182 | "&= \\mathrm{u_{j}} - \\frac{1}{\\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\left[ \\sum^{V}_{k=1} \\exp(\\mathrm{u^T_k v_i) \\cdot \\mathrm{u_k}} \\right] \\\\\n", 183 | "&= \\mathrm{u_{j}} - \\sum^{V}_{k=1} \\frac{ \\exp(\\mathrm{u^T_k v_i}) }{ \\sum^{V}_{w=1} \\exp(\\mathrm{u^T_w v_i}) } \\cdot \\mathrm{u_k}\n", 184 | "\\end{aligned}\n", 185 | "$$" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "然后使用梯度下降更新$\\mathrm{v_i}$,此处的$\\mathrm{v_i}$是向量,在网络中,即是输入层的第i个神经元到隐含层的权重。" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## CBOW" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "即Continuous Bag of Words,连续词袋模型,其核心思想是对于一个上下文,设定一个大小为$m$的滑窗,在滑窗内选择$1$个背景词,$m - 1$个中心词,与Skip-Gram相反,设定滑窗大小为$2$,如果选择中心词$\\ I,\\ eat,\\ every,\\ day$,那么将会有以下训练数据:" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "$$\n", 214 | "\\begin{aligned}\n", 215 | "x &= [1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1] \\\\ \n", 216 | "y &= [0, 0, 1, 0, 0] \\\\ \n", 217 | "\\end{aligned}\n", 218 | "$$" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "而对于概率:" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "$$\n", 233 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^{i-m}, \\cdots, w^i, \\cdots, w^{i+m} \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m)}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot (v_{i} + \\cdots + v_{i+2m}} ) / 2m})}\n", 234 | "$$" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "与Skip-Gram的不同之处在于将中心词求和后平均,之后的梯度计算与更新和Skip-Gram相同,这里就不展开了。" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## 负采样" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "可以直观地从上面的梯度更新公式中看到,每一次更新都伴随着巨量的计算开销,这个计算开销主要是因为Softmax函数的分母。可以使用负采样替换Softmax,减少计算开销。" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "相对于原条件概率:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "$$\n", 270 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\frac{\\exp(\\mathrm{u^T_{j} \\cdot v_{i}} )}{ \\mathrm{\\sum^{V}_{k=1} \\exp(\\mathrm{u^T_{k} \\cdot v_{i}})}}\n", 271 | "$$" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "将被改写为:" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "$$\n", 286 | "\\mathbb{P} \\left( w^{j} \\ \\lvert \\ w^i \\right) = \\log \\frac{1}{1 + \\exp(- \\mathrm{u^T_j v_i})} + \\sum^{K}_{k=1} \\log \\left( 1 - \\frac{1}{1 + \\exp(- \\mathrm{u^T_k v_i})} \\right)\n", 287 | "$$" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "即筛选出K个不在滑窗内的词向量,直观地理解是希望中心词尽可能地不预测出这些采样出的词,筛选出某个词的概率由这个公式决定:" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "$$\n", 302 | "\\mathrm{P(w_i)} = \\frac{f(w_i)^{\\frac{3}{4}}}{\\sum^{V}_{k=1}f(w_k)^{\\frac{3}{4}}}\n", 303 | "$$" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "其中,$f(w_i)$是这个单词在上下文中出现的频率。" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## 结果" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "通过这种词嵌入模型训练出的词向量能较好的表示两个相近意思的词的近似程度。" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [] 331 | } 332 | ], 333 | "metadata": { 334 | "kernelspec": { 335 | "display_name": "Python 3", 336 | "language": "python", 337 | "name": "python3" 338 | }, 339 | "language_info": { 340 | "codemirror_mode": { 341 | "name": "ipython", 342 | "version": 3 343 | }, 344 | "file_extension": ".py", 345 | "mimetype": "text/x-python", 346 | "name": "python", 347 | "nbconvert_exporter": "python", 348 | "pygments_lexer": "ipython3", 349 | "version": "3.6.4" 350 | } 351 | }, 352 | "nbformat": 4, 353 | "nbformat_minor": 1 354 | } 355 | -------------------------------------------------------------------------------- /deprecated/main.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import tensorflow as tf 3 | import numpy as np 4 | import gym, time 5 | import matplotlib.pyplot as plt 6 | 7 | 8 | UPDATE_GLOBAL_ITER = 10 9 | GAMMA = 0.9 10 | ENTROPY_BETA = 0.001 11 | LR_A = 0.001 # learning rate for actor 12 | LR_C = 0.001 # learning rate for critic 13 | 14 | env = gym.make('CartPole-v0') 15 | N_S = env.observation_space.shape[0] 16 | N_A = env.action_space.n 17 | 18 | 19 | class ACNet(object): 20 | sess = None 21 | 22 | def __init__(self, scope, opt_a=None, opt_c=None, global_net=None): 23 | if scope == 'global_net': # get global network 24 | with tf.variable_scope(scope): 25 | self.s = tf.placeholder(tf.float32, [None, N_S], 'S') 26 | self.a_params, self.c_params = self._build_net(scope)[-2:] 27 | else: 28 | with tf.variable_scope(scope): 29 | self.s = tf.placeholder(tf.float32, [None, N_S], 'S') 30 | self.a_his = tf.placeholder(tf.int32, [None, ], 'A') 31 | self.v_target = tf.placeholder(tf.float32, [None, 1], 'Vtarget') 32 | 33 | self.a_prob, self.v, self.a_params, self.c_params = self._build_net(scope) 34 | 35 | td = tf.subtract(self.v_target, self.v, name='TD_error') 36 | with tf.name_scope('c_loss'): 37 | self.c_loss = tf.reduce_mean(tf.square(td)) 38 | 39 | with tf.name_scope('a_loss'): 40 | log_prob = tf.reduce_sum( 41 | tf.log(self.a_prob) * tf.one_hot(self.a_his, N_A, dtype=tf.float32), 42 | axis=1, keep_dims=True) 43 | exp_v = log_prob * tf.stop_gradient(td) 44 | entropy = -tf.reduce_sum(self.a_prob * tf.log(self.a_prob + 1e-5), 45 | axis=1, keep_dims=True) # encourage exploration 46 | self.exp_v = ENTROPY_BETA * entropy + exp_v 47 | self.a_loss = tf.reduce_mean(-self.exp_v) 48 | 49 | with tf.name_scope('local_grad'): 50 | self.a_grads = tf.gradients(self.a_loss, self.a_params) 51 | self.c_grads = tf.gradients(self.c_loss, self.c_params) 52 | 53 | self.global_step = tf.train.get_or_create_global_step() 54 | with tf.name_scope('sync'): 55 | with tf.name_scope('pull'): 56 | self.pull_a_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.a_params, global_net.a_params)] 57 | self.pull_c_params_op = [l_p.assign(g_p) for l_p, g_p in zip(self.c_params, global_net.c_params)] 58 | with tf.name_scope('push'): 59 | self.update_a_op = opt_a.apply_gradients(zip(self.a_grads, global_net.a_params), global_step=self.global_step) 60 | self.update_c_op = opt_c.apply_gradients(zip(self.c_grads, global_net.c_params)) 61 | 62 | def _build_net(self, scope): 63 | w_init = tf.random_normal_initializer(0., .1) 64 | with tf.variable_scope('actor'): 65 | l_a = tf.layers.dense(self.s, 200, tf.nn.relu6, kernel_initializer=w_init, name='la') 66 | a_prob = tf.layers.dense(l_a, N_A, tf.nn.softmax, kernel_initializer=w_init, name='ap') 67 | with tf.variable_scope('critic'): 68 | l_c = tf.layers.dense(self.s, 100, tf.nn.relu6, kernel_initializer=w_init, name='lc') 69 | v = tf.layers.dense(l_c, 1, kernel_initializer=w_init, name='v') # state value 70 | a_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/actor') 71 | c_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope + '/critic') 72 | return a_prob, v, a_params, c_params 73 | 74 | def choose_action(self, s): # run by a local 75 | prob_weights = self.sess.run(self.a_prob, feed_dict={self.s: s[np.newaxis, :]}) 76 | action = np.random.choice(range(prob_weights.shape[1]), 77 | p=prob_weights.ravel()) # select action w.r.t the actions prob 78 | return action 79 | 80 | def update_global(self, feed_dict): # run by a local 81 | self.sess.run([self.update_a_op, self.update_c_op], feed_dict) # local grads applies to global net 82 | 83 | def pull_global(self): # run by a local 84 | self.sess.run([self.pull_a_params_op, self.pull_c_params_op]) 85 | 86 | 87 | def work(job_name, task_index, global_ep, lock, r_queue, global_running_r): 88 | # set work's ip:port 89 | cluster = tf.train.ClusterSpec({ 90 | "ps": ['localhost:2220', 'localhost:2221',], 91 | "worker": ['localhost:2222', 'localhost:2223', 'localhost:2224', 'localhost:2225',] 92 | }) 93 | server = tf.train.Server(cluster, job_name=job_name, task_index=task_index) 94 | if job_name == 'ps': 95 | print('Start Parameter Sever: ', task_index) 96 | server.join() 97 | else: 98 | t1 = time.time() 99 | env = gym.make('CartPole-v0').unwrapped 100 | print('Start Worker: ', task_index) 101 | with tf.device(tf.train.replica_device_setter( 102 | worker_device="/job:worker/task:%d" % task_index, 103 | cluster=cluster)): 104 | opt_a = tf.train.RMSPropOptimizer(LR_A, name='opt_a') 105 | opt_c = tf.train.RMSPropOptimizer(LR_C, name='opt_c') 106 | global_net = ACNet('global_net') 107 | 108 | local_net = ACNet('local_ac%d' % task_index, opt_a, opt_c, global_net) 109 | # set training steps 110 | hooks = [tf.train.StopAtStepHook(last_step=100000)] 111 | with tf.train.MonitoredTrainingSession(master=server.target, 112 | is_chief=True, 113 | hooks=hooks,) as sess: 114 | print('Start Worker Session: ', task_index) 115 | local_net.sess = sess 116 | total_step = 1 117 | buffer_s, buffer_a, buffer_r = [], [], [] 118 | while (not sess.should_stop()) and (global_ep.value < 1000): 119 | s = env.reset() 120 | ep_r = 0 121 | while True: 122 | # if task_index: 123 | # env.render() 124 | a = local_net.choose_action(s) 125 | s_, r, done, info = env.step(a) 126 | if done: r = -5. 127 | ep_r += r 128 | buffer_s.append(s) 129 | buffer_a.append(a) 130 | buffer_r.append(r) 131 | 132 | if total_step % UPDATE_GLOBAL_ITER == 0 or done: # update global and assign to local net 133 | if done: 134 | v_s_ = 0 # terminal 135 | else: 136 | v_s_ = sess.run(local_net.v, {local_net.s: s_[np.newaxis, :]})[0, 0] 137 | buffer_v_target = [] 138 | for r in buffer_r[::-1]: # reverse buffer r 139 | v_s_ = r + GAMMA * v_s_ 140 | buffer_v_target.append(v_s_) 141 | buffer_v_target.reverse() 142 | 143 | buffer_s, buffer_a, buffer_v_target = np.vstack(buffer_s), np.array(buffer_a), np.vstack( 144 | buffer_v_target) 145 | feed_dict = { 146 | local_net.s: buffer_s, 147 | local_net.a_his: buffer_a, 148 | local_net.v_target: buffer_v_target, 149 | } 150 | local_net.update_global(feed_dict) 151 | buffer_s, buffer_a, buffer_r = [], [], [] 152 | local_net.pull_global() 153 | s = s_ 154 | total_step += 1 155 | if done: 156 | if r_queue.empty(): # record running episode reward 157 | global_running_r.value = ep_r 158 | else: 159 | global_running_r.value = .99 * global_running_r.value + 0.01 * ep_r 160 | r_queue.put(global_running_r.value) 161 | 162 | print( 163 | "Task: %i" % task_index, 164 | "| Ep: %i" % global_ep.value, 165 | "| Ep_r: %i" % global_running_r.value, 166 | "| Global_step: %i" % sess.run(local_net.global_step), 167 | ) 168 | with lock: 169 | global_ep.value += 1 170 | break 171 | 172 | print('Worker Done: ', task_index, time.time()-t1) 173 | 174 | 175 | if __name__ == "__main__": 176 | # use multiprocessing to create a local cluster with 2 parameter servers and 2 workers 177 | global_ep = mp.Value('i', 0) 178 | lock = mp.Lock() 179 | r_queue = mp.Queue() 180 | global_running_r = mp.Value('d', 0) 181 | 182 | jobs = [ 183 | ('ps', 0), ('ps', 1), 184 | ('worker', 0), ('worker', 1), ('worker', 2), ('worker', 3) 185 | ] 186 | ps = [mp.Process(target=work, args=(j, i, global_ep, lock, r_queue, global_running_r), ) for j, i in jobs] 187 | [p.start() for p in ps] 188 | [p.join() for p in ps[2:]] 189 | 190 | ep_r = [] 191 | while not r_queue.empty(): 192 | ep_r.append(r_queue.get()) 193 | plt.plot(np.arange(len(ep_r)), ep_r) 194 | plt.title('Distributed training') 195 | plt.xlabel('Step') 196 | plt.ylabel('Total moving reward') 197 | plt.show() 198 | -------------------------------------------------------------------------------- /ann/Dense.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | 5 | from static import CKPT_DIR 6 | from utility import function 7 | from utility.logger import generate_model_logger 8 | 9 | 10 | class Dense(object): 11 | 12 | def __init__(self, x_space, y_space, hidden_units_list, **options): 13 | 14 | # Init x space, y space. 15 | self.x_space = x_space 16 | self.y_space = y_space 17 | 18 | # Init layer & neuron info. 19 | self.hidden_units_list = hidden_units_list 20 | self.hidden_layer_count = len(hidden_units_list) 21 | self.total_layer_count = self.hidden_layer_count + 1 22 | 23 | # Init weights, biases. 24 | self.weights, self.biases = {}, {} 25 | 26 | # Init a, z, outputs caches. 27 | self.z_outputs, self.z_inputs = {}, {} 28 | 29 | # Init deltas caches. 30 | self.deltas = {} 31 | 32 | self._validate_parameters() 33 | self._init_func_map() 34 | self._init_options(options) 35 | self._init_weights_and_biases() 36 | 37 | def _init_weights_and_biases(self): 38 | # Hidden Layer. 39 | for index, hidden_units in enumerate(self.hidden_units_list): 40 | # x_space is the shape of last layer, and the shape of weight of current layer. 41 | x_space = self.x_space if index == 0 else self.hidden_units_list[index - 1] 42 | # hidden_units is shape of current layer, also neuron count. 43 | weights, biases = np.random.normal(0, 0.01, (hidden_units, x_space)), np.zeros((hidden_units, 1)) 44 | self.weights[index], self.biases[index] = weights, biases 45 | # Output Layer. 46 | x_space = self.hidden_units_list[-1] 47 | weights, biases = np.random.normal(0, 0.01, (self.y_space, x_space)), np.zeros((self.y_space, 1)) 48 | self.weights[self.total_layer_count - 1], self.biases[self.total_layer_count - 1] = weights, biases 49 | 50 | def _validate_parameters(self): 51 | if self.hidden_layer_count == 0 or len(self.hidden_units_list) == 0: 52 | raise ValueError('Layer count or neuron count list cannot be zero.') 53 | if self.hidden_layer_count != len(self.hidden_units_list): 54 | raise ValueError('Layer count should be equal to length of neuron count list.') 55 | 56 | def _init_func_map(self): 57 | # Init Activation Func and Grad Map. 58 | self.activation_grad_map = { 59 | function.relu: np.vectorize(function.grad_relu), 60 | function.tanh: np.vectorize(function.grad_tanh), 61 | function.linear: np.vectorize(function.grad_linear), 62 | function.sigmoid: np.vectorize(function.grad_sigmoid), 63 | } 64 | self.grad_loss_map = { 65 | function.softmax_cross_entropy: function.grad_softmax_cross_entropy, 66 | function.mean_square_error: function.grad_mean_square_error 67 | } 68 | 69 | def _init_options(self, options): 70 | 71 | try: 72 | self.model_name = options['model_name'] 73 | except KeyError: 74 | self.model_name = 'model' 75 | finally: 76 | if not isinstance(self.model_name, str): 77 | raise ValueError('Model name must be a str.') 78 | 79 | try: 80 | self.mode = options['mode'] 81 | except KeyError: 82 | self.mode = 'train' 83 | 84 | # Init Activation Func and Grad Func. 85 | try: 86 | self.activation_funcs = options['activation_funcs'] 87 | except KeyError: 88 | self.activation_funcs = [function.tanh] * self.hidden_layer_count 89 | self.activation_funcs.append(function.linear) 90 | finally: 91 | if len(self.activation_funcs) != self.total_layer_count: 92 | raise ValueError('Activation func count should be equal to total layer count.') 93 | 94 | try: 95 | self.grad_activation_funcs = [self.activation_grad_map[act_func] for act_func in self.activation_funcs] 96 | self.activation_funcs = [np.vectorize(act_func) for act_func in self.activation_funcs] 97 | except KeyError: 98 | raise KeyError('Grad func not exists.') 99 | 100 | try: 101 | self.loss_func = options['loss_func'] 102 | except KeyError: 103 | self.loss_func = function.mean_square_error 104 | finally: 105 | self.grad_func = self.grad_loss_map[self.loss_func] 106 | # Enable softmax. 107 | if self.grad_func == self.grad_loss_map[function.softmax_cross_entropy]: 108 | self.enable_softmax = True 109 | else: 110 | self.enable_softmax = False 111 | 112 | # Init Batch Size. 113 | try: 114 | self.batch_size = options['batch_size'] 115 | except KeyError: 116 | self.batch_size = 16 117 | finally: 118 | if self.batch_size < 1: 119 | raise ValueError('Batch size must larger than 1.') 120 | 121 | # Init Learning Rate. 122 | try: 123 | self.learning_rate = options['learning_rate'] 124 | except KeyError: 125 | self.learning_rate = 0.003 126 | finally: 127 | if self.learning_rate < 0.0: 128 | raise ValueError('Learning rate must be positive.') 129 | 130 | try: 131 | self.max_epoch = options['max_epoch'] 132 | except KeyError: 133 | self.max_epoch = 3000 134 | finally: 135 | if self.max_epoch < 1: 136 | raise ValueError('Epoch must be larger than 1.') 137 | 138 | try: 139 | self.enable_logger = options['enable_logger'] 140 | except KeyError: 141 | self.enable_logger = True 142 | finally: 143 | if self.enable_logger: 144 | self.logger = generate_model_logger(self.model_name) 145 | 146 | self.history_loss = [] 147 | 148 | def _forward(self, input_batch): 149 | # Temporal result, a_batch. 150 | z_input = input_batch 151 | # Forward layer by layer. 152 | for layer_index in range(self.total_layer_count): 153 | # Get weights and biases. 154 | weights, biases = self.weights[layer_index], self.biases[layer_index] 155 | # Save result as grad w. 156 | self.z_inputs[layer_index] = z_input 157 | z_output = np.dot(z_input, weights.T) + biases.T 158 | # Save result of a for backward. 159 | self.z_outputs[layer_index] = z_output 160 | # z_input is also called a_output. 161 | z_input = self.activation_funcs[layer_index](z_output) 162 | return z_input 163 | 164 | def _backward(self, error): 165 | # error here is shape of (batch_size, y_space) 166 | for index in np.arange(0, self.total_layer_count)[::-1]: 167 | # dl/dw = dz/dw * da/dz * (dl/da) | x = x_batch. 168 | z_outputs = self.z_outputs[index] 169 | # Get grad of activation func. 170 | grad_activation_func = self.grad_activation_funcs[index] 171 | # Calculate da/dz. 172 | grad_z_batch = grad_activation_func(z_outputs) 173 | # Calculate dl/da * da/dz. 174 | delta = error * grad_z_batch 175 | # Save delta. 176 | self.deltas[index] = delta 177 | # Update error, dz/da 178 | error = np.dot(delta, self.weights[index]) 179 | 180 | def _update_weights_and_biases(self): 181 | for index in range(self.total_layer_count): 182 | # Get z_input and delta. 183 | z_input, delta = self.z_inputs[index], self.deltas[index] 184 | # Calculate grad weights, grad biases, dl/da * da/dz * dz/dw 185 | grad_weights = -np.dot(delta.T, z_input) 186 | grad_biases = -np.mean(delta, axis=0).reshape(self.biases[index].shape) 187 | # Update weights, biases. 188 | self.weights[index] -= self.learning_rate * grad_weights 189 | self.biases[index] -= self.learning_rate * grad_biases 190 | 191 | def train(self, x_data, y_data): 192 | iteration, epoch, x_data_count = 0, 0, len(x_data) 193 | while epoch < self.max_epoch: 194 | s_index, e_index, epoch_loss = 0, self.batch_size, [] 195 | while True: 196 | # Generate batch x, y 197 | x_batch, y_batch = x_data[s_index: e_index], y_data[s_index: e_index] 198 | # Calculate y_predict. 199 | y_predict = self._forward(x_batch) 200 | # Calculate loss. 201 | loss = self.loss_func(y_predict, y_batch) 202 | epoch_loss.append(loss) 203 | # Calculate error. 204 | error = self.grad_func(y_predict, y_batch) 205 | # Bp & Update. 206 | self._backward(error) 207 | self._update_weights_and_biases() 208 | # Update index. 209 | s_index += self.batch_size 210 | e_index = s_index + self.batch_size 211 | # Add iteration. 212 | iteration += 1 213 | if e_index > len(x_data): 214 | mean_epoch_loss = np.mean(epoch_loss) 215 | self.history_loss.append(mean_epoch_loss) 216 | break 217 | if epoch % 100 == 0: 218 | self.save() 219 | self.evaluate(x_data, y_data) 220 | self.logger.warning("Epoch: {:d} | loss: {:.6f}".format(epoch, mean_epoch_loss)) 221 | epoch += 1 222 | 223 | def predict(self, x_batch): 224 | if self.enable_softmax: 225 | result = function.softmax(self._forward(x_batch)) 226 | else: 227 | result = self._forward(x_batch) 228 | return result 229 | 230 | def evaluate(self, x_data, y_data): 231 | y_label, y_output = np.argmax(y_data, axis=1), np.argmax(self.predict(x_data), axis=1) 232 | self.logger.warning("Accuracy: {:.3f} ".format(np.sum(y_label == y_output) / len(x_data))) 233 | 234 | def save(self): 235 | save_dir = os.path.join(CKPT_DIR, self.model_name) 236 | if not os.path.exists(save_dir): 237 | os.makedirs(save_dir) 238 | with open(os.path.join(save_dir, 'weights.json'), 'w') as fp: 239 | weights = [weights.tolist() for weights in self.weights.values()] 240 | json.dump(weights, fp, indent=True) 241 | with open(os.path.join(save_dir, 'biases.json'), 'w') as fp: 242 | biases = [biases.tolist() for biases in self.biases.values()] 243 | json.dump(biases, fp, indent=True) 244 | self.logger.warning("Model saved.") 245 | 246 | def restore(self): 247 | save_dir = os.path.join(CKPT_DIR, self.model_name) 248 | try: 249 | with open(os.path.join(save_dir, 'weights.json'), 'r') as fp: 250 | weights = json.load(fp) 251 | for index in range(self.total_layer_count): 252 | self.weights[index] = np.array(weights[index]) 253 | except FileNotFoundError: 254 | raise FileNotFoundError('Weights not exists.') 255 | 256 | try: 257 | with open(os.path.join(save_dir, 'biases.json'), 'r') as fp: 258 | biases = json.load(fp) 259 | for index in range(self.total_layer_count): 260 | self.biases[index] = np.array(biases[index]) 261 | except FileNotFoundError: 262 | raise FileNotFoundError('biases not exists.') 263 | 264 | self.logger.warning("Model restored.") 265 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/DoubleDQN-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Double DQN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 背景" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Double DQN是DQN(Deep Q Network)的一种改进,旨在解决DQN训练过程中存在的过估计(Overestimating)问题。在训练过程中,与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个,从而一定程度上避免了过度估计,提高了训练DQN的稳定性和速度。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## DQN" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "即Deep Q Network,DQN有两个网络,分别是预测网络(Predict Q Network)和目标网络(Target Q Network),预测网络用来预测当前状态对应各个动作的Q值,目标网络用来预测下一个,或者下第几个状态各个动作的Q值,这个取决于训练过程采用时间差分(Temporal Difference)还是蒙特卡洛(MC)方法,以TD的训练过程为例,我们期望对采样过的每一个状态、动作、奖励元组最小化下式:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "$$\n", 43 | "\\left( Q \\left( s_j, a_j; \\theta \\right) - y_j \\right)^2\n", 44 | "$$" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "其中Q对应的θ即预测网络,它接受当前状态,输出一个当前状态对应各个动作的Q值,然后选取当前动作对应的那个Q值。$y_j$是Ground Truth标签,它是由目标网络计算得出:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "$$\n", 59 | "y_j=\n", 60 | "\\begin{cases}\n", 61 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 62 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( s_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 63 | "\\end{cases}\n", 64 | "$$" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "其中Q对应的θ-即是目标网络,当采取这个动作后回合结束,则标签即是这次动作产生的奖励,如果回合未结束,则标签将由两部分构成,第一部分即是这次动作产生的奖励,另一部分则是由目标网络计算,即计算下一个状态各个动作对应的Q值,然后选取最大的那个Q值。" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "需要注意的是,目标网络的参数是设定是不可train的,在训练经过M次后,我们会将预测网络被更新的全部参数复制给目标网络,其中M次的M是一个可调的超参数,这样的一个直觉的好处就是避免了震荡。" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Experience Replay" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "在实作中,On-Policy的DQN表现非常不稳定,一种改进被称之为经验回放(Experience Replay)的技术通过缓存每一步状态、动作、奖励、下一状态元组,在一回合结束后批量训练多次,将On-Policy的过程转化为Off-Policy,提高了DQN的训练速度和稳定性,具体的实现非常直觉,即维护一个指定大小的缓存数组,每回合用新产生的N个状态、动作、奖励、下一状态元组随机替换掉缓存池中现有的N个,然后再回合结束后做数次训练。" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Double DQN" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "正如背景中提到的:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "> 与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个。\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Double DQN与DQN相同的是他们都有被称之为预测网络与目标网络的两个网络,只是在实作过程中,标签的计算过程做了修正:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "$$\n", 128 | "y_j=\n", 129 | "\\begin{cases}\n", 130 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 131 | "r_j + \\gamma Q \\left( s_{j+1}, \\max_{a^{\\prime}} Q \\left (s_{j+1}, a^{\\prime}; \\theta\\right) ; \\theta^{-} \\right)& \\text{otherwise}\n", 132 | "\\end{cases}\n", 133 | "$$\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "可以看出这个修正非常地直觉,在实验中,也确实要比原始的DQN训练稳定。" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Experiment" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 160 | " return f(*args, **kwds)\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# coding=utf-8\n", 166 | "\n", 167 | "import numpy as np\n", 168 | "import gym\n", 169 | "\n", 170 | "import sys\n", 171 | "sys.path.append('..')\n", 172 | "\n", 173 | "from base.model import *\n", 174 | "from utility.launcher import start_game\n", 175 | "\n", 176 | "\n", 177 | "class Agent(BaseRLModel):\n", 178 | "\n", 179 | " def __init__(self, a_space, s_space, **options):\n", 180 | " super(Agent, self).__init__(a_space, s_space, **options)\n", 181 | "\n", 182 | " self._init_input()\n", 183 | " self._init_nn()\n", 184 | " self._init_op()\n", 185 | " self._init_saver()\n", 186 | "\n", 187 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 188 | " self.buffer_count = 0\n", 189 | "\n", 190 | " self.update_target_net_step = 200\n", 191 | "\n", 192 | " self.session.run(tf.global_variables_initializer())\n", 193 | "\n", 194 | " def _init_input(self, *args):\n", 195 | " with tf.variable_scope('input'):\n", 196 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 197 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 198 | " self.q_n = tf.placeholder(tf.float32, [None, ])\n", 199 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 200 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 201 | "\n", 202 | " def _init_nn(self, *args):\n", 203 | " # w,b initializer\n", 204 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 205 | " b_initializer = tf.constant_initializer(0.1)\n", 206 | "\n", 207 | " with tf.variable_scope('predict_q_net'):\n", 208 | " phi_state = tf.layers.dense(self.s,\n", 209 | " 64,\n", 210 | " tf.nn.relu,\n", 211 | " kernel_initializer=w_initializer,\n", 212 | " bias_initializer=b_initializer)\n", 213 | "\n", 214 | " self.q_predict = tf.layers.dense(phi_state,\n", 215 | " self.a_space,\n", 216 | " kernel_initializer=w_initializer,\n", 217 | " bias_initializer=b_initializer)\n", 218 | "\n", 219 | " with tf.variable_scope('target_q_net'):\n", 220 | " phi_state_next = tf.layers.dense(self.s_n,\n", 221 | " 64,\n", 222 | " tf.nn.relu,\n", 223 | " kernel_initializer=w_initializer,\n", 224 | " bias_initializer=b_initializer,\n", 225 | " trainable=False)\n", 226 | "\n", 227 | " self.q_target = tf.layers.dense(phi_state_next,\n", 228 | " self.a_space,\n", 229 | " kernel_initializer=w_initializer,\n", 230 | " bias_initializer=b_initializer,\n", 231 | " trainable=False)\n", 232 | "\n", 233 | " def _init_op(self):\n", 234 | "\n", 235 | " with tf.variable_scope('q_predict'):\n", 236 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 237 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 238 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 239 | "\n", 240 | " with tf.variable_scope('loss'):\n", 241 | " self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval)\n", 242 | "\n", 243 | " with tf.variable_scope('train'):\n", 244 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 245 | "\n", 246 | " with tf.variable_scope('update_target_net'):\n", 247 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 248 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 249 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 250 | "\n", 251 | " def predict(self, s):\n", 252 | " if np.random.uniform() < self.epsilon or self.mode == 'test':\n", 253 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 254 | " else:\n", 255 | " a = np.random.randint(0, self.a_space)\n", 256 | " return a\n", 257 | "\n", 258 | " def snapshot(self, s, a, r, s_n):\n", 259 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 260 | " self.buffer_count += 1\n", 261 | "\n", 262 | " def train(self):\n", 263 | "\n", 264 | " for train_step in range(self.train_steps):\n", 265 | " # Update target net if need.\n", 266 | " if self.training_step % self.update_target_net_step == 0:\n", 267 | " self.session.run(self.update_q_net)\n", 268 | " # Get batch.\n", 269 | " if self.buffer_count < self.buffer_size:\n", 270 | " batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :]\n", 271 | " else:\n", 272 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 273 | "\n", 274 | " s = batch[:, :self.s_space]\n", 275 | " s_n = batch[:, -self.s_space:]\n", 276 | " a = batch[:, self.s_space].reshape((-1))\n", 277 | " r = batch[:, self.s_space + 1]\n", 278 | "\n", 279 | " # 1. Calculate q_next_predict and q_next_target.\n", 280 | " q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], {\n", 281 | " self.s: s_n, self.s_n: s_n\n", 282 | " })\n", 283 | "\n", 284 | " # 2. Select a_indices in q_next_predict.\n", 285 | " a_indices = np.argmax(q_next_predict, axis=1)\n", 286 | "\n", 287 | " # 3. Select Q values with a_indices\n", 288 | " q_next = q_next_target[np.arange(0, self.batch_size), a_indices]\n", 289 | "\n", 290 | " # 4. Calculate q_real.\n", 291 | " q_real = r + self.gamma * q_next\n", 292 | "\n", 293 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 294 | " self.s: s, self.a: a, self.q_n: q_real\n", 295 | " })\n", 296 | "\n", 297 | " self.training_step += 1" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Running" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "if __name__ == '__main__':\n", 314 | "\n", 315 | " def main(_):\n", 316 | " # Make env.\n", 317 | " env = gym.make('CartPole-v0')\n", 318 | " env.seed(1)\n", 319 | " env = env.unwrapped\n", 320 | " # Init agent.\n", 321 | " agent = Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'PPO',\n", 323 | " KEY_TRAIN_EPISODE: 10000\n", 324 | " })\n", 325 | " start_game(env, agent)\n", 326 | "\n", 327 | "\n", 328 | " if __name__ == '__main__':\n", 329 | " tf.app.run()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 结尾" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "略了。" 344 | ] 345 | } 346 | ], 347 | "metadata": { 348 | "kernelspec": { 349 | "display_name": "Python 3", 350 | "language": "python", 351 | "name": "python3" 352 | }, 353 | "language_info": { 354 | "codemirror_mode": { 355 | "name": "ipython", 356 | "version": 3 357 | }, 358 | "file_extension": ".py", 359 | "mimetype": "text/x-python", 360 | "name": "python", 361 | "nbconvert_exporter": "python", 362 | "pygments_lexer": "ipython3", 363 | "version": "3.5.4" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /note/DoubleDQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Double DQN" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 背景" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Double DQN是DQN(Deep Q Network)的一种改进,旨在解决DQN训练过程中存在的过估计(Overestimating)问题。在训练过程中,与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个,从而一定程度上避免了过度估计,提高了训练DQN的稳定性和速度。" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## DQN" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "即Deep Q Network,DQN有两个网络,分别是预测网络(Predict Q Network)和目标网络(Target Q Network),预测网络用来预测当前状态对应各个动作的Q值,目标网络用来预测下一个,或者下第几个状态各个动作的Q值,这个取决于训练过程采用时间差分(Temporal Difference)还是蒙特卡洛(MC)方法,以TD的训练过程为例,我们期望对采样过的每一个状态、动作、奖励元组最小化下式:" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "$$\n", 43 | "\\left( Q \\left( s_j, a_j; \\theta \\right) - y_j \\right)^2\n", 44 | "$$" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "其中Q对应的θ即预测网络,它接受当前状态,输出一个当前状态对应各个动作的Q值,然后选取当前动作对应的那个Q值。$y_j$是Ground Truth标签,它是由目标网络计算得出:" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "$$\n", 59 | "y_j=\n", 60 | "\\begin{cases}\n", 61 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 62 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( s_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 63 | "\\end{cases}\n", 64 | "$$" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "其中Q对应的θ-即是目标网络,当采取这个动作后回合结束,则标签即是这次动作产生的奖励,如果回合未结束,则标签将由两部分构成,第一部分即是这次动作产生的奖励,另一部分则是由目标网络计算,即计算下一个状态各个动作对应的Q值,然后选取最大的那个Q值。" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "需要注意的是,目标网络的参数是设定是不可train的,在训练经过M次后,我们会将预测网络被更新的全部参数复制给目标网络,其中M次的M是一个可调的超参数,这样的一个直觉的好处就是避免了震荡。" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Experience Replay" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "在实作中,On-Policy的DQN表现非常不稳定,一种改进被称之为经验回放(Experience Replay)的技术通过缓存每一步状态、动作、奖励、下一状态元组,在一回合结束后批量训练多次,将On-Policy的过程转化为Off-Policy,提高了DQN的训练速度和稳定性,具体的实现非常直觉,即维护一个指定大小的缓存数组,每回合用新产生的N个状态、动作、奖励、下一状态元组随机替换掉缓存池中现有的N个,然后再回合结束后做数次训练。" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Double DQN" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "正如背景中提到的:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "> 与DQN直接选取目标网络(Target Q Network)中下一个State各个Action对应的Q值最大的那一个Q值不同,Double DQN的核心精神在于,它首先使用预测网络(Predict Q Network)计算下一个State的对应各个Action的Q值,然后选取最大的那个Q值对应Action的索引,再使用目标网络计算该状态的对应各个状态的Q值,然后选取预测网络中给定Action索引对应的Q值,但是它可能不是最大的那个。\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "Double DQN与DQN相同的是他们都有被称之为预测网络与目标网络的两个网络,只是在实作过程中,标签的计算过程做了修正:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "$$\n", 128 | "y_j=\n", 129 | "\\begin{cases}\n", 130 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 131 | "r_j + \\gamma Q \\left( s_{j+1}, \\max_{a^{\\prime}} Q \\left (s_{j+1}, a^{\\prime}; \\theta\\right) ; \\theta^{-} \\right)& \\text{otherwise}\n", 132 | "\\end{cases}\n", 133 | "$$\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "可以看出这个修正非常地直觉,在实验中,也确实要比原始的DQN训练稳定。" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Experiment" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 5, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stderr", 157 | "output_type": "stream", 158 | "text": [ 159 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 160 | " return f(*args, **kwds)\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "# coding=utf-8\n", 166 | "\n", 167 | "import numpy as np\n", 168 | "import gym\n", 169 | "\n", 170 | "import sys\n", 171 | "sys.path.append('..')\n", 172 | "\n", 173 | "from base.model import *\n", 174 | "from utility.launcher import start_game\n", 175 | "\n", 176 | "\n", 177 | "class Agent(BaseRLModel):\n", 178 | "\n", 179 | " def __init__(self, a_space, s_space, **options):\n", 180 | " super(Agent, self).__init__(a_space, s_space, **options)\n", 181 | "\n", 182 | " self._init_input()\n", 183 | " self._init_nn()\n", 184 | " self._init_op()\n", 185 | " self._init_saver()\n", 186 | "\n", 187 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 188 | " self.buffer_count = 0\n", 189 | "\n", 190 | " self.update_target_net_step = 200\n", 191 | "\n", 192 | " self.session.run(tf.global_variables_initializer())\n", 193 | "\n", 194 | " def _init_input(self, *args):\n", 195 | " with tf.variable_scope('input'):\n", 196 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 197 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 198 | " self.q_n = tf.placeholder(tf.float32, [None, ])\n", 199 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 200 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 201 | "\n", 202 | " def _init_nn(self, *args):\n", 203 | " # w,b initializer\n", 204 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 205 | " b_initializer = tf.constant_initializer(0.1)\n", 206 | "\n", 207 | " with tf.variable_scope('predict_q_net'):\n", 208 | " phi_state = tf.layers.dense(self.s,\n", 209 | " 64,\n", 210 | " tf.nn.relu,\n", 211 | " kernel_initializer=w_initializer,\n", 212 | " bias_initializer=b_initializer)\n", 213 | "\n", 214 | " self.q_predict = tf.layers.dense(phi_state,\n", 215 | " self.a_space,\n", 216 | " kernel_initializer=w_initializer,\n", 217 | " bias_initializer=b_initializer)\n", 218 | "\n", 219 | " with tf.variable_scope('target_q_net'):\n", 220 | " phi_state_next = tf.layers.dense(self.s_n,\n", 221 | " 64,\n", 222 | " tf.nn.relu,\n", 223 | " kernel_initializer=w_initializer,\n", 224 | " bias_initializer=b_initializer,\n", 225 | " trainable=False)\n", 226 | "\n", 227 | " self.q_target = tf.layers.dense(phi_state_next,\n", 228 | " self.a_space,\n", 229 | " kernel_initializer=w_initializer,\n", 230 | " bias_initializer=b_initializer,\n", 231 | " trainable=False)\n", 232 | "\n", 233 | " def _init_op(self):\n", 234 | "\n", 235 | " with tf.variable_scope('q_predict'):\n", 236 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 237 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 238 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 239 | "\n", 240 | " with tf.variable_scope('loss'):\n", 241 | " self.loss_func = tf.losses.mean_squared_error(self.q_n, self.q_eval)\n", 242 | "\n", 243 | " with tf.variable_scope('train'):\n", 244 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 245 | "\n", 246 | " with tf.variable_scope('update_target_net'):\n", 247 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 248 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 249 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 250 | "\n", 251 | " def predict(self, s):\n", 252 | " if np.random.uniform() < self.epsilon or self.mode == 'test':\n", 253 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 254 | " else:\n", 255 | " a = np.random.randint(0, self.a_space)\n", 256 | " return a\n", 257 | "\n", 258 | " def snapshot(self, s, a, r, s_n):\n", 259 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 260 | " self.buffer_count += 1\n", 261 | "\n", 262 | " def train(self):\n", 263 | "\n", 264 | " for train_step in range(self.train_steps):\n", 265 | " # Update target net if need.\n", 266 | " if self.training_step % self.update_target_net_step == 0:\n", 267 | " self.session.run(self.update_q_net)\n", 268 | " # Get batch.\n", 269 | " if self.buffer_count < self.buffer_size:\n", 270 | " batch = self.buffer[np.random.choice(self.buffer_count, size=self.batch_size), :]\n", 271 | " else:\n", 272 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 273 | "\n", 274 | " s = batch[:, :self.s_space]\n", 275 | " s_n = batch[:, -self.s_space:]\n", 276 | " a = batch[:, self.s_space].reshape((-1))\n", 277 | " r = batch[:, self.s_space + 1]\n", 278 | "\n", 279 | " # 1. Calculate q_next_predict and q_next_target.\n", 280 | " q_next_predict, q_next_target = self.session.run([self.q_predict, self.q_target], {\n", 281 | " self.s: s_n, self.s_n: s_n\n", 282 | " })\n", 283 | "\n", 284 | " # 2. Select a_indices in q_next_predict.\n", 285 | " a_indices = np.argmax(q_next_predict, axis=1)\n", 286 | "\n", 287 | " # 3. Select Q values with a_indices\n", 288 | " q_next = q_next_target[np.arange(0, self.batch_size), a_indices]\n", 289 | "\n", 290 | " # 4. Calculate q_real.\n", 291 | " q_real = r + self.gamma * q_next\n", 292 | "\n", 293 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 294 | " self.s: s, self.a: a, self.q_n: q_real\n", 295 | " })\n", 296 | "\n", 297 | " self.training_step += 1" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Running" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "if __name__ == '__main__':\n", 314 | "\n", 315 | " def main(_):\n", 316 | " # Make env.\n", 317 | " env = gym.make('CartPole-v0')\n", 318 | " env.seed(1)\n", 319 | " env = env.unwrapped\n", 320 | " # Init agent.\n", 321 | " agent = Agent(env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'PPO',\n", 323 | " KEY_TRAIN_EPISODE: 10000\n", 324 | " })\n", 325 | " start_game(env, agent)\n", 326 | "\n", 327 | "\n", 328 | " if __name__ == '__main__':\n", 329 | " tf.app.run()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 结尾" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "相对于DQN,运气到了可能3000步在小车倒立杆收敛,Double DQN如果运气到了可能只需要1500步。" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [] 352 | } 353 | ], 354 | "metadata": { 355 | "kernelspec": { 356 | "display_name": "Python 3", 357 | "language": "python", 358 | "name": "python3" 359 | }, 360 | "language_info": { 361 | "codemirror_mode": { 362 | "name": "ipython", 363 | "version": 3 364 | }, 365 | "file_extension": ".py", 366 | "mimetype": "text/x-python", 367 | "name": "python", 368 | "nbconvert_exporter": "python", 369 | "pygments_lexer": "ipython3", 370 | "version": "3.5.4" 371 | } 372 | }, 373 | "nbformat": 4, 374 | "nbformat_minor": 2 375 | } 376 | -------------------------------------------------------------------------------- /note/DQN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# 问题设定" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "在小车倒立杆(CartPole)游戏中,我们希望通过强化学习训练一个智能体(agent),尽可能不断地左右移动小车,使得小车上的杆不倒,我们首先定义CartPole游戏:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "CartPole游戏即是强化学习模型的enviorment,它与agent交互,实时更新state,内部定义了reward function,其中state有以下定义:" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "state每一个维度分别代表了:\n", 31 | "\n", 32 | "- 小车位置,它的取值范围是-2.4到2.4\n", 33 | "- 小车速度,它的取值范围是负无穷到正无穷\n", 34 | "- 杆的角度,它的取值范围是-41.8°到41.8°\n", 35 | "- 杆的角速,它的取值范围是负无穷到正无穷" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "action是一个2维向量,每一个维度分别代表向左和向右移动。" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "$$\n", 50 | "action \\in \\mathbb{R}^2\n", 51 | "$$" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "# DQN" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "我们将设计一个网络,作为状态-动作值函数(state-action value function),其输入是state,输出是对应各个action的value,并TD(Temporal Difference)进行迭代训练直至收敛。我们将定义两个这样的网络,分别记作$\\theta$和$\\theta^-$,分别代表估计网络与目标网络。" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "我们希望最小化:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "$$\n", 80 | "\\left( y_j - Q \\left( \\phi_j, a_j; \\theta \\right) \\right)^2\n", 81 | "$$" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "其中,$a_j$具有以下形式:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "$$\n", 96 | "a_j = \\mathrm{argmax}_{a} Q \\left( \\phi(s_j), a; \\theta\\right)\n", 97 | "$$" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "其中,$y_j$具有以下形式:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "$$\n", 112 | "f(x)=\n", 113 | "\\begin{cases}\n", 114 | "r_j & \\text{if episode ends at j + 1}\\\\\n", 115 | "r_j + \\gamma \\max_{a^{\\prime}} \\hat{Q} \\left( \\phi_{j+1}, a^{\\prime}; \\theta^{-} \\right)& \\text{otherwise}\n", 116 | "\\end{cases}$$\n", 117 | "\n" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "在最小化TD-Error时,我们将固定目标网络,只对估计网络做梯度反向传播,每次到达一定迭代次数后,将估计网络的权重复制到目标网络。在这个过程中,需要用到经验回放(Experience Replay)技术,即将每一次迭代观测到的$s_t, r_t, a_t, s_{t+1}$作为一个元组缓存,然后在这些缓存中随机抽取元组做批次梯度下降。" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# 代码实现" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 2, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stderr", 141 | "output_type": "stream", 142 | "text": [ 143 | "/Users/shuyu/anaconda3/envs/quant/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5\n", 144 | " return f(*args, **kwds)\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "# coding=utf-8\n", 150 | "\n", 151 | "import tensorflow as tf\n", 152 | "import numpy as np\n", 153 | "import gym\n", 154 | "import sys\n", 155 | "\n", 156 | "sys.path.append('..')\n", 157 | "\n", 158 | "from base.model import *\n", 159 | "\n", 160 | "%matplotlib inline" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 3, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "class Agent(BaseRLModel):\n", 170 | "\n", 171 | " def __init__(self, session, env, a_space, s_space, **options):\n", 172 | " super(Agent, self).__init__(session, env, a_space, s_space, **options)\n", 173 | "\n", 174 | " self._init_input()\n", 175 | " self._init_nn()\n", 176 | " self._init_op()\n", 177 | " self._init_saver()\n", 178 | "\n", 179 | " self.buffer = np.zeros((self.buffer_size, self.s_space + 1 + 1 + self.s_space))\n", 180 | " self.buffer_count = 0\n", 181 | "\n", 182 | " self.total_train_step = 0\n", 183 | "\n", 184 | " self.update_target_net_step = 200\n", 185 | "\n", 186 | " self.session.run(tf.global_variables_initializer())\n", 187 | "\n", 188 | " def _init_input(self, *args):\n", 189 | " with tf.variable_scope('input'):\n", 190 | " self.s_n = tf.placeholder(tf.float32, [None, self.s_space])\n", 191 | " self.s = tf.placeholder(tf.float32, [None, self.s_space])\n", 192 | " self.r = tf.placeholder(tf.float32, [None, ])\n", 193 | " self.a = tf.placeholder(tf.int32, [None, ])\n", 194 | "\n", 195 | " def _init_nn(self, *args):\n", 196 | " with tf.variable_scope('actor_net'):\n", 197 | " # w,b initializer\n", 198 | " w_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.3)\n", 199 | " b_initializer = tf.constant_initializer(0.1)\n", 200 | "\n", 201 | " with tf.variable_scope('predict_q_net'):\n", 202 | " phi_state = tf.layers.dense(self.s,\n", 203 | " 32,\n", 204 | " tf.nn.relu,\n", 205 | " kernel_initializer=w_initializer,\n", 206 | " bias_initializer=b_initializer)\n", 207 | "\n", 208 | " self.q_predict = tf.layers.dense(phi_state,\n", 209 | " self.a_space,\n", 210 | " kernel_initializer=w_initializer,\n", 211 | " bias_initializer=b_initializer)\n", 212 | "\n", 213 | " with tf.variable_scope('target_q_net'):\n", 214 | " phi_state_next = tf.layers.dense(self.s_n,\n", 215 | " 32,\n", 216 | " tf.nn.relu,\n", 217 | " kernel_initializer=w_initializer,\n", 218 | " bias_initializer=b_initializer)\n", 219 | "\n", 220 | " self.q_target = tf.layers.dense(phi_state_next,\n", 221 | " self.a_space,\n", 222 | " kernel_initializer=w_initializer,\n", 223 | " bias_initializer=b_initializer)\n", 224 | "\n", 225 | " def _init_op(self):\n", 226 | " with tf.variable_scope('q_real'):\n", 227 | " # size of q_value_real is [BATCH_SIZE, 1]\n", 228 | " max_q_value = tf.reduce_max(self.q_target, axis=1)\n", 229 | " q_next = self.r + self.gamma * max_q_value\n", 230 | " self.q_next = tf.stop_gradient(q_next)\n", 231 | "\n", 232 | " with tf.variable_scope('q_predict'):\n", 233 | " # size of q_value_predict is [BATCH_SIZE, 1]\n", 234 | " action_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)\n", 235 | " self.q_eval = tf.gather_nd(self.q_predict, action_indices)\n", 236 | "\n", 237 | " with tf.variable_scope('loss'):\n", 238 | " self.loss_func = tf.reduce_mean(tf.squared_difference(self.q_next, self.q_eval, name='mse'))\n", 239 | "\n", 240 | " with tf.variable_scope('train'):\n", 241 | " self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss_func)\n", 242 | "\n", 243 | " with tf.variable_scope('update_target_net'):\n", 244 | " t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_net')\n", 245 | " p_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='predict_q_net')\n", 246 | " self.update_q_net = [tf.assign(t, e) for t, e in zip(t_params, p_params)]\n", 247 | "\n", 248 | " def predict(self, s):\n", 249 | " if np.random.uniform() < self.epsilon:\n", 250 | " a = np.argmax(self.session.run(self.q_predict, feed_dict={self.s: s[np.newaxis, :]}))\n", 251 | " else:\n", 252 | " a = np.random.randint(0, self.a_space)\n", 253 | " return a\n", 254 | "\n", 255 | " def snapshot(self, s, a, r, s_n):\n", 256 | " self.buffer[self.buffer_count % self.buffer_size, :] = np.hstack((s, [a, r], s_n))\n", 257 | " self.buffer_count += 1\n", 258 | "\n", 259 | " def train(self):\n", 260 | " if self.total_train_step % self.update_target_net_step == 0:\n", 261 | " self.session.run(self.update_q_net)\n", 262 | "\n", 263 | " batch = self.buffer[np.random.choice(self.buffer_size, size=self.batch_size), :]\n", 264 | "\n", 265 | " s = batch[:, :self.s_space]\n", 266 | " s_n = batch[:, -self.s_space:]\n", 267 | " a = batch[:, self.s_space].reshape((-1))\n", 268 | " r = batch[:, self.s_space + 1]\n", 269 | "\n", 270 | " _, cost = self.session.run([self.train_op, self.loss_func], {\n", 271 | " self.s: s, self.a: a, self.r: r, self.s_n: s_n\n", 272 | " })\n", 273 | "\n", 274 | " def run(self):\n", 275 | " if self.mode == 'train':\n", 276 | " for episode in range(self.train_episodes):\n", 277 | " s, r_episode = self.env.reset(), 0\n", 278 | " while True:\n", 279 | " # if episode > 400:\n", 280 | " # self.env.render()\n", 281 | " a = self.predict(s)\n", 282 | " s_n, r, done, _ = self.env.step(a)\n", 283 | " if done:\n", 284 | " r = -5\n", 285 | " r_episode += r\n", 286 | " self.snapshot(s, a, r_episode, s_n)\n", 287 | " s = s_n\n", 288 | " if done:\n", 289 | " break\n", 290 | " if self.buffer_count > self.buffer_size:\n", 291 | " self.train()\n", 292 | " if episode % 200 == 0:\n", 293 | " self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))\n", 294 | " self.save()\n", 295 | " else:\n", 296 | " for episode in range(self.eval_episodes):\n", 297 | " s, r_episode = self.env.reset()\n", 298 | " while True:\n", 299 | " a = self.predict(s)\n", 300 | " s_n, r, done, _ = self.env.step(a)\n", 301 | " r_episode += r\n", 302 | " s = s_n\n", 303 | " if done:\n", 304 | " break" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 4, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "def main(_):\n", 314 | " # Make env.\n", 315 | " env = gym.make('CartPole-v0')\n", 316 | " env.seed(1)\n", 317 | " env = env.unwrapped\n", 318 | " # Init session.\n", 319 | " session = tf.Session()\n", 320 | " # Init agent.\n", 321 | " agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{\n", 322 | " KEY_MODEL_NAME: 'DQN',\n", 323 | " KEY_TRAIN_EPISODE: 3000\n", 324 | " })\n", 325 | " agent.run()\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "\u001b[33mWARN: gym.spaces.Box autodetected dtype as . Please provide explicit dtype.\u001b[0m\n" 338 | ] 339 | }, 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "Episode: 0 | Rewards: 3.0\n", 345 | "Episode: 200 | Rewards: 4.0\n", 346 | "Episode: 400 | Rewards: 4.0\n", 347 | "Episode: 600 | Rewards: 4.0\n", 348 | "Episode: 800 | Rewards: 3.0\n", 349 | "Episode: 1000 | Rewards: 3.0\n", 350 | "Episode: 1200 | Rewards: 36.0\n", 351 | "Episode: 1400 | Rewards: 50.0\n", 352 | "Episode: 1600 | Rewards: 31.0\n", 353 | "Episode: 1800 | Rewards: 187.0\n" 354 | ] 355 | } 356 | ], 357 | "source": [ 358 | "main(_)" 359 | ] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": "Python 3", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.5.4" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 1 383 | } 384 | -------------------------------------------------------------------------------- /note/PPO.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Proximal Policy Optimization (PPO)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 背景" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Proximal Policy Optimization,简称PPO,即近端策略优化,是对Policy Graident,即策略梯度的一种改进算法。PPO的核心精神在于,通过一种被称之为Importce Sampling的方法,将Policy Gradient中On-policy的训练过程转化为Off-policy,即从在线学习转化为离线学习,某种意义上与基于值迭代算法中的Experience Replay有异曲同工之处。通过这个改进,训练速度与效果在实验上相较于Policy Gradient具有明显提升。" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Policy Gradient" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Policy Gradient是一种基于策略迭代的强化学习算法,不同于基于值迭代的DQN、Double-DQN、Duling-DQN通过间接地估计动作-状态值函数来学习的过程,Policy Gradient直接地通过采样状态、动作、奖励,然后期望直接最大化奖励的期望。PPO与PG都希望最大化奖励的期望,当采样足够充分时,奖励的期望可以近似为N回合的奖励的平均值:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "$$\n", 45 | "\\bar{R}_{\\theta} = \\sum_{\\tau} R(\\tau) P(\\tau \\lvert \\theta) \\approx \\frac{1}{N} \\sum^{N}_{n=1} R(\\tau^{n})\n", 46 | "$$" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "上式中的第n回合的奖励值之和$R(\\tau^n)$被定义为如下形式:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "$$\n", 61 | "R(\\tau) = \\sum^{T}_{t=1} r_t\n", 62 | "$$" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "在前篇专门介绍Policy Gradient文章中,已经详细地推导了关于$\\nabla \\bar{R}_{\\theta}$的计算方法,所以在这里的具体推导过程将略过,最后关于$\\nabla \\bar{R}_{\\theta}$的计算公式将有如下形式:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} R(\\tau^n) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "本质上是最小化N回合采样出的动作与网络输出的动作的交叉熵的基础上乘以$R(\\tau^n)$,奖励值给了梯度下降的方向,推导出了$\\nabla \\bar{R}_{\\theta}$,其实就已经可以根据梯度下降法反向传播改进网络进行训练了,但是通常情况下我们会根据具体的问题对$R(\\tau^n)$做一些修正。" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Actor-Critic Model" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "对$R(\\tau^n)$的修正通常情况下是必须的,也是有意义的,符合直觉的。以CartPole-v0与MountainCar-v0,即小车倒立杆和过山车游戏为例,每一个状态采取的动作对整个回合的奖励和是不同的,对于小车倒立杆问题而言,初始的几个状态采取的动作直接决定了杆是否会很快地倒,所以直觉地他们更加重要,而对于过山车问题而言,在小车即将爬上山时的这些状态采取的动作直接决定了小车能不能爬上山,所以直觉地他们更加重要。" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "这将引入我们的第一个改进,对于小车倒立杆问题而言,我们需要针对每一个状态、动作元组对$R(\\tau^n)$进行如下替换:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "$$\n", 114 | "R(\\tau^n) \\rightarrow \\sum^{T_n}_{t=t^{\\prime}} \\gamma^{t} r^{n}_{t}\n", 115 | "$$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "这样原来的梯度公式将会被改写为以下形式:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "$$\n", 130 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\sum^{T_n}_{t=t^{\\prime}} \\gamma^{t} r^{n}_{t} \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 131 | "$$" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "但是这样还存在一个称之为Overestimate,即过估计的问题。因为在实际情况中,我们的状态-动作采样通常是不充分的,这会导致一些一些动作或者状态几乎不会被采样,这样在进行梯度下降训练网络时,在这些状态对应的动作将可能被极大的放大或者缩小。由于输出层是soft-max,这些概率会此消彼长,这显然不是我们想看到的。所以我们需要做第二个改进:引入Baseline,通常可能是一个待调整的常超参数,或者Critic,通常是一个待训练的网络。" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "如果引入的是一个Critic,这样的模型将会被称之为Actor-Critic Model,即演员-评论家模型,而N回合平均奖励值的梯度将会被改写为以下形式:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "在一次训练过程中,我们会按顺序同时更新这两个网络,目前这样的模型已经被广泛使用,并在实验上证明了较好的效果。" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Importance Sampling" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "在前面提到,PPO的一个核心改进是将Policy Gradient中On-policy的训练过程转化为Off-policy,即从在线学习转化为离线学习,这个转化过程被称之为Importance Sampling,是一种数学手段。如果我们有连续随机变量X,它的概率密度函数记作$p(x)$,则$f(x)$的期望通过如下公式计算:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "$$\n", 183 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int^{}_{} f(x)p(x)dx\n", 184 | "$$" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "若我们对于连续随机变量X,有另一个概率密度函数记作$q(x)$,那么他们将有以下关系:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "$$\n", 199 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int f(x) \\cdot p(x)dx = \\int f(x) \\frac{p(x)}{q(x)} \\cdot q(x) dx = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 200 | "$$" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "在上式中最右边的项中,$\\frac{p(x)}{q(x)}$被称之为Importance Weight,类比到我们的问题,$f(x)$是$A^{\\theta}(a_t \\lvert s_t)$,而$\\frac{p(x)}{q(x)}$,则是新老策略对于当前状态采取当前动作对应的概率之比,这句话比较费解,更加具体一些,对于小车倒立杆为例,动作是离散的,在网络的输出是一组离散的概率分布,以这个概率分布选择动作,这个动作在新老策略中,在当前状态中都对应了一个概率值,$\\frac{p(x)}{q(x)}$即是他们的比值。" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "通过这一操作,在采样充分的情况下,我们可以认为:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "$$\n", 222 | "E_{x \\sim p} \\left[ f(x) \\right] = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 223 | "$$" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## Proximal Policy Optimization" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "最终我们将推导出PPO,Importance Sampling将给我们将On-policy的训练过程转化为Off-policy以基础,即我们可以通过老策略,即$q(x)$进行充分采样,然后改进新策略$p(x)$,这个过程可以在一回合重复N次,而不再是1次,这样大幅度减少了原始PG算法在线学习进行采样状态-动作-奖励元组对时间,同时保证了训练效果,而N回合平均奖励值的梯度也将被改写为以下形式:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "$$\n", 245 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 246 | "$$" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "在实际训练过程中,会有一个对$\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}$的clip的操作:" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "$$\n", 261 | "clip(\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}, 1 - \\epsilon, 1 + \\epsilon)\n", 262 | "$$" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "相当于一个正则化的操作,其中$\\epsilon$是一个可调整的超参数,至此,PPO也就介绍完了。" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Experiment" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 5, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# coding=utf-8\n", 286 | "\n", 287 | "import tensorflow as tf\n", 288 | "import numpy as np\n", 289 | "import gym\n", 290 | "import sys\n", 291 | "\n", 292 | "sys.path.append('..')\n", 293 | "\n", 294 | "from base.model import BaseRLModel\n", 295 | "\n", 296 | "class Agent(BaseRLModel):\n", 297 | "\n", 298 | " def __init__(self, session, env, a_space, s_space, **options):\n", 299 | " super(Agent, self).__init__(session, env, a_space, s_space, **options)\n", 300 | "\n", 301 | " self._init_input()\n", 302 | " self._init_nn()\n", 303 | " self._init_op()\n", 304 | " self._init_saver()\n", 305 | "\n", 306 | " self.a_buffer = []\n", 307 | " self.s_buffer = []\n", 308 | " self.r_buffer = []\n", 309 | " self.a_p_r_buffer = []\n", 310 | "\n", 311 | " self.session.run(tf.global_variables_initializer())\n", 312 | "\n", 313 | " def _init_input(self, *args):\n", 314 | " with tf.variable_scope('input'):\n", 315 | " self.s = tf.placeholder(tf.float32, [None, self.s_space], name='s')\n", 316 | " self.a = tf.placeholder(tf.int32, [None, ], name='a')\n", 317 | " self.r = tf.placeholder(tf.float32, [None, ], name='r')\n", 318 | " self.adv = tf.placeholder(tf.float32, [None, ], name='adv')\n", 319 | " self.a_p_r = tf.placeholder(tf.float32, [None, ], name='a_p_r')\n", 320 | "\n", 321 | " def _init_nn(self, *args):\n", 322 | " self.advantage, self.value = self._init_critic_net('critic_net')\n", 323 | " self.a_prob_eval, self.a_logits_eval = self._init_actor_net('eval_actor_net')\n", 324 | " self.a_prob_target, self.a_logits_target = self._init_actor_net('target_actor_net', trainable=False)\n", 325 | "\n", 326 | " def _init_op(self):\n", 327 | " with tf.variable_scope('critic_loss_func'):\n", 328 | " # loss func.\n", 329 | " self.c_loss_func = tf.losses.mean_squared_error(labels=self.r, predictions=self.value)\n", 330 | " with tf.variable_scope('critic_optimizer'):\n", 331 | " # critic optimizer.\n", 332 | " self.c_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.c_loss_func)\n", 333 | " with tf.variable_scope('update_target_actor_net'):\n", 334 | " # Get eval w, b.\n", 335 | " params_e = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_actor_net')\n", 336 | " params_t = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor_net')\n", 337 | " self.update_target_a_op = [tf.assign(t, e) for t, e in zip(params_t, params_e)]\n", 338 | " with tf.variable_scope('actor_loss_func'):\n", 339 | " # one hot a.\n", 340 | " a_one_hot = tf.one_hot(self.a, self.a_space)\n", 341 | " # cross entropy.\n", 342 | " cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits_eval)\n", 343 | " # loss func.\n", 344 | " self.a_loss_func = tf.reduce_mean(cross_entropy * self.adv * self.a_p_r)\n", 345 | " with tf.variable_scope('actor_optimizer'):\n", 346 | " self.a_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.a_loss_func)\n", 347 | "\n", 348 | " def _init_actor_net(self, scope, trainable=True):\n", 349 | " with tf.variable_scope(scope):\n", 350 | " # Kernel initializer.\n", 351 | " w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 352 | " # First dense.\n", 353 | " f_dense = tf.layers.dense(self.s, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 354 | " # Second dense.\n", 355 | " s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 356 | " # Action logits.\n", 357 | " a_logits = tf.layers.dense(s_dense, self.a_space, trainable=trainable, kernel_initializer=w_initializer)\n", 358 | " # Action prob.\n", 359 | " a_prob = tf.nn.softmax(a_logits)\n", 360 | " return a_prob, a_logits\n", 361 | "\n", 362 | " def _init_critic_net(self, scope):\n", 363 | " with tf.variable_scope(scope):\n", 364 | " # Kernel initializer.\n", 365 | " w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 366 | " # First dense.\n", 367 | " f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer)\n", 368 | " # Value.\n", 369 | " value = tf.layers.dense(f_dense, 1)\n", 370 | " value = tf.reshape(value, [-1, ])\n", 371 | " # Advantage.\n", 372 | " advantage = self.r - value\n", 373 | " return advantage, value\n", 374 | "\n", 375 | " def predict(self, s):\n", 376 | " # Calculate a eval prob.\n", 377 | " a_prob_eval, a_prob_target = self.session.run([self.a_prob_eval, self.a_prob_target], {self.s: [s]})\n", 378 | " # Calculate action prob ratio between eval and target.\n", 379 | " a_p_r = np.max(a_prob_eval) / np.max(a_prob_target)\n", 380 | " self.a_p_r_buffer.append(a_p_r)\n", 381 | " return np.random.choice(range(a_prob_eval.shape[1]), p=a_prob_eval.ravel())\n", 382 | "\n", 383 | " def snapshot(self, s, a, r, _):\n", 384 | " self.a_buffer.append(a)\n", 385 | " self.s_buffer.append(s)\n", 386 | " self.r_buffer.append(r)\n", 387 | "\n", 388 | " def train(self):\n", 389 | " # Copy r_buffer\n", 390 | " r_buffer = self.r_buffer\n", 391 | " # Init r_tau\n", 392 | " r_tau = 0\n", 393 | " # Calculate r_tau\n", 394 | " for index in reversed(range(0, len(r_buffer))):\n", 395 | " r_tau = r_tau * self.gamma + r_buffer[index]\n", 396 | " self.r_buffer[index] = r_tau\n", 397 | " # Calculate adv.\n", 398 | " adv_buffer = self.session.run(self.advantage, {self.s: self.s_buffer, self.r: self.r_buffer})\n", 399 | " # Minimize loss.\n", 400 | " self.session.run([self.a_optimizer, self.c_optimizer], {\n", 401 | " self.adv: adv_buffer,\n", 402 | " self.s: self.s_buffer,\n", 403 | " self.a: self.a_buffer,\n", 404 | " self.r: self.r_buffer,\n", 405 | " self.a_p_r: self.a_p_r_buffer,\n", 406 | " })\n", 407 | " self.s_buffer = []\n", 408 | " self.a_buffer = []\n", 409 | " self.r_buffer = []\n", 410 | " self.a_p_r_buffer = []\n", 411 | "\n", 412 | " def run(self):\n", 413 | " if self.mode == 'train':\n", 414 | " for episode in range(self.train_episodes):\n", 415 | " s, r_episode = self.env.reset(), 0\n", 416 | " while True:\n", 417 | " if episode > 200:\n", 418 | " self.env.render()\n", 419 | " a = self.predict(s)\n", 420 | " s_n, r, done, _ = self.env.step(a)\n", 421 | " if done:\n", 422 | " r = -5\n", 423 | " r_episode += r\n", 424 | " self.snapshot(s, a, r, s_n)\n", 425 | " s = s_n\n", 426 | " if done:\n", 427 | " break\n", 428 | " self.train()\n", 429 | " if episode % 25 == 0:\n", 430 | " self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))\n", 431 | " self.save()\n", 432 | " else:\n", 433 | " for episode in range(self.eval_episodes):\n", 434 | " s, r_episode = self.env.reset()\n", 435 | " while True:\n", 436 | " a = self.predict(s)\n", 437 | " s_n, r, done, _ = self.env.step(a)\n", 438 | " r_episode += r\n", 439 | " s = s_n\n", 440 | " if done:\n", 441 | " break" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "## Running" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# Make env.\n", 458 | "env = gym.make('CartPole-v0')\n", 459 | "env.seed(1)\n", 460 | "env = env.unwrapped\n", 461 | "# Init session.\n", 462 | "session = tf.Session()\n", 463 | "# Init agent.\n", 464 | "agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{\n", 465 | " 'model_name': 'PolicyGradient',\n", 466 | "})\n", 467 | "agent.run()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## 结尾" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "就目前观察,PPO在小车倒立杆问题上的收敛速度几倍于PG与一票基于值迭代的方法,让我非常惊讶。" 482 | ] 483 | } 484 | ], 485 | "metadata": { 486 | "kernelspec": { 487 | "display_name": "Python 3", 488 | "language": "python", 489 | "name": "python3" 490 | }, 491 | "language_info": { 492 | "codemirror_mode": { 493 | "name": "ipython", 494 | "version": 3 495 | }, 496 | "file_extension": ".py", 497 | "mimetype": "text/x-python", 498 | "name": "python", 499 | "nbconvert_exporter": "python", 500 | "pygments_lexer": "ipython3", 501 | "version": "3.5.4" 502 | } 503 | }, 504 | "nbformat": 4, 505 | "nbformat_minor": 1 506 | } 507 | -------------------------------------------------------------------------------- /note/.ipynb_checkpoints/PPO-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Proximal Policy Optimization (PPO)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 背景" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "Proximal Policy Optimization,简称PPO,即近端策略优化,是对Policy Graident,即策略梯度的一种改进算法。PPO的核心精神在于,通过一种被称之为Importce Sampling的方法,将Policy Gradient中On-policy的训练过程转化为Off-policy,即从在线学习转化为离线学习,某种意义上与基于值迭代算法中的Experience Replay有异曲同工之处。通过这个改进,训练速度与效果在实验上相较于Policy Gradient具有明显提升。" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Policy Gradient" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Policy Gradient是一种基于策略迭代的强化学习算法,不同于基于值迭代的DQN、Double-DQN、Duling-DQN通过间接地估计动作-状态值函数来学习的过程,Policy Gradient直接地通过采样状态、动作、奖励,然后期望直接最大化奖励的期望。PPO与PG都希望最大化奖励的期望,当采样足够充分时,奖励的期望可以近似为N回合的奖励的平均值:" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "$$\n", 45 | "\\bar{R}_{\\theta} = \\sum_{\\tau} R(\\tau) P(\\tau \\lvert \\theta) \\approx \\frac{1}{N} \\sum^{N}_{n=1} R(\\tau^{n})\n", 46 | "$$" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "上式中的第n回合的奖励值之和$R(\\tau^n)$被定义为如下形式:" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "$$\n", 61 | "R(\\tau) = \\sum^{T}_{t=1} r_t\n", 62 | "$$" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "在前篇专门介绍Policy Gradient文章中,已经详细地推导了关于$\\nabla \\bar{R}_{\\theta}$的计算方法,所以在这里的具体推导过程将略过,最后关于$\\nabla \\bar{R}_{\\theta}$的计算公式将有如下形式:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "$$\n", 77 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} R(\\tau^n) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 78 | "$$" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "本质上是最小化N回合采样出的动作与网络输出的动作的交叉熵的基础上乘以$R(\\tau^n)$,奖励值给了梯度下降的方向,推导出了$\\nabla \\bar{R}_{\\theta}$,其实就已经可以根据梯度下降法反向传播改进网络进行训练了,但是通常情况下我们会根据具体的问题对$R(\\tau^n)$做一些修正。" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Actor-Critic Model" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "对$R(\\tau^n)$的修正通常情况下是必须的,也是有意义的,符合直觉的。以CartPole-v0与MountainCar-v0,即小车倒立杆和过山车游戏为例,每一个状态采取的动作对整个回合的奖励和是不同的,对于小车倒立杆问题而言,初始的几个状态采取的动作直接决定了杆是否会很快地倒,所以直觉地他们更加重要,而对于过山车问题而言,在小车即将爬上山时的这些状态采取的动作直接决定了小车能不能爬上山,所以直觉地他们更加重要。" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "这将引入我们的第一个改进,对于小车倒立杆问题而言,我们需要针对每一个状态、动作元组对$R(\\tau^n)$进行如下替换:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "$$\n", 114 | "R(\\tau^n) \\rightarrow \\sum^{T_n}_{t=t^{\\prime}} \\gamma^{t} r^{n}_{t}\n", 115 | "$$" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "这样原来的梯度公式将会被改写为以下形式:" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "$$\n", 130 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\sum^{T_n}_{t=t^{\\prime}} \\gamma^{t} r^{n}_{t} \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 131 | "$$" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "但是这样还存在一个称之为Overestimate,即过估计的问题。因为在实际情况中,我们的状态-动作采样通常是不充分的,这会导致一些一些动作或者状态几乎不会被采样,这样在进行梯度下降训练网络时,在这些状态对应的动作将可能被极大的放大或者缩小。由于输出层是soft-max,这些概率会此消彼长,这显然不是我们想看到的。所以我们需要做第二个改进:引入Baseline,通常可能是一个待调整的常超参数,或者Critic,通常是一个待训练的网络。" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "如果引入的是一个Critic,这样的模型将会被称之为Actor-Critic Model,即演员-评论家模型,而N回合平均奖励值的梯度将会被改写为以下形式:" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "$$\n", 153 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 154 | "$$" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "在一次训练过程中,我们会按顺序同时更新这两个网络,目前这样的模型已经被广泛使用,并在实验上证明了较好的效果。" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Importance Sampling" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "在前面提到,PPO的一个核心改进是将Policy Gradient中On-policy的训练过程转化为Off-policy,即从在线学习转化为离线学习,这个转化过程被称之为Importance Sampling,是一种数学手段。如果我们有连续随机变量X,它的概率密度函数记作$p(x)$,则$f(x)$的期望通过如下公式计算:" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "$$\n", 183 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int^{}_{} f(x)p(x)dx\n", 184 | "$$" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "若我们对于连续随机变量X,有另一个概率密度函数记作$q(x)$,那么他们将有以下关系:" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "$$\n", 199 | "E_{x \\sim p} \\left[ f(x) \\right] = \\int f(x) \\cdot p(x)dx = \\int f(x) \\frac{p(x)}{q(x)} \\cdot q(x) dx = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 200 | "$$" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "在上式中最右边的项中,$\\frac{p(x)}{q(x)}$被称之为Importance Weight,类比到我们的问题,$f(x)$是$A^{\\theta}(a_t \\lvert s_t)$,而$\\frac{p(x)}{q(x)}$,则是新老策略对于当前状态采取当前动作对应的概率之比,这句话比较费解,更加具体一些,对于小车倒立杆为例,动作是离散的,在网络的输出是一组离散的概率分布,以这个概率分布选择动作,这个动作在新老策略中,在当前状态中都对应了一个概率值,$\\frac{p(x)}{q(x)}$即是他们的比值。" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "通过这一操作,在采样充分的情况下,我们可以认为:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "$$\n", 222 | "E_{x \\sim p} \\left[ f(x) \\right] = E_{x \\sim q} \\left[ f(x) \\frac{p(x)}{q(x)} \\right]\n", 223 | "$$" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "## Proximal Policy Optimization" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "最终我们将推导出PPO,Importance Sampling将给我们将On-policy的训练过程转化为Off-policy以基础,即我们可以通过老策略,即$q(x)$进行充分采样,然后改进新策略$p(x)$,这个过程可以在一回合重复N次,而不再是1次,这样大幅度减少了原始PG算法在线学习进行采样状态-动作-奖励元组对时间,同时保证了训练效果,而N回合平均奖励值的梯度也将被改写为以下形式:" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "$$\n", 245 | "\\nabla \\bar{R}_{\\theta} = \\frac{1}{N} \\sum^{N}_{n=1} \\sum^{T_n}_{t=1} \\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)} A^{\\theta}(a_t \\lvert s_t) \\nabla \\log p(a_t \\lvert s_t, \\theta)\n", 246 | "$$" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "在实际训练过程中,会有一个对$\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}$的clip的操作:" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "$$\n", 261 | "clip(\\frac{p_{\\theta}(a_t \\lvert s_t)}{p_{\\theta^{\\prime}}(a_t \\lvert s_t)}, 1 - \\epsilon, 1 + \\epsilon)\n", 262 | "$$" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "相当于一个正则化的操作,其中$\\epsilon$是一个可调整的超参数,至此,PPO也就介绍完了。" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "## Experiment" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 5, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# coding=utf-8\n", 286 | "\n", 287 | "import tensorflow as tf\n", 288 | "import numpy as np\n", 289 | "import gym\n", 290 | "import sys\n", 291 | "\n", 292 | "sys.path.append('..')\n", 293 | "\n", 294 | "from base.model import BaseRLModel\n", 295 | "\n", 296 | "class Agent(BaseRLModel):\n", 297 | "\n", 298 | " def __init__(self, session, env, a_space, s_space, **options):\n", 299 | " super(Agent, self).__init__(session, env, a_space, s_space, **options)\n", 300 | "\n", 301 | " self._init_input()\n", 302 | " self._init_nn()\n", 303 | " self._init_op()\n", 304 | " self._init_saver()\n", 305 | "\n", 306 | " self.a_buffer = []\n", 307 | " self.s_buffer = []\n", 308 | " self.r_buffer = []\n", 309 | " self.a_p_r_buffer = []\n", 310 | "\n", 311 | " self.session.run(tf.global_variables_initializer())\n", 312 | "\n", 313 | " def _init_input(self, *args):\n", 314 | " with tf.variable_scope('input'):\n", 315 | " self.s = tf.placeholder(tf.float32, [None, self.s_space], name='s')\n", 316 | " self.a = tf.placeholder(tf.int32, [None, ], name='a')\n", 317 | " self.r = tf.placeholder(tf.float32, [None, ], name='r')\n", 318 | " self.adv = tf.placeholder(tf.float32, [None, ], name='adv')\n", 319 | " self.a_p_r = tf.placeholder(tf.float32, [None, ], name='a_p_r')\n", 320 | "\n", 321 | " def _init_nn(self, *args):\n", 322 | " self.advantage, self.value = self._init_critic_net('critic_net')\n", 323 | " self.a_prob_eval, self.a_logits_eval = self._init_actor_net('eval_actor_net')\n", 324 | " self.a_prob_target, self.a_logits_target = self._init_actor_net('target_actor_net', trainable=False)\n", 325 | "\n", 326 | " def _init_op(self):\n", 327 | " with tf.variable_scope('critic_loss_func'):\n", 328 | " # loss func.\n", 329 | " self.c_loss_func = tf.losses.mean_squared_error(labels=self.r, predictions=self.value)\n", 330 | " with tf.variable_scope('critic_optimizer'):\n", 331 | " # critic optimizer.\n", 332 | " self.c_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.c_loss_func)\n", 333 | " with tf.variable_scope('update_target_actor_net'):\n", 334 | " # Get eval w, b.\n", 335 | " params_e = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_actor_net')\n", 336 | " params_t = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_actor_net')\n", 337 | " self.update_target_a_op = [tf.assign(t, e) for t, e in zip(params_t, params_e)]\n", 338 | " with tf.variable_scope('actor_loss_func'):\n", 339 | " # one hot a.\n", 340 | " a_one_hot = tf.one_hot(self.a, self.a_space)\n", 341 | " # cross entropy.\n", 342 | " cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=a_one_hot, logits=self.a_logits_eval)\n", 343 | " # loss func.\n", 344 | " self.a_loss_func = tf.reduce_mean(cross_entropy * self.adv * self.a_p_r)\n", 345 | " with tf.variable_scope('actor_optimizer'):\n", 346 | " self.a_optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.a_loss_func)\n", 347 | "\n", 348 | " def _init_actor_net(self, scope, trainable=True):\n", 349 | " with tf.variable_scope(scope):\n", 350 | " # Kernel initializer.\n", 351 | " w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 352 | " # First dense.\n", 353 | " f_dense = tf.layers.dense(self.s, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 354 | " # Second dense.\n", 355 | " s_dense = tf.layers.dense(f_dense, 32, tf.nn.relu, trainable=trainable, kernel_initializer=w_initializer)\n", 356 | " # Action logits.\n", 357 | " a_logits = tf.layers.dense(s_dense, self.a_space, trainable=trainable, kernel_initializer=w_initializer)\n", 358 | " # Action prob.\n", 359 | " a_prob = tf.nn.softmax(a_logits)\n", 360 | " return a_prob, a_logits\n", 361 | "\n", 362 | " def _init_critic_net(self, scope):\n", 363 | " with tf.variable_scope(scope):\n", 364 | " # Kernel initializer.\n", 365 | " w_initializer = tf.random_normal_initializer(0.0, 0.01)\n", 366 | " # First dense.\n", 367 | " f_dense = tf.layers.dense(self.s, 64, tf.nn.relu, kernel_initializer=w_initializer)\n", 368 | " # Value.\n", 369 | " value = tf.layers.dense(f_dense, 1)\n", 370 | " value = tf.reshape(value, [-1, ])\n", 371 | " # Advantage.\n", 372 | " advantage = self.r - value\n", 373 | " return advantage, value\n", 374 | "\n", 375 | " def predict(self, s):\n", 376 | " # Calculate a eval prob.\n", 377 | " a_prob_eval, a_prob_target = self.session.run([self.a_prob_eval, self.a_prob_target], {self.s: [s]})\n", 378 | " # Calculate action prob ratio between eval and target.\n", 379 | " a_p_r = np.max(a_prob_eval) / np.max(a_prob_target)\n", 380 | " self.a_p_r_buffer.append(a_p_r)\n", 381 | " return np.random.choice(range(a_prob_eval.shape[1]), p=a_prob_eval.ravel())\n", 382 | "\n", 383 | " def snapshot(self, s, a, r, _):\n", 384 | " self.a_buffer.append(a)\n", 385 | " self.s_buffer.append(s)\n", 386 | " self.r_buffer.append(r)\n", 387 | "\n", 388 | " def train(self):\n", 389 | " # Copy r_buffer\n", 390 | " r_buffer = self.r_buffer\n", 391 | " # Init r_tau\n", 392 | " r_tau = 0\n", 393 | " # Calculate r_tau\n", 394 | " for index in reversed(range(0, len(r_buffer))):\n", 395 | " r_tau = r_tau * self.gamma + r_buffer[index]\n", 396 | " self.r_buffer[index] = r_tau\n", 397 | " # Calculate adv.\n", 398 | " adv_buffer = self.session.run(self.advantage, {self.s: self.s_buffer, self.r: self.r_buffer})\n", 399 | " # Minimize loss.\n", 400 | " self.session.run([self.a_optimizer, self.c_optimizer], {\n", 401 | " self.adv: adv_buffer,\n", 402 | " self.s: self.s_buffer,\n", 403 | " self.a: self.a_buffer,\n", 404 | " self.r: self.r_buffer,\n", 405 | " self.a_p_r: self.a_p_r_buffer,\n", 406 | " })\n", 407 | " self.s_buffer = []\n", 408 | " self.a_buffer = []\n", 409 | " self.r_buffer = []\n", 410 | " self.a_p_r_buffer = []\n", 411 | "\n", 412 | " def run(self):\n", 413 | " if self.mode == 'train':\n", 414 | " for episode in range(self.train_episodes):\n", 415 | " s, r_episode = self.env.reset(), 0\n", 416 | " while True:\n", 417 | " if episode > 200:\n", 418 | " self.env.render()\n", 419 | " a = self.predict(s)\n", 420 | " s_n, r, done, _ = self.env.step(a)\n", 421 | " if done:\n", 422 | " r = -5\n", 423 | " r_episode += r\n", 424 | " self.snapshot(s, a, r, s_n)\n", 425 | " s = s_n\n", 426 | " if done:\n", 427 | " break\n", 428 | " self.train()\n", 429 | " if episode % 25 == 0:\n", 430 | " self.logger.warning('Episode: {} | Rewards: {}'.format(episode, r_episode))\n", 431 | " self.save()\n", 432 | " else:\n", 433 | " for episode in range(self.eval_episodes):\n", 434 | " s, r_episode = self.env.reset()\n", 435 | " while True:\n", 436 | " a = self.predict(s)\n", 437 | " s_n, r, done, _ = self.env.step(a)\n", 438 | " r_episode += r\n", 439 | " s = s_n\n", 440 | " if done:\n", 441 | " break" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "## Running" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# Make env.\n", 458 | "env = gym.make('CartPole-v0')\n", 459 | "env.seed(1)\n", 460 | "env = env.unwrapped\n", 461 | "# Init session.\n", 462 | "session = tf.Session()\n", 463 | "# Init agent.\n", 464 | "agent = Agent(session, env, env.action_space.n, env.observation_space.shape[0], **{\n", 465 | " 'model_name': 'PolicyGradient',\n", 466 | "})\n", 467 | "agent.run()" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": {}, 473 | "source": [ 474 | "## 结尾" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "就目前观察,PPO在小车倒立杆问题上的收敛速度几倍于PG与一票基于值迭代的方法,让我非常惊讶。" 482 | ] 483 | } 484 | ], 485 | "metadata": { 486 | "kernelspec": { 487 | "display_name": "Python 3", 488 | "language": "python", 489 | "name": "python3" 490 | }, 491 | "language_info": { 492 | "codemirror_mode": { 493 | "name": "ipython", 494 | "version": 3 495 | }, 496 | "file_extension": ".py", 497 | "mimetype": "text/x-python", 498 | "name": "python", 499 | "nbconvert_exporter": "python", 500 | "pygments_lexer": "ipython3", 501 | "version": "3.5.4" 502 | } 503 | }, 504 | "nbformat": 4, 505 | "nbformat_minor": 1 506 | } 507 | --------------------------------------------------------------------------------