├── .DS_Store ├── LICENSE ├── README.md ├── cartpole ├── .DS_Store ├── a2c │ ├── .DS_Store │ ├── __pycache__ │ │ └── model.cpython-36.pyc │ ├── model.py │ ├── save_model │ │ └── model.pth.tar │ ├── test.py │ └── train.py ├── ddqn │ ├── .DS_Store │ ├── __pycache__ │ │ └── model.cpython-36.pyc │ ├── model.py │ ├── save_model │ │ └── model.pth.tar │ ├── test.py │ └── train.py └── dqn │ ├── __pycache__ │ ├── model.cpython-36.pyc │ └── model.cpython-37.pyc │ ├── logs │ └── .DS_Store │ ├── model.py │ ├── save_model │ └── model.pth.tar │ ├── test.py │ └── train.py ├── img ├── .DS_Store ├── cartpole.png └── pendulum.png ├── mountaincar ├── .DS_Store ├── app │ ├── .DS_Store │ ├── __pycache__ │ │ ├── app.cpython-36.pyc │ │ └── train.cpython-36.pyc │ ├── app.py │ ├── expert_demo │ │ ├── .DS_Store │ │ ├── expert_demo.npy │ │ └── make_expert.py │ ├── learning_curves │ │ ├── .DS_Store │ │ └── app_eps_60000.png │ ├── results │ │ ├── .DS_Store │ │ ├── app_q_table.npy │ │ └── test_rendering_60000.gif │ ├── test.py │ └── train.py ├── ddpg │ ├── __pycache__ │ │ ├── model.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ ├── model.py │ ├── save_model │ │ └── model.pth.tar │ ├── test.py │ ├── train.py │ └── utils.py ├── maxent │ ├── .DS_Store │ ├── __pycache__ │ │ ├── maxent.cpython-36.pyc │ │ ├── maxent_train.cpython-36.pyc │ │ └── train.cpython-36.pyc │ ├── expert_demo │ │ ├── .DS_Store │ │ ├── expert_demo.npy │ │ └── make_expert.py │ ├── learning_curves │ │ ├── .DS_Store │ │ └── maxent_eps_30000.png │ ├── maxent.py │ ├── results │ │ ├── .DS_Store │ │ ├── maxent_q_table.npy │ │ └── test_rendering_30000.gif │ ├── test.py │ └── train.py └── sac │ ├── .DS_Store │ ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc │ ├── model.py │ ├── test.py │ ├── train.py │ └── utils.py ├── mujoco ├── .DS_Store ├── gail │ ├── .DS_Store │ ├── __pycache__ │ │ ├── hparams.cpython-36.pyc │ │ ├── model.cpython-36.pyc │ │ └── train_model.cpython-36.pyc │ ├── expert_demo │ │ └── expert_demo.p │ ├── logs │ │ └── .DS_Store │ ├── main.py │ ├── model.py │ ├── save_model │ │ └── .DS_Store │ ├── test.py │ ├── train_model.py │ └── utils │ │ ├── __pycache__ │ │ ├── running_state.cpython-36.pyc │ │ ├── utils.cpython-36.pyc │ │ └── zfilter.cpython-36.pyc │ │ ├── utils.py │ │ └── zfilter.py ├── ppo │ ├── .DS_Store │ ├── __pycache__ │ │ ├── hparams.cpython-36.pyc │ │ ├── model.cpython-36.pyc │ │ ├── ppo.cpython-36.pyc │ │ └── train_model.cpython-36.pyc │ ├── logs │ │ └── .DS_Store │ ├── main.py │ ├── model.py │ ├── ppo.py │ ├── save_model │ │ └── .DS_Store │ ├── test.py │ └── utils │ │ ├── __pycache__ │ │ ├── running_state.cpython-36.pyc │ │ ├── utils.cpython-36.pyc │ │ └── zfilter.cpython-36.pyc │ │ ├── utils.py │ │ └── zfilter.py ├── tnpg │ ├── .DS_Store │ ├── __pycache__ │ │ ├── model.cpython-36.pyc │ │ ├── tnpg.cpython-36.pyc │ │ └── trpo.cpython-36.pyc │ ├── model.py │ ├── save_model │ │ ├── 24model.pth │ │ ├── 40model.pth │ │ ├── 67model.pth │ │ ├── 76model.pth │ │ ├── 79model.pth │ │ └── 86model.pth │ ├── test.py │ ├── tnpg.py │ ├── train.py │ └── utils │ │ ├── __pycache__ │ │ ├── running_state.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ │ ├── running_state.py │ │ └── utils.py ├── trpo │ ├── __pycache__ │ │ ├── model.cpython-36.pyc │ │ └── trpo.cpython-36.pyc │ ├── model.py │ ├── test.py │ ├── train.py │ ├── trpo.py │ └── utils │ │ ├── __pycache__ │ │ ├── running_state.cpython-36.pyc │ │ └── utils.cpython-36.pyc │ │ ├── running_state.py │ │ └── utils.py └── vail │ ├── .DS_Store │ ├── __pycache__ │ ├── 
hparams.cpython-36.pyc │ ├── model.cpython-36.pyc │ └── train_model.cpython-36.pyc │ ├── expert_demo │ └── expert_demo.p │ ├── logs │ └── .DS_Store │ ├── main.py │ ├── model.py │ ├── save_model │ └── .DS_Store │ ├── test.py │ ├── train_model.py │ └── utils │ ├── __pycache__ │ ├── running_state.cpython-36.pyc │ ├── utils.cpython-36.pyc │ └── zfilter.cpython-36.pyc │ ├── utils.py │ └── zfilter.py └── pendulum ├── .DS_Store ├── ddpg ├── .DS_Store ├── __pycache__ │ ├── model.cpython-36.pyc │ ├── model.cpython-37.pyc │ ├── utils.cpython-36.pyc │ └── utils.cpython-37.pyc ├── model.py ├── save_model │ ├── .DS_Store │ └── model.pth.tar ├── test.py ├── train.py └── utils.py ├── ppo ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── model.py ├── save_model │ └── model.pth.tar ├── test.py ├── train.py └── utils.py ├── ppo_gae ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── model.py ├── save_model │ └── model.pth.tar ├── test.py ├── train.py └── utils.py ├── sac ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── model.py ├── save_model │ └── model.pth.tar ├── test.py ├── train.py └── utils.py ├── tnpg ├── __pycache__ │ ├── model.cpython-36.pyc │ ├── tnpg.cpython-36.pyc │ └── utils.cpython-36.pyc ├── model.py ├── save_model │ └── model.pth.tar ├── test.py ├── train.py └── utils.py ├── trpo ├── .DS_Store ├── __pycache__ │ ├── model.cpython-36.pyc │ └── utils.cpython-36.pyc ├── model.py ├── save_model │ └── model.pth.tar ├── test.py ├── train.py └── utils.py └── trpo_gae ├── __pycache__ ├── model.cpython-36.pyc └── utils.cpython-36.pyc ├── model.py ├── save_model └── model.pth.tar ├── test.py ├── train.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/.DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dongmin Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Code with PyTorch
2 | 
3 | ## Papers
4 | 
5 | - [Deep Q-Network (DQN)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
6 | - [Double DQN (DDQN)](https://arxiv.org/pdf/1509.06461.pdf)
7 | - [Advantage Actor-Critic (A2C)](http://incompleteideas.net/book/RLbook2018.pdf)
8 | - [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/pdf/1602.01783.pdf)
9 | - [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/pdf/1509.02971.pdf)
10 | - [Truncated Natural Policy Gradient (TNPG)](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf)
11 | - [Trust Region Policy Optimization (TRPO)](https://arxiv.org/pdf/1502.05477.pdf)
12 | - [Generalized Advantage Estimator (GAE)](https://arxiv.org/pdf/1506.02438.pdf)
13 | - [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf)
14 | - [Soft Actor-Critic (SAC)](https://arxiv.org/pdf/1812.05905.pdf)
15 | - [Apprenticeship Learning via Inverse Reinforcement Learning (APP)](http://people.eecs.berkeley.edu/~russell/classes/cs294/s11/readings/Abbeel+Ng:2004.pdf)
16 | - [Maximum Entropy Inverse Reinforcement Learning (MaxEnt)](http://new.aaai.org/Papers/AAAI/2008/AAAI08-227.pdf)
17 | - [Generative Adversarial Imitation Learning (GAIL)](https://papers.nips.cc/paper/6391-generative-adversarial-imitation-learning.pdf)
18 | - [Variational Adversarial Imitation Learning (VAIL)](https://arxiv.org/pdf/1810.00821.pdf)
19 | 
20 | ## Algorithms
21 | 
22 | ### 01. Model-Free Reinforcement Learning
23 | 
24 | #### Deep Q-Network (DQN)
25 | 
26 | - [CartPole (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/dqn)
27 | 
28 | #### Double DQN (DDQN)
29 | 
30 | - [CartPole (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/ddqn)
31 | 
32 | #### Advantage Actor-Critic (A2C)
33 | 
34 | - [CartPole (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/a2c)
35 | 
36 | #### Asynchronous Advantage Actor-Critic (A3C)
37 | 
38 | - [CartPole (Classic control)]()
39 | 
40 | #### Deep Deterministic Policy Gradient (DDPG)
41 | 
42 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ddpg)
43 | 
44 | #### Truncated Natural Policy Gradient (TNPG)
45 | 
46 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/tnpg)
47 | - [Hopper (MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/tnpg)
48 | 
49 | #### Trust Region Policy Optimization (TRPO)
50 | 
51 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/trpo)
52 | 
53 | #### TRPO + Generalized Advantage Estimator (GAE)
54 | 
55 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/trpo_gae)
56 | - [Hopper (MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/trpo)
57 | 
58 | #### Proximal Policy Optimization (PPO)
59 | 
60 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ppo)
61 | 
62 | #### PPO + Generalized Advantage Estimator (GAE)
63 | 
64 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ppo_gae)
65 | - [Hopper (MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/ppo)
66 | 
67 | #### Soft Actor-Critic (SAC)
68 | 
69 | - [Pendulum (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/sac)
70 | - [Hopper (MuJoCo)]()
71 | 
72 | ---
73 | 
74 | ### 02. Inverse Reinforcement Learning
75 | 
76 | #### Apprenticeship Learning via Inverse Reinforcement Learning (APP)
77 | 
78 | - [MountainCar (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mountaincar/app)
79 | 
80 | #### Maximum Entropy Inverse Reinforcement Learning (MaxEnt)
81 | 
82 | - [MountainCar (Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mountaincar/maxent)
83 | 
84 | #### Generative Adversarial Imitation Learning (GAIL)
85 | 
86 | - [Hopper (MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/gail)
87 | 
88 | #### Variational Adversarial Imitation Learning (VAIL)
89 | 
90 | - [Hopper (MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/vail)
91 | 
92 | ---
93 | 
94 | ## Learning curves
95 | 
96 | ### CartPole
97 | 
98 | 
99 | 
100 | ### Pendulum
101 | 
102 | 
103 | 
104 | ### Hopper
105 | 
106 | ---
107 | 
108 | ## Reference
109 | 
110 | - [Minimal and Clean Reinforcement Learning Examples in PyTorch](https://github.com/reinforcement-learning-kr/reinforcement-learning-pytorch)
111 | - [PyTorch implementation for Policy Gradient algorithms (REINFORCE, NPG, TRPO, PPO)](https://github.com/reinforcement-learning-kr/pg_travel)
112 | - [Implementation of APP](https://github.com/jangirrishabh/toyCarIRL)
113 | - [Implementation of MaxEnt](https://github.com/MatthewJA/Inverse-Reinforcement-Learning)
114 | - [PyTorch implementation of GAIL](https://github.com/Khrylx/PyTorch-RL)
115 | - [PyTorch implementation of SAC1](https://github.com/vitchyr/rlkit/tree/master/rlkit/torch/sac)
116 | - [PyTorch implementation of SAC2](https://github.com/pranz24/pytorch-soft-actor-critic)
117 | 
--------------------------------------------------------------------------------
/cartpole/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/.DS_Store
--------------------------------------------------------------------------------
/cartpole/a2c/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/.DS_Store
--------------------------------------------------------------------------------
/cartpole/a2c/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/cartpole/a2c/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | class Actor(nn.Module):
5 |     def __init__(self, state_size, action_size, args):
6 |         super(Actor, self).__init__()
7 |         self.fc1 = 
nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | policies = torch.softmax(self.fc3(x), dim=1) 15 | 16 | return policies 17 | 18 | class Critic(nn.Module): 19 | def __init__(self, state_size, args): 20 | super(Critic, self).__init__() 21 | self.fc1 = nn.Linear(state_size, args.hidden_size) 22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 23 | self.fc3 = nn.Linear(args.hidden_size, 1) 24 | 25 | def forward(self, x): 26 | x = torch.tanh(self.fc1(x)) 27 | x = torch.tanh(self.fc2(x)) 28 | value = self.fc3(x) 29 | 30 | return value -------------------------------------------------------------------------------- /cartpole/a2c/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/save_model/model.pth.tar -------------------------------------------------------------------------------- /cartpole/a2c/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import Actor, Critic 9 | from torch.distributions import Categorical 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 13 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 14 | parser.add_argument('--render', action="store_true", default=True) 15 | parser.add_argument('--hidden_size', type=int, default=64) 16 | parser.add_argument('--iter', type=int, default=10000) 17 | parser.add_argument('--log_interval', type=int, default=10) 18 | args = parser.parse_args() 19 | 20 | def get_action(policies): 21 | m = Categorical(policies) 22 | action = m.sample() 23 | action = action.data.numpy()[0] 24 | return action 25 | 26 | if __name__=="__main__": 27 | env = gym.make(args.env_name) 28 | env.seed(500) 29 | torch.manual_seed(500) 30 | 31 | state_size = env.observation_space.shape[0] 32 | action_size = env.action_space.n 33 | print('state size:', state_size) 34 | print('action size:', action_size) 35 | 36 | actor = Actor(state_size, action_size, args) 37 | 38 | if args.load_model is not None: 39 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 40 | pretrained_model = torch.load(pretrained_model_path) 41 | actor.load_state_dict(pretrained_model) 42 | 43 | steps = 0 44 | 45 | for episode in range(args.iter): 46 | done = False 47 | score = 0 48 | 49 | state = env.reset() 50 | state = np.reshape(state, [1, state_size]) 51 | 52 | while not done: 53 | if args.render: 54 | env.render() 55 | 56 | steps += 1 57 | policies = actor(torch.Tensor(state)) 58 | action = get_action(policies) 59 | 60 | next_state, reward, done, _ = env.step(action) 61 | 62 | next_state = np.reshape(next_state, [1, state_size]) 63 | reward = reward if not done or score == 499 else -1 64 | 65 | state = next_state 66 | score += reward 67 | 68 | if episode % args.log_interval == 0: 69 | print('{} episode | score: {:.2f}'.format(episode, score)) -------------------------------------------------------------------------------- /cartpole/a2c/train.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | import torch.optim as optim 8 | from torch.distributions import Categorical 9 | 10 | from model import Actor, Critic 11 | from tensorboardX import SummaryWriter 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 15 | parser.add_argument('--load_model', type=str, default=None) 16 | parser.add_argument('--save_path', default='./save_model/', help='') 17 | parser.add_argument('--render', action="store_true", default=False) 18 | parser.add_argument('--gamma', type=float, default=0.99) 19 | parser.add_argument('--hidden_size', type=int, default=64) 20 | parser.add_argument('--actor_lr', type=float, default=1e-4) 21 | parser.add_argument('--critic_lr', type=float, default=1e-3) 22 | parser.add_argument('--ent_coef', type=float, default=0.1) 23 | parser.add_argument('--max_iter_num', type=int, default=1000) 24 | parser.add_argument('--log_interval', type=int, default=10) 25 | parser.add_argument('--goal_score', type=int, default=400) 26 | parser.add_argument('--logdir', type=str, default='./logs', 27 | help='tensorboardx logs directory') 28 | args = parser.parse_args() 29 | 30 | def train_model(actor, critic, actor_optimizer, critic_optimizer, transition, policies): 31 | state, action, reward, next_state, mask = transition 32 | 33 | # update critic 34 | criterion = torch.nn.MSELoss() 35 | 36 | value = critic(torch.Tensor(state)).squeeze(1) 37 | 38 | next_value = critic(torch.Tensor(next_state)).squeeze(1) 39 | target = reward + mask * args.gamma * next_value 40 | 41 | critic_loss = criterion(value, target.detach()) 42 | critic_optimizer.zero_grad() 43 | critic_loss.backward() 44 | critic_optimizer.step() 45 | 46 | # update actor 47 | categorical = Categorical(policies) 48 | log_policy = categorical.log_prob(torch.Tensor([action])) 49 | entropy = categorical.entropy() 50 | 51 | advantage = target - value 52 | 53 | actor_loss = -log_policy * advantage.item() + args.ent_coef * entropy 54 | actor_optimizer.zero_grad() 55 | actor_loss.backward() 56 | actor_optimizer.step() 57 | 58 | def get_action(policies): 59 | categorical = Categorical(policies) 60 | action = categorical.sample() 61 | action = action.data.numpy()[0] 62 | 63 | return action 64 | 65 | 66 | def main(): 67 | env = gym.make(args.env_name) 68 | env.seed(500) 69 | torch.manual_seed(500) 70 | 71 | state_size = env.observation_space.shape[0] 72 | action_size = env.action_space.n 73 | print('state size:', state_size) 74 | print('action size:', action_size) 75 | 76 | actor = Actor(state_size, action_size, args) 77 | critic = Critic(state_size, args) 78 | 79 | actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr) 80 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr) 81 | 82 | writer = SummaryWriter(args.logdir) 83 | 84 | running_score = 0 85 | 86 | for episode in range(args.max_iter_num): 87 | done = False 88 | score = 0 89 | 90 | state = env.reset() 91 | state = np.reshape(state, [1, state_size]) 92 | 93 | while not done: 94 | if args.render: 95 | env.render() 96 | 97 | policies = actor(torch.Tensor(state)) 98 | action = get_action(policies) 99 | 100 | next_state, reward, done, _ = env.step(action) 101 | 102 | next_state = np.reshape(next_state, [1, state_size]) 103 | reward = reward if not done or score == 499 else -1 104 | mask = 0 if done else 1 105 | 106 | 
transition = [state, action, reward, next_state, mask] 107 | 108 | actor.train(), critic.train() 109 | train_model(actor, critic, actor_optimizer, critic_optimizer, 110 | transition, policies) 111 | 112 | state = next_state 113 | score += reward 114 | 115 | score = score if score == 500.0 else score + 1 116 | running_score = 0.99 * running_score + 0.01 * score 117 | 118 | if episode % args.log_interval == 0: 119 | print('{} episode | running_score: {:.2f}'.format(episode, running_score)) 120 | writer.add_scalar('log/score', float(score), episode) 121 | 122 | if running_score > args.goal_score: 123 | if not os.path.isdir(args.save_path): 124 | os.makedirs(args.save_path) 125 | 126 | ckpt_path = args.save_path + 'model.pth.tar' 127 | torch.save(actor.state_dict(), ckpt_path) 128 | print('Running score exceeds 400. So end') 129 | break 130 | 131 | if __name__=="__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /cartpole/ddqn/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/.DS_Store -------------------------------------------------------------------------------- /cartpole/ddqn/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /cartpole/ddqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class QNet(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(QNet, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, action_size) 9 | 10 | def forward(self, x): 11 | x = torch.tanh(self.fc1(x)) 12 | q_values = self.fc2(x) 13 | return q_values -------------------------------------------------------------------------------- /cartpole/ddqn/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/save_model/model.pth.tar -------------------------------------------------------------------------------- /cartpole/ddqn/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import QNet 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.n 26 
| print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | q_net = QNet(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | q_net.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | q_value = q_net(torch.Tensor(state)) 52 | _, action = torch.max(q_value, 1) 53 | action = action.numpy()[0] 54 | 55 | next_state, reward, done, _ = env.step(action) 56 | 57 | next_state = np.reshape(next_state, [1, state_size]) 58 | reward = reward if not done or score == 499 else -1 59 | 60 | score += reward 61 | state = next_state 62 | 63 | if episode % args.log_interval == 0: 64 | print('{} episode | score: {:.2f}'.format(episode, score)) -------------------------------------------------------------------------------- /cartpole/ddqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | from collections import deque 7 | 8 | import torch 9 | import torch.optim as optim 10 | 11 | from model import QNet 12 | from tensorboardX import SummaryWriter 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 16 | parser.add_argument('--load_model', type=str, default=None) 17 | parser.add_argument('--save_path', default='./save_model/', help='') 18 | parser.add_argument('--render', action="store_true", default=False) 19 | parser.add_argument('--gamma', type=float, default=0.99) 20 | parser.add_argument('--hidden_size', type=int, default=64) 21 | parser.add_argument('--batch_size', type=int, default=32) 22 | parser.add_argument('--initial_exploration', type=int, default=1000) 23 | parser.add_argument('--epsilon', type=float, default=1.0) 24 | parser.add_argument('--epsilon_decay', type=float, default=0.00005) 25 | parser.add_argument('--update_target', type=int, default=100) 26 | parser.add_argument('--max_iter_num', type=int, default=1000) 27 | parser.add_argument('--log_interval', type=int, default=10) 28 | parser.add_argument('--goal_score', type=int, default=400) 29 | parser.add_argument('--logdir', type=str, default='./logs', 30 | help='tensorboardx logs directory') 31 | args = parser.parse_args() 32 | 33 | def train_model(q_net, target_q_net, optimizer, mini_batch): 34 | mini_batch = np.array(mini_batch) 35 | states = np.vstack(mini_batch[:, 0]) 36 | actions = list(mini_batch[:, 1]) 37 | rewards = list(mini_batch[:, 2]) 38 | next_states = np.vstack(mini_batch[:, 3]) 39 | masks = list(mini_batch[:, 4]) 40 | 41 | actions = torch.LongTensor(actions) 42 | rewards = torch.Tensor(rewards) 43 | masks = torch.Tensor(masks) 44 | 45 | criterion = torch.nn.MSELoss() 46 | 47 | # get Q-value 48 | q_values = q_net(torch.Tensor(states)) 49 | q_value = q_values.gather(1, actions.unsqueeze(1)).view(-1) 50 | 51 | # get target 52 | next_q_values = q_net(torch.Tensor(next_states)) 53 | next_q_value_index = next_q_values.max(1)[1] 54 | 55 | target_next_q_values = target_q_net(torch.Tensor(next_states)) 56 | target_next_q_value = target_next_q_values.gather(1, next_q_value_index.unsqueeze(1)).view(-1) 57 
| target = rewards + masks * args.gamma * target_next_q_value 58 | 59 | loss = criterion(q_value, target.detach()) 60 | optimizer.zero_grad() 61 | loss.backward() 62 | optimizer.step() 63 | 64 | def get_action(q_values, action_size, epsilon): 65 | if np.random.rand() <= epsilon: 66 | return random.randrange(action_size) 67 | else: 68 | _, action = torch.max(q_values, 1) 69 | return action.numpy()[0] 70 | 71 | def update_target_model(net, target_q_net): 72 | target_q_net.load_state_dict(net.state_dict()) 73 | 74 | 75 | def main(): 76 | env = gym.make(args.env_name) 77 | env.seed(500) 78 | torch.manual_seed(500) 79 | 80 | state_size = env.observation_space.shape[0] 81 | action_size = env.action_space.n 82 | print('state size:', state_size) 83 | print('action size:', action_size) 84 | 85 | q_net = QNet(state_size, action_size, args) 86 | target_q_net = QNet(state_size, action_size, args) 87 | optimizer = optim.Adam(q_net.parameters(), lr=0.001) 88 | 89 | update_target_model(q_net, target_q_net) 90 | 91 | writer = SummaryWriter(args.logdir) 92 | 93 | replay_buffer = deque(maxlen=10000) 94 | running_score = 0 95 | steps = 0 96 | 97 | for episode in range(args.max_iter_num): 98 | done = False 99 | score = 0 100 | 101 | state = env.reset() 102 | state = np.reshape(state, [1, state_size]) 103 | 104 | while not done: 105 | if args.render: 106 | env.render() 107 | 108 | steps += 1 109 | 110 | q_values = q_net(torch.Tensor(state)) 111 | action = get_action(q_values, action_size, args.epsilon) 112 | 113 | next_state, reward, done, _ = env.step(action) 114 | 115 | next_state = np.reshape(next_state, [1, state_size]) 116 | reward = reward if not done or score == 499 else -1 117 | mask = 0 if done else 1 118 | 119 | replay_buffer.append((state, action, reward, next_state, mask)) 120 | 121 | state = next_state 122 | score += reward 123 | 124 | if steps > args.initial_exploration: 125 | args.epsilon -= args.epsilon_decay 126 | args.epsilon = max(args.epsilon, 0.1) 127 | 128 | mini_batch = random.sample(replay_buffer, args.batch_size) 129 | 130 | q_net.train(), target_q_net.train() 131 | train_model(q_net, target_q_net, optimizer, mini_batch) 132 | 133 | if steps % args.update_target == 0: 134 | update_target_model(q_net, target_q_net) 135 | 136 | score = score if score == 500.0 else score + 1 137 | running_score = 0.99 * running_score + 0.01 * score 138 | 139 | if episode % args.log_interval == 0: 140 | print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format( 141 | episode, running_score, args.epsilon)) 142 | writer.add_scalar('log/score', float(score), episode) 143 | 144 | if running_score > args.goal_score: 145 | if not os.path.isdir(args.save_path): 146 | os.makedirs(args.save_path) 147 | 148 | ckpt_path = args.save_path + 'model.pth.tar' 149 | torch.save(q_net.state_dict(), ckpt_path) 150 | print('Running score exceeds 400. 
So end') 151 | break 152 | 153 | if __name__ == '__main__': 154 | main() -------------------------------------------------------------------------------- /cartpole/dqn/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /cartpole/dqn/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /cartpole/dqn/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/logs/.DS_Store -------------------------------------------------------------------------------- /cartpole/dqn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class QNet(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(QNet, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, action_size) 9 | 10 | def forward(self, x): 11 | x = torch.tanh(self.fc1(x)) 12 | q_values = self.fc2(x) 13 | return q_values -------------------------------------------------------------------------------- /cartpole/dqn/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/save_model/model.pth.tar -------------------------------------------------------------------------------- /cartpole/dqn/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from model import QNet 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.n 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | q_net = QNet(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | q_net.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | 
score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | q_value = q_net(torch.Tensor(state)) 52 | _, action = torch.max(q_value, 1) 53 | action = action.numpy()[0] 54 | 55 | next_state, reward, done, _ = env.step(action) 56 | 57 | next_state = np.reshape(next_state, [1, state_size]) 58 | reward = reward if not done or score == 499 else -1 59 | 60 | score += reward 61 | state = next_state 62 | 63 | if episode % args.log_interval == 0: 64 | print('{} episode | score: {:.2f}'.format(episode, score)) -------------------------------------------------------------------------------- /cartpole/dqn/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | from collections import deque 7 | 8 | import torch 9 | import torch.optim as optim 10 | 11 | from model import QNet 12 | from tensorboardX import SummaryWriter 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--env_name', type=str, default="CartPole-v1") 16 | parser.add_argument('--load_model', type=str, default=None) 17 | parser.add_argument('--save_path', default='./save_model/', help='') 18 | parser.add_argument('--render', action="store_true", default=False) 19 | parser.add_argument('--gamma', type=float, default=0.99) 20 | parser.add_argument('--hidden_size', type=int, default=64) 21 | parser.add_argument('--batch_size', type=int, default=32) 22 | parser.add_argument('--initial_exploration', type=int, default=1000) 23 | parser.add_argument('--epsilon', type=float, default=1.0) 24 | parser.add_argument('--epsilon_decay', type=float, default=0.00005) 25 | parser.add_argument('--update_target', type=int, default=100) 26 | parser.add_argument('--max_iter_num', type=int, default=1000) 27 | parser.add_argument('--log_interval', type=int, default=10) 28 | parser.add_argument('--goal_score', type=int, default=400) 29 | parser.add_argument('--logdir', type=str, default='./logs', 30 | help='tensorboardx logs directory') 31 | args = parser.parse_args() 32 | 33 | def train_model(q_net, target_q_net, optimizer, mini_batch): 34 | mini_batch = np.array(mini_batch) 35 | states = np.vstack(mini_batch[:, 0]) 36 | actions = list(mini_batch[:, 1]) 37 | rewards = list(mini_batch[:, 2]) 38 | next_states = np.vstack(mini_batch[:, 3]) 39 | masks = list(mini_batch[:, 4]) 40 | 41 | actions = torch.LongTensor(actions) 42 | rewards = torch.Tensor(rewards) 43 | masks = torch.Tensor(masks) 44 | 45 | criterion = torch.nn.MSELoss() 46 | 47 | # get Q-value 48 | q_values = q_net(torch.Tensor(states)) 49 | q_value = q_values.gather(1, actions.unsqueeze(1)).view(-1) 50 | 51 | # get target 52 | target_next_q_values = target_q_net(torch.Tensor(next_states)) 53 | target = rewards + masks * args.gamma * target_next_q_values.max(1)[0] 54 | 55 | loss = criterion(q_value, target.detach()) 56 | optimizer.zero_grad() 57 | loss.backward() 58 | optimizer.step() 59 | 60 | def get_action(q_values, action_size, epsilon): 61 | if np.random.rand() <= epsilon: 62 | return random.randrange(action_size) 63 | else: 64 | _, action = torch.max(q_values, 1) 65 | return action.numpy()[0] 66 | 67 | def update_target_model(q_net, target_q_net): 68 | target_q_net.load_state_dict(q_net.state_dict()) 69 | 70 | 71 | def main(): 72 | env = gym.make(args.env_name) 73 | env.seed(500) 74 | torch.manual_seed(500) 75 | 76 | state_size = 
env.observation_space.shape[0] 77 | action_size = env.action_space.n 78 | print('state size:', state_size) 79 | print('action size:', action_size) 80 | 81 | q_net = QNet(state_size, action_size, args) 82 | target_q_net = QNet(state_size, action_size, args) 83 | optimizer = optim.Adam(q_net.parameters(), lr=0.001) 84 | 85 | update_target_model(q_net, target_q_net) 86 | 87 | writer = SummaryWriter(args.logdir) 88 | 89 | replay_buffer = deque(maxlen=10000) 90 | running_score = 0 91 | steps = 0 92 | 93 | for episode in range(args.max_iter_num): 94 | done = False 95 | score = 0 96 | 97 | state = env.reset() 98 | state = np.reshape(state, [1, state_size]) 99 | 100 | while not done: 101 | if args.render: 102 | env.render() 103 | 104 | steps += 1 105 | 106 | q_values = q_net(torch.Tensor(state)) 107 | action = get_action(q_values, action_size, args.epsilon) 108 | 109 | next_state, reward, done, _ = env.step(action) 110 | 111 | next_state = np.reshape(next_state, [1, state_size]) 112 | reward = reward if not done or score == 499 else -1 113 | mask = 0 if done else 1 114 | 115 | replay_buffer.append((state, action, reward, next_state, mask)) 116 | 117 | state = next_state 118 | score += reward 119 | 120 | if steps > args.initial_exploration: 121 | args.epsilon -= args.epsilon_decay 122 | args.epsilon = max(args.epsilon, 0.1) 123 | 124 | mini_batch = random.sample(replay_buffer, args.batch_size) 125 | 126 | q_net.train(), target_q_net.train() 127 | train_model(q_net, target_q_net, optimizer, mini_batch) 128 | 129 | if steps % args.update_target == 0: 130 | update_target_model(q_net, target_q_net) 131 | 132 | score = score if score == 500.0 else score + 1 133 | running_score = 0.99 * running_score + 0.01 * score 134 | 135 | if episode % args.log_interval == 0: 136 | print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format( 137 | episode, running_score, args.epsilon)) 138 | writer.add_scalar('log/score', float(score), episode) 139 | 140 | if running_score > args.goal_score: 141 | if not os.path.isdir(args.save_path): 142 | os.makedirs(args.save_path) 143 | 144 | ckpt_path = args.save_path + 'model.pth.tar' 145 | torch.save(q_net.state_dict(), ckpt_path) 146 | print('Running score exceeds 400. 
So end') 147 | break 148 | 149 | if __name__ == '__main__': 150 | main() -------------------------------------------------------------------------------- /img/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/.DS_Store -------------------------------------------------------------------------------- /img/cartpole.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/cartpole.png -------------------------------------------------------------------------------- /img/pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/pendulum.png -------------------------------------------------------------------------------- /mountaincar/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/.DS_Store -------------------------------------------------------------------------------- /mountaincar/app/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/.DS_Store -------------------------------------------------------------------------------- /mountaincar/app/__pycache__/app.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/__pycache__/app.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/app/__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/app/app.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cvxpy as cp 3 | from train import idx_state 4 | 5 | class FeatureEstimate: 6 | def __init__(self, feature_num, env): 7 | self.env = env 8 | self.feature_num = feature_num 9 | self.feature = np.ones(self.feature_num) 10 | 11 | def gaussian_function(self, x, mu): 12 | return np.exp(-np.power(x - mu, 2.) 
/ (2 * np.power(1., 2.))) 13 | 14 | def get_features(self, state): 15 | env_low = self.env.observation_space.low 16 | env_high = self.env.observation_space.high 17 | env_distance = (env_high - env_low) / (self.feature_num - 1) 18 | 19 | for i in range(int(self.feature_num/2)): 20 | # position 21 | self.feature[i] = self.gaussian_function(state[0], 22 | env_low[0] + i * env_distance[0]) 23 | # velocity 24 | self.feature[i+int(self.feature_num/2)] = self.gaussian_function(state[1], 25 | env_low[1] + i * env_distance[1]) 26 | 27 | return self.feature 28 | 29 | 30 | def calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env): 31 | feature_estimate = FeatureEstimate(feature_num, env) 32 | feature_expectations = np.zeros(feature_num) 33 | demo_num = len(demonstrations) 34 | 35 | for _ in range(demo_num): 36 | state = env.reset() 37 | demo_length = 0 38 | done = False 39 | 40 | while not done: 41 | demo_length += 1 42 | 43 | state_idx = idx_state(env, state) 44 | action = np.argmax(q_table[state_idx]) 45 | next_state, reward, done, _ = env.step(action) 46 | 47 | features = feature_estimate.get_features(next_state) 48 | feature_expectations += (gamma**(demo_length)) * np.array(features) 49 | 50 | state = next_state 51 | 52 | feature_expectations = feature_expectations/ demo_num 53 | 54 | return feature_expectations 55 | 56 | def expert_feature_expectation(feature_num, gamma, demonstrations, env): 57 | feature_estimate = FeatureEstimate(feature_num, env) 58 | feature_expectations = np.zeros(feature_num) 59 | 60 | for demo_num in range(len(demonstrations)): 61 | for demo_length in range(len(demonstrations[0])): 62 | state = demonstrations[demo_num][demo_length] 63 | features = feature_estimate.get_features(state) 64 | feature_expectations += (gamma**(demo_length)) * np.array(features) 65 | 66 | feature_expectations = feature_expectations / len(demonstrations) 67 | 68 | return feature_expectations 69 | 70 | def QP_optimizer(feature_num, learner, expert): 71 | w = cp.Variable(feature_num) 72 | 73 | obj_func = cp.Minimize(cp.norm(w)) 74 | constraints = [(expert-learner) * w >= 2] 75 | 76 | prob = cp.Problem(obj_func, constraints) 77 | prob.solve() 78 | 79 | if prob.status == "optimal": 80 | print("status:", prob.status) 81 | print("optimal value", prob.value) 82 | 83 | weights = np.squeeze(np.asarray(w.value)) 84 | return weights, prob.status 85 | else: 86 | print("status:", prob.status) 87 | 88 | weights = np.zeros(feature_num) 89 | return weights, prob.status 90 | 91 | 92 | def add_feature_expectation(learner, temp_learner): 93 | # save new feature expectation to list after RL step 94 | learner = np.vstack([learner, temp_learner]) 95 | return learner 96 | 97 | def subtract_feature_expectation(learner): 98 | # if status is infeasible, subtract first feature expectation 99 | learner = learner[1:][:] 100 | return learner -------------------------------------------------------------------------------- /mountaincar/app/expert_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/expert_demo/.DS_Store -------------------------------------------------------------------------------- /mountaincar/app/expert_demo/expert_demo.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/expert_demo/expert_demo.npy -------------------------------------------------------------------------------- /mountaincar/app/expert_demo/make_expert.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import readchar 3 | import numpy as np 4 | 5 | # MACROS 6 | Push_Left = 0 7 | No_Push = 1 8 | Push_Right = 2 9 | 10 | # Key mapping 11 | arrow_keys = { 12 | '\x1b[D': Push_Left, 13 | '\x1b[B': No_Push, 14 | '\x1b[C': Push_Right} 15 | 16 | env = gym.make('MountainCar-v0') 17 | 18 | trajectories = [] 19 | episode_step = 0 20 | 21 | for episode in range(20): # n_trajectories : 20 22 | trajectory = [] 23 | step = 0 24 | 25 | env.reset() 26 | print("episode_step", episode_step) 27 | 28 | while True: 29 | env.render() 30 | print("step", step) 31 | 32 | key = readchar.readkey() 33 | if key not in arrow_keys.keys(): 34 | break 35 | 36 | action = arrow_keys[key] 37 | state, reward, done, _ = env.step(action) 38 | 39 | if state[0] >= env.env.goal_position and step > 129: # trajectory_length : 130 40 | break 41 | 42 | trajectory.append((state[0], state[1], action)) 43 | step += 1 44 | 45 | trajectory_numpy = np.array(trajectory, float) 46 | print("trajectory_numpy.shape", trajectory_numpy.shape) 47 | episode_step += 1 48 | trajectories.append(trajectory) 49 | 50 | np_trajectories = np.array(trajectories, float) 51 | print("np_trajectories.shape", np_trajectories.shape) 52 | 53 | np.save("expert_demo", arr=np_trajectories) -------------------------------------------------------------------------------- /mountaincar/app/learning_curves/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/learning_curves/.DS_Store -------------------------------------------------------------------------------- /mountaincar/app/learning_curves/app_eps_60000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/learning_curves/app_eps_60000.png -------------------------------------------------------------------------------- /mountaincar/app/results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/.DS_Store -------------------------------------------------------------------------------- /mountaincar/app/results/app_q_table.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/app_q_table.npy -------------------------------------------------------------------------------- /mountaincar/app/results/test_rendering_60000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/test_rendering_60000.gif -------------------------------------------------------------------------------- /mountaincar/app/test.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random 4 | import sys 5 | import cvxpy as cp 6 | 7 | N_idx = 20 8 | F_idx = 4 9 | GAMMA = 0.99 10 | 11 | def idx_to_state(env, state): 12 | env_low = env.observation_space.low 13 | env_high = env.observation_space.high 14 | env_distance = (env_high - env_low) / N_idx 15 | position_idx = int((state[0] - env_low[0]) / env_distance[0]) 16 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) 17 | state_idx = position_idx + velocity_idx * N_idx 18 | return state_idx 19 | 20 | 21 | if __name__ == '__main__': 22 | print(":: Testing APP-learning.\n") 23 | 24 | # Load the agent 25 | n_states = N_idx**2 # position - 20, velocity - 20 26 | n_actions = 3 27 | q_table = np.load(file="results/app_q_table.npy") 28 | 29 | # Create a new game instance. 30 | env = gym.make('MountainCar-v0') 31 | n_episode = 10 # test the agent 10times 32 | scores = [] 33 | 34 | for ep in range(n_episode): 35 | state = env.reset() 36 | score = 0 37 | 38 | while True: 39 | # Render the play 40 | env.render() 41 | 42 | state_idx = idx_to_state(env, state) 43 | 44 | action = np.argmax(q_table[state_idx]) 45 | 46 | next_state, reward, done, _ = env.step(action) 47 | next_state_idx = idx_to_state(env, next_state) 48 | 49 | score += reward 50 | state = next_state 51 | 52 | if done: 53 | print('{} episode | score: {:.1f}'.format(ep + 1, score)) 54 | 55 | break 56 | 57 | env.close() 58 | sys.exit() -------------------------------------------------------------------------------- /mountaincar/app/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym 3 | import pylab 4 | import numpy as np 5 | 6 | from app import * 7 | 8 | n_states = 400 # position - 20, velocity - 20 9 | n_actions = 3 10 | one_feature = 20 # number of state per one feature 11 | feature_num = 4 12 | q_table = np.zeros((n_states, n_actions)) # (400, 3) 13 | 14 | gamma = 0.99 15 | q_learning_rate = 0.03 16 | 17 | def idx_state(env, state): 18 | env_low = env.observation_space.low 19 | env_high = env.observation_space.high 20 | env_distance = (env_high - env_low) / one_feature 21 | positioone_feature = int((state[0] - env_low[0]) / env_distance[0]) 22 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) 23 | state_idx = positioone_feature + velocity_idx * one_feature 24 | return state_idx 25 | 26 | def update_q_table(state, action, reward, next_state): 27 | q_1 = q_table[state][action] 28 | q_2 = reward + gamma * max(q_table[next_state]) 29 | q_table[state][action] += q_learning_rate * (q_2 - q_1) 30 | 31 | 32 | def main(): 33 | env = gym.make('MountainCar-v0') 34 | demonstrations = np.load(file="expert_demo/expert_demo.npy") 35 | 36 | feature_estimate = FeatureEstimate(feature_num, env) 37 | 38 | learner = calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env) 39 | learner = np.matrix([learner]) 40 | 41 | expert = expert_feature_expectation(feature_num, gamma, demonstrations, env) 42 | expert = np.matrix([expert]) 43 | 44 | w, status = QP_optimizer(feature_num, learner, expert) 45 | 46 | 47 | episodes, scores = [], [] 48 | 49 | for episode in range(60000): 50 | state = env.reset() 51 | score = 0 52 | 53 | while True: 54 | state_idx = idx_state(env, state) 55 | action = np.argmax(q_table[state_idx]) 56 | next_state, reward, done, _ = env.step(action) 57 | 58 | features = feature_estimate.get_features(state) 59 | irl_reward = np.dot(w, 
features) 60 | 61 | next_state_idx = idx_state(env, next_state) 62 | update_q_table(state_idx, action, irl_reward, next_state_idx) 63 | 64 | score += reward 65 | state = next_state 66 | 67 | if done: 68 | scores.append(score) 69 | episodes.append(episode) 70 | break 71 | 72 | if episode % 1000 == 0: 73 | score_avg = np.mean(scores) 74 | print('{} episode score is {:.2f}'.format(episode, score_avg)) 75 | # pylab.plot(episodes, scores, 'b') 76 | # pylab.savefig("./learning_curves/app_eps_60000.png") 77 | # np.save("./results/app_q_table", arr=q_table) 78 | 79 | if episode % 5000 == 0: 80 | # optimize weight per 5000 episode 81 | status = "infeasible" 82 | temp_learner = calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env) 83 | learner = add_feature_expectation(learner, temp_learner) 84 | 85 | while status=="infeasible": 86 | w, status = QP_optimizer(feature_num, learner, expert) 87 | if status=="infeasible": 88 | learner = subtract_feature_expectation(learner) 89 | 90 | if __name__ == '__main__': 91 | main() -------------------------------------------------------------------------------- /mountaincar/ddpg/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/ddpg/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/ddpg/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.relu(self.fc1(x)) 13 | x = torch.relu(self.fc2(x)) 14 | policy = self.fc3(x) 15 | 16 | return policy 17 | 18 | class Critic(nn.Module): 19 | def __init__(self, state_size, action_size, args): 20 | super(Critic, self).__init__() 21 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size) 22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 23 | self.fc3 = nn.Linear(args.hidden_size, 1) 24 | 25 | def forward(self, states, actions): 26 | x = torch.cat([states, actions], dim=1) 27 | x = torch.relu(self.fc1(x)) 28 | x = torch.relu(self.fc2(x)) 29 | q_value = self.fc3(x) 30 | 31 | return q_value -------------------------------------------------------------------------------- /mountaincar/ddpg/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/save_model/model.pth.tar -------------------------------------------------------------------------------- /mountaincar/ddpg/test.py: -------------------------------------------------------------------------------- 1 | 
import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from utils import * 9 | from model import Actor, Critic 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="MountainCarContinuous-v0") 13 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 14 | parser.add_argument('--render', action="store_true", default=True) 15 | parser.add_argument('--hidden_size', type=int, default=64) 16 | parser.add_argument('--theta', type=float, default=0.15) 17 | parser.add_argument('--mu', type=float, default=0.0) 18 | parser.add_argument('--sigma', type=float, default=0.2) 19 | parser.add_argument('--iter', type=int, default=10000) 20 | parser.add_argument('--log_interval', type=int, default=10) 21 | args = parser.parse_args() 22 | 23 | if __name__=="__main__": 24 | env = gym.make(args.env_name) 25 | env.seed(500) 26 | torch.manual_seed(500) 27 | 28 | state_size = env.observation_space.shape[0] 29 | action_size = env.action_space.shape[0] 30 | print('state size:', state_size) 31 | print('action size:', action_size) 32 | 33 | actor = Actor(state_size, action_size, args) 34 | 35 | if args.load_model is not None: 36 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 37 | pretrained_model = torch.load(pretrained_model_path) 38 | actor.load_state_dict(pretrained_model) 39 | 40 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma) 41 | steps = 0 42 | 43 | for episode in range(args.iter): 44 | done = False 45 | score = 0 46 | 47 | state = env.reset() 48 | state = np.reshape(state, [1, state_size]) 49 | 50 | while not done: 51 | if args.render: 52 | env.render() 53 | 54 | steps += 1 55 | 56 | policy = actor(torch.Tensor(state)) 57 | action = get_action(policy, ou_noise) 58 | 59 | next_state, reward, done, _ = env.step(action) 60 | 61 | next_state = np.reshape(next_state, [1, state_size]) 62 | state = next_state 63 | score += reward 64 | 65 | if episode % args.log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format(episode, score)) -------------------------------------------------------------------------------- /mountaincar/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | class OUNoise: 5 | def __init__(self, action_size, theta, mu, sigma): 6 | self.action_size = action_size 7 | self.theta = theta 8 | self.mu = mu 9 | self.sigma = sigma 10 | self.X = np.zeros(self.action_size) 11 | 12 | def sample(self): 13 | dx = self.theta * (self.mu - self.X) 14 | dx = dx + self.sigma * np.random.randn(len(self.X)) 15 | self.X = self.X + dx 16 | 17 | return self.X 18 | 19 | def get_action(policy, ou_noise): 20 | action = policy.detach().numpy() + ou_noise.sample() 21 | 22 | return action 23 | 24 | def hard_target_update(actor, critic, target_actor, target_critic): 25 | target_critic.load_state_dict(critic.state_dict()) 26 | target_actor.load_state_dict(actor.state_dict()) 27 | 28 | def soft_target_update(actor, critic, target_actor, target_critic, tau): 29 | soft_update(critic, target_critic, tau) 30 | soft_update(actor, target_actor, tau) 31 | 32 | def soft_update(net, target_net, tau): 33 | for param, target_param in zip(net.parameters(), target_net.parameters()): 34 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) -------------------------------------------------------------------------------- 
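The DDPG update step itself is not shown in the files above, so the following is a minimal, hedged sketch of how Actor/Critic from model.py and the helpers from utils.py are typically wired together. It is an illustration only, not the repository's training script: the mini-batch tensors (states, actions, rewards, next_states, masks, with rewards and masks as (batch, 1) column tensors), the hyperparameter values, and the args namespace are assumptions.

import argparse
import torch
import torch.nn.functional as F

from model import Actor, Critic
from utils import hard_target_update, soft_target_update

# Hypothetical hyperparameters; the actual training script may use different values.
args = argparse.Namespace(hidden_size=64, gamma=0.99, tau=0.001)

state_size, action_size = 2, 1  # MountainCarContinuous-v0
actor = Actor(state_size, action_size, args)
critic = Critic(state_size, action_size, args)
target_actor = Actor(state_size, action_size, args)
target_critic = Critic(state_size, action_size, args)
hard_target_update(actor, critic, target_actor, target_critic)

actor_optim = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_optim = torch.optim.Adam(critic.parameters(), lr=1e-3)

def ddpg_update(states, actions, rewards, next_states, masks):
    # Critic: regress Q(s, a) toward r + gamma * Q'(s', pi'(s')).
    with torch.no_grad():
        next_q = target_critic(next_states, target_actor(next_states))
        target_q = rewards + args.gamma * masks * next_q
    critic_loss = F.mse_loss(critic(states, actions), target_q)
    critic_optim.zero_grad()
    critic_loss.backward()
    critic_optim.step()

    # Actor: maximize Q(s, pi(s)), i.e. the deterministic policy gradient.
    actor_loss = -critic(states, actor(states)).mean()
    actor_optim.zero_grad()
    actor_loss.backward()
    actor_optim.step()

    # Polyak-average the target networks toward the online networks.
    soft_target_update(actor, critic, target_actor, target_critic, args.tau)

During data collection, exploration noise would be added the same way test.py does it: policy = actor(torch.Tensor(state)), then action = get_action(policy, ou_noise) with an OUNoise instance from utils.py.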
/mountaincar/maxent/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/.DS_Store -------------------------------------------------------------------------------- /mountaincar/maxent/__pycache__/maxent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/maxent.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/maxent/__pycache__/maxent_train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/maxent_train.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/maxent/__pycache__/train.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/maxent/expert_demo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/expert_demo/.DS_Store -------------------------------------------------------------------------------- /mountaincar/maxent/expert_demo/expert_demo.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/expert_demo/expert_demo.npy -------------------------------------------------------------------------------- /mountaincar/maxent/expert_demo/make_expert.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import readchar 3 | import numpy as np 4 | 5 | # # MACROS 6 | Push_Left = 0 7 | No_Push = 1 8 | Push_Right = 2 9 | 10 | # Key mapping 11 | arrow_keys = { 12 | '\x1b[D': Push_Left, 13 | '\x1b[B': No_Push, 14 | '\x1b[C': Push_Right} 15 | 16 | env = gym.make('MountainCar-v0') 17 | 18 | trajectories = [] 19 | episode_step = 0 20 | 21 | for episode in range(20): # n_trajectories : 20 22 | trajectory = [] 23 | step = 0 24 | 25 | env.reset() 26 | print("episode_step", episode_step) 27 | 28 | while True: 29 | env.render() 30 | print("step", step) 31 | 32 | key = readchar.readkey() 33 | if key not in arrow_keys.keys(): 34 | break 35 | 36 | action = arrow_keys[key] 37 | state, reward, done, _ = env.step(action) 38 | 39 | if state[0] >= env.env.goal_position and step > 129: # trajectory_length : 130 40 | break 41 | 42 | trajectory.append((state[0], state[1], action)) 43 | step += 1 44 | 45 | # trajectory_numpy = np.array(trajectory, float) 46 | # print("trajectory_numpy.shape", trajectory_numpy.shape) 47 | # episode_step += 1 48 | # trajectories.append(trajectory) 49 | 50 | # np_trajectories = np.array(trajectories, float) 51 | # 
print("np_trajectories.shape", np_trajectories.shape) 52 | 53 | # np.save("expert_trajectories", arr=np_trajectories) -------------------------------------------------------------------------------- /mountaincar/maxent/learning_curves/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/learning_curves/.DS_Store -------------------------------------------------------------------------------- /mountaincar/maxent/learning_curves/maxent_eps_30000.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/learning_curves/maxent_eps_30000.png -------------------------------------------------------------------------------- /mountaincar/maxent/maxent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def get_reward(feature_matrix, theta, n_states, state_idx): 4 | irl_rewards = feature_matrix.dot(theta).reshape((n_states,)) 5 | return irl_rewards[state_idx] 6 | 7 | 8 | def expert_feature_expectations(feature_matrix, demonstrations): 9 | feature_expectations = np.zeros(feature_matrix.shape[0]) 10 | 11 | for demonstration in demonstrations: 12 | for state_idx, _, _ in demonstration: 13 | feature_expectations += feature_matrix[int(state_idx)] 14 | 15 | feature_expectations /= demonstrations.shape[0] 16 | return feature_expectations 17 | 18 | def maxent_irl(expert, learner, theta, learning_rate): 19 | gradient = expert - learner 20 | theta += learning_rate * gradient 21 | 22 | # Clip theta 23 | for j in range(len(theta)): 24 | if theta[j] > 0: 25 | theta[j] = 0 -------------------------------------------------------------------------------- /mountaincar/maxent/results/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/.DS_Store -------------------------------------------------------------------------------- /mountaincar/maxent/results/maxent_q_table.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/maxent_q_table.npy -------------------------------------------------------------------------------- /mountaincar/maxent/results/test_rendering_30000.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/test_rendering_30000.gif -------------------------------------------------------------------------------- /mountaincar/maxent/test.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pylab 3 | import numpy as np 4 | 5 | q_table = np.load(file="results/maxent_20_epoch_100000_epi_test.npy") # (400, 3) 6 | one_feature = 20 # number of state per one feature 7 | 8 | def idx_to_state(env, state): 9 | """ Convert pos and vel about mounting car environment to the integer value""" 10 | env_low = env.observation_space.low 11 | env_high = 
env.observation_space.high 12 | env_distance = (env_high - env_low) / one_feature 13 | position_idx = int((state[0] - env_low[0]) / env_distance[0]) 14 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) 15 | state_idx = position_idx + velocity_idx * one_feature 16 | return state_idx 17 | 18 | def main(): 19 | env = gym.make('MountainCar-v0') 20 | 21 | episodes, scores = [], [] 22 | 23 | for episode in range(10): 24 | state = env.reset() 25 | score = 0 26 | 27 | while True: 28 | env.render() 29 | state_idx = idx_to_state(env, state) 30 | action = np.argmax(q_table[state_idx]) 31 | next_state, reward, done, _ = env.step(action) 32 | 33 | score += reward 34 | state = next_state 35 | 36 | if done: 37 | scores.append(score) 38 | episodes.append(episode) 39 | pylab.plot(episodes, scores, 'b') 40 | pylab.savefig("./learning_curves/maxent_test.png") 41 | break 42 | 43 | if episode % 1 == 0: 44 | print('{} episode score is {:.2f}'.format(episode, score)) 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /mountaincar/maxent/train.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pylab 3 | import numpy as np 4 | 5 | from maxent import * 6 | 7 | n_states = 400 # position - 20, velocity - 20 8 | n_actions = 3 9 | one_feature = 20 # number of state per one feature 10 | q_table = np.zeros((n_states, n_actions)) # (400, 3) 11 | feature_matrix = np.eye((n_states)) # (400, 400) 12 | 13 | gamma = 0.99 14 | q_learning_rate = 0.03 15 | theta_learning_rate = 0.05 16 | 17 | np.random.seed(1) 18 | 19 | def idx_demo(env, one_feature): 20 | env_low = env.observation_space.low 21 | env_high = env.observation_space.high 22 | env_distance = (env_high - env_low) / one_feature 23 | 24 | raw_demo = np.load(file="expert_demo/expert_demo.npy") 25 | demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3)) 26 | 27 | for x in range(len(raw_demo)): 28 | for y in range(len(raw_demo[0])): 29 | position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0]) 30 | velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1]) 31 | state_idx = position_idx + velocity_idx * one_feature 32 | 33 | demonstrations[x][y][0] = state_idx 34 | demonstrations[x][y][1] = raw_demo[x][y][2] 35 | 36 | return demonstrations 37 | 38 | def idx_state(env, state): 39 | env_low = env.observation_space.low 40 | env_high = env.observation_space.high 41 | env_distance = (env_high - env_low) / one_feature 42 | position_idx = int((state[0] - env_low[0]) / env_distance[0]) 43 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1]) 44 | state_idx = position_idx + velocity_idx * one_feature 45 | return state_idx 46 | 47 | def update_q_table(state, action, reward, next_state): 48 | q_1 = q_table[state][action] 49 | q_2 = reward + gamma * max(q_table[next_state]) 50 | q_table[state][action] += q_learning_rate * (q_2 - q_1) 51 | 52 | 53 | def main(): 54 | env = gym.make('MountainCar-v0') 55 | demonstrations = idx_demo(env, one_feature) 56 | 57 | learner_feature_expectations = np.zeros(n_states) 58 | 59 | theta = -(np.random.uniform(size=(n_states,))) 60 | 61 | episodes, scores = [], [] 62 | 63 | for episode in range(30000): 64 | state = env.reset() 65 | score = 0 66 | 67 | if episode != 0 and episode == 10000 or (episode > 10000 and episode % 5000 == 0): 68 | expert = expert_feature_expectations(feature_matrix, demonstrations) 69 | learner = learner_feature_expectations / 
episode 70 | maxent_irl(expert, learner, theta, theta_learning_rate) 71 | 72 | while True: 73 | state_idx = idx_state(env, state) 74 | action = np.argmax(q_table[state_idx]) 75 | next_state, reward, done, _ = env.step(action) 76 | 77 | irl_reward = get_reward(feature_matrix, theta, n_states, state_idx) 78 | next_state_idx = idx_state(env, next_state) 79 | update_q_table(state_idx, action, irl_reward, next_state_idx) 80 | 81 | learner_feature_expectations += feature_matrix[int(state_idx)] 82 | 83 | score += reward 84 | state = next_state 85 | 86 | if done: 87 | scores.append(score) 88 | episodes.append(episode) 89 | break 90 | 91 | if episode % 1000 == 0: 92 | score_avg = np.mean(scores) 93 | print('{} episode score is {:.2f}'.format(episode, score_avg)) 94 | pylab.plot(episodes, scores, 'b') 95 | pylab.savefig("./learning_curves/maxent_30000.png") 96 | np.save("./results/maxent_q_table", arr=q_table) 97 | 98 | if __name__ == '__main__': 99 | main() -------------------------------------------------------------------------------- /mountaincar/sac/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/.DS_Store -------------------------------------------------------------------------------- /mountaincar/sac/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/sac/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mountaincar/sac/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args, log_std_min=-20, log_std_max=2): 6 | super(Actor, self).__init__() 7 | self.log_std_min = log_std_min 8 | self.log_std_max = log_std_max 9 | 10 | self.fc1 = nn.Linear(state_size, args.hidden_size) 11 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 12 | 13 | self.fc3 = nn.Linear(args.hidden_size, action_size) 14 | self.fc4 = nn.Linear(args.hidden_size, action_size) 15 | 16 | def forward(self, x): 17 | x = torch.relu(self.fc1(x)) 18 | x = torch.relu(self.fc2(x)) 19 | 20 | mu = self.fc3(x) 21 | log_std = self.fc4(x) 22 | 23 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max) 24 | std = torch.exp(log_std) 25 | 26 | return mu, std 27 | 28 | class Critic(nn.Module): 29 | def __init__(self, state_size, action_size, args): 30 | super(Critic, self).__init__() 31 | 32 | # Q1 architecture 33 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size) 34 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 35 | self.fc3 = nn.Linear(args.hidden_size, 1) 36 | 37 | # Q2 architecture 38 | self.fc4 = nn.Linear(state_size + action_size, args.hidden_size) 39 | self.fc5 = nn.Linear(args.hidden_size, args.hidden_size) 40 | self.fc6 = 
nn.Linear(args.hidden_size, 1) 41 | 42 | def forward(self, states, actions): 43 | x = torch.cat([states, actions], dim=1) 44 | 45 | x1 = torch.relu(self.fc1(x)) 46 | x1 = torch.relu(self.fc2(x1)) 47 | q_value1 = self.fc3(x1) 48 | 49 | x2 = torch.relu(self.fc4(x)) 50 | x2 = torch.relu(self.fc5(x2)) 51 | q_value2 = self.fc6(x2) 52 | 53 | return q_value1, q_value2 -------------------------------------------------------------------------------- /mountaincar/sac/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from utils import * 9 | from model import Actor, Critic 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="MountainCarContinuous-v0") 13 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 14 | parser.add_argument('--render', action="store_true", default=True) 15 | parser.add_argument('--hidden_size', type=int, default=64) 16 | parser.add_argument('--iter', type=int, default=10000) 17 | parser.add_argument('--log_interval', type=int, default=10) 18 | args = parser.parse_args() 19 | 20 | if __name__=="__main__": 21 | env = gym.make(args.env_name) 22 | env.seed(500) 23 | torch.manual_seed(500) 24 | 25 | state_size = env.observation_space.shape[0] 26 | action_size = env.action_space.shape[0] 27 | print('state size:', state_size) 28 | print('action size:', action_size) 29 | 30 | actor = Actor(state_size, action_size, args) 31 | 32 | if args.load_model is not None: 33 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 34 | pretrained_model = torch.load(pretrained_model_path) 35 | actor.load_state_dict(pretrained_model) 36 | 37 | steps = 0 38 | 39 | for episode in range(args.iter): 40 | done = False 41 | score = 0 42 | 43 | state = env.reset() 44 | state = np.reshape(state, [1, state_size]) 45 | 46 | while not done: 47 | if args.render: 48 | env.render() 49 | 50 | steps += 1 51 | 52 | mu, std = actor(torch.Tensor(state)) 53 | action = get_action(mu, std) 54 | 55 | next_state, reward, done, _ = env.step(action) 56 | 57 | next_state = np.reshape(next_state, [1, state_size]) 58 | state = next_state 59 | score += reward 60 | 61 | if episode % args.log_interval == 0: 62 | print('{} episode | score: {:.2f}'.format(episode, score)) -------------------------------------------------------------------------------- /mountaincar/sac/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions import Normal 3 | 4 | def get_action(mu, std): 5 | normal = Normal(mu, std) 6 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1)) 7 | action = torch.tanh(z) 8 | 9 | return action.data.numpy() 10 | 11 | def eval_action(mu, std, epsilon=1e-6): 12 | normal = Normal(mu, std) 13 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1)) 14 | action = torch.tanh(z) 15 | log_prob = normal.log_prob(z) 16 | 17 | # Enforcing Action Bounds 18 | log_prob -= torch.log(1 - action.pow(2) + epsilon) 19 | log_policy = log_prob.sum(1, keepdim=True) 20 | 21 | return action, log_policy 22 | 23 | def hard_target_update(net, target_net): 24 | target_net.load_state_dict(net.state_dict()) 25 | 26 | def soft_target_update(net, target_net, tau): 27 | for param, target_param in zip(net.parameters(), target_net.parameters()): 28 | target_param.data.copy_(tau * param.data + (1.0 
- tau) * target_param.data) -------------------------------------------------------------------------------- /mujoco/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/.DS_Store -------------------------------------------------------------------------------- /mujoco/gail/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/.DS_Store -------------------------------------------------------------------------------- /mujoco/gail/__pycache__/hparams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/hparams.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/__pycache__/train_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/train_model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/expert_demo/expert_demo.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/expert_demo/expert_demo.p -------------------------------------------------------------------------------- /mujoco/gail/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/logs/.DS_Store -------------------------------------------------------------------------------- /mujoco/gail/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, num_inputs, num_outputs, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs) 10 | 11 | self.fc3.weight.data.mul_(0.1) 12 | self.fc3.bias.data.mul_(0.0) 13 | 14 | def forward(self, x): 15 | x = torch.tanh(self.fc1(x)) 16 | x = torch.tanh(self.fc2(x)) 17 | mu = self.fc3(x) 18 | logstd = torch.zeros_like(mu) 19 | std = torch.exp(logstd) 20 | return mu, std 21 | 22 | 23 | class Critic(nn.Module): 24 | def __init__(self, num_inputs, args): 25 | super(Critic, self).__init__() 26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 28 | self.fc3 = 
nn.Linear(args.hidden_size, 1) 29 | 30 | self.fc3.weight.data.mul_(0.1) 31 | self.fc3.bias.data.mul_(0.0) 32 | 33 | def forward(self, x): 34 | x = torch.tanh(self.fc1(x)) 35 | x = torch.tanh(self.fc2(x)) 36 | v = self.fc3(x) 37 | return v 38 | 39 | 40 | class Discriminator(nn.Module): 41 | def __init__(self, num_inputs, args): 42 | super(Discriminator, self).__init__() 43 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 44 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 45 | self.fc3 = nn.Linear(args.hidden_size, 1) 46 | 47 | self.fc3.weight.data.mul_(0.1) 48 | self.fc3.bias.data.mul_(0.0) 49 | 50 | def forward(self, x): 51 | x = torch.tanh(self.fc1(x)) 52 | x = torch.tanh(self.fc2(x)) 53 | prob = torch.sigmoid(self.fc3(x)) 54 | return prob -------------------------------------------------------------------------------- /mujoco/gail/save_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/save_model/.DS_Store -------------------------------------------------------------------------------- /mujoco/gail/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import argparse 5 | 6 | from model import Actor, Critic 7 | from utils.utils import get_action 8 | from utils.running_state import ZFilter 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', type=str, default="Hopper-v2", 12 | help='name of Mujoco environement') 13 | parser.add_argument('--iter', type=int, default=5, 14 | help='number of episodes to play') 15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar', 16 | help="if you test pretrained file, write filename in save_model folder") 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | if __name__ == "__main__": 22 | env = gym.make(args.env) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.shape[0] 28 | 29 | print("state size: ", num_inputs) 30 | print("action size: ", num_actions) 31 | 32 | actor = Actor(num_inputs, num_actions) 33 | critic = Critic(num_inputs) 34 | 35 | running_state = ZFilter((num_inputs,), clip=5) 36 | 37 | if args.load_model is not None: 38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 39 | 40 | pretrained_model = torch.load(pretrained_model_path) 41 | 42 | actor.load_state_dict(pretrained_model['actor']) 43 | critic.load_state_dict(pretrained_model['critic']) 44 | 45 | running_state.rs.n = pretrained_model['z_filter_n'] 46 | running_state.rs.mean = pretrained_model['z_filter_m'] 47 | running_state.rs.sum_square = pretrained_model['z_filter_s'] 48 | 49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n)) 50 | 51 | else: 52 | assert("Should write pretrained filename in save_model folder. 
ex) python3 test_algo.py --load_model ppo_max.tar") 53 | 54 | 55 | actor.eval(), critic.eval() 56 | for episode in range(args.iter): 57 | state = env.reset() 58 | steps = 0 59 | score = 0 60 | for _ in range(10000): 61 | env.render() 62 | mu, std, _ = actor(torch.Tensor(state).unsqueeze(0)) 63 | action = get_action(mu, std)[0] 64 | 65 | next_state, reward, done, _ = env.step(action) 66 | next_state = running_state(next_state) 67 | 68 | state = next_state 69 | score += reward 70 | 71 | if done: 72 | print("{} cumulative reward: {}".format(episode, score)) 73 | break 74 | -------------------------------------------------------------------------------- /mujoco/gail/train_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utils.utils import get_entropy, log_prob_density 4 | 5 | def train_discrim(discrim, memory, discrim_optim, demonstrations, args): 6 | memory = np.array(memory) 7 | states = np.vstack(memory[:, 0]) 8 | actions = list(memory[:, 1]) 9 | 10 | states = torch.Tensor(states) 11 | actions = torch.Tensor(actions) 12 | 13 | criterion = torch.nn.BCELoss() 14 | 15 | for _ in range(args.discrim_update_num): 16 | learner = discrim(torch.cat([states, actions], dim=1)) 17 | expert = discrim(torch.Tensor(demonstrations)) 18 | 19 | discrim_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \ 20 | criterion(expert, torch.zeros((demonstrations.shape[0], 1))) 21 | 22 | discrim_optim.zero_grad() 23 | discrim_loss.backward() 24 | discrim_optim.step() 25 | 26 | 27 | def train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args): 28 | memory = np.array(memory) 29 | states = np.vstack(memory[:, 0]) 30 | actions = list(memory[:, 1]) 31 | rewards = list(memory[:, 2]) 32 | masks = list(memory[:, 3]) 33 | 34 | old_values = critic(torch.Tensor(states)) 35 | returns, advants = get_gae(rewards, masks, old_values, args) 36 | 37 | mu, std = actor(torch.Tensor(states)) 38 | old_policy = log_prob_density(torch.Tensor(actions), mu, std) 39 | 40 | criterion = torch.nn.MSELoss() 41 | n = len(states) 42 | arr = np.arange(n) 43 | 44 | for _ in range(args.actor_critic_update_num): 45 | np.random.shuffle(arr) 46 | 47 | for i in range(n // args.batch_size): 48 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)] 49 | batch_index = torch.LongTensor(batch_index) 50 | 51 | inputs = torch.Tensor(states)[batch_index] 52 | actions_samples = torch.Tensor(actions)[batch_index] 53 | returns_samples = returns.unsqueeze(1)[batch_index] 54 | advants_samples = advants.unsqueeze(1)[batch_index] 55 | oldvalue_samples = old_values[batch_index].detach() 56 | 57 | values = critic(inputs) 58 | clipped_values = oldvalue_samples + \ 59 | torch.clamp(values - oldvalue_samples, 60 | -args.clip_param, 61 | args.clip_param) 62 | critic_loss1 = criterion(clipped_values, returns_samples) 63 | critic_loss2 = criterion(values, returns_samples) 64 | critic_loss = torch.max(critic_loss1, critic_loss2).mean() 65 | 66 | loss, ratio, entropy = surrogate_loss(actor, advants_samples, inputs, 67 | old_policy.detach(), actions_samples, 68 | batch_index) 69 | clipped_ratio = torch.clamp(ratio, 70 | 1.0 - args.clip_param, 71 | 1.0 + args.clip_param) 72 | clipped_loss = clipped_ratio * advants_samples 73 | actor_loss = -torch.min(loss, clipped_loss).mean() 74 | 75 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy 76 | 77 | critic_optim.zero_grad() 78 | loss.backward(retain_graph=True) 79 | critic_optim.step() 80 | 81 | 
actor_optim.zero_grad() 82 | loss.backward() 83 | actor_optim.step() 84 | 85 | def get_gae(rewards, masks, values, args): 86 | rewards = torch.Tensor(rewards) 87 | masks = torch.Tensor(masks) 88 | returns = torch.zeros_like(rewards) 89 | advants = torch.zeros_like(rewards) 90 | 91 | running_returns = 0 92 | previous_value = 0 93 | running_advants = 0 94 | 95 | for t in reversed(range(0, len(rewards))): 96 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t]) 97 | returns[t] = running_returns 98 | 99 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \ 100 | values.data[t] 101 | previous_value = values.data[t] 102 | 103 | running_advants = running_delta + (args.gamma * args.lamda * \ 104 | running_advants * masks[t]) 105 | advants[t] = running_advants 106 | 107 | advants = (advants - advants.mean()) / advants.std() 108 | return returns, advants 109 | 110 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index): 111 | mu, std = actor(states) 112 | new_policy = log_prob_density(actions, mu, std) 113 | old_policy = old_policy[batch_index] 114 | 115 | ratio = torch.exp(new_policy - old_policy) 116 | surrogate_loss = ratio * advants 117 | entropy = get_entropy(mu, std) 118 | 119 | return surrogate_loss, ratio, entropy -------------------------------------------------------------------------------- /mujoco/gail/utils/__pycache__/running_state.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/running_state.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/utils/__pycache__/zfilter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/zfilter.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/gail/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | action = torch.normal(mu, std) 7 | action = action.data.numpy() 8 | return action 9 | 10 | def get_entropy(mu, std): 11 | dist = Normal(mu, std) 12 | entropy = dist.entropy().mean() 13 | return entropy 14 | 15 | def log_prob_density(x, mu, std): 16 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \ 17 | - 0.5 * math.log(2 * math.pi) 18 | return log_prob_density.sum(1, keepdim=True) 19 | 20 | def get_reward(discrim, state, action): 21 | state = torch.Tensor(state) 22 | action = torch.Tensor(action) 23 | state_action = torch.cat([state, action]) 24 | with torch.no_grad(): 25 | return -math.log(discrim(state_action)[0].item()) 26 | 27 | def save_checkpoint(state, filename): 28 | torch.save(state, filename) 
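# --- Illustrative notes (not from the original module) ----------------------
# * train_discrim() in train_model.py labels policy samples with 1 and expert
#   samples with 0, so a small D(s, a) means "looks like the expert".
#   get_reward() returns -log D(s, a): the more expert-like the agent's
#   state-action pair, the larger the surrogate reward fed to the policy
#   update. A hedged usage sketch during a rollout (discrim, state, action,
#   mask and memory are assumed to exist elsewhere):
#       irl_reward = get_reward(discrim, state, action)
#       memory.append([state, action, irl_reward, mask])
# * log_prob_density() omits the -log(std) term of the Gaussian log-density;
#   because the Actor fixes logstd = 0 (std = 1), the omitted term is zero and
#   the returned value is the exact log-density used for the importance ratio.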
-------------------------------------------------------------------------------- /mujoco/gail/utils/zfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # from https://github.com/joschu/modular_rl 4 | # http://www.johndcook.com/blog/standard_deviation/ 5 | 6 | class RunningStat(object): 7 | def __init__(self, shape): 8 | self._n = 0 9 | self._M = np.zeros(shape) 10 | self._S = np.zeros(shape) 11 | 12 | def push(self, x): 13 | x = np.asarray(x) 14 | assert x.shape == self._M.shape 15 | self._n += 1 16 | if self._n == 1: 17 | self._M[...] = x 18 | else: 19 | oldM = self._M.copy() 20 | self._M[...] = oldM + (x - oldM) / self._n 21 | self._S[...] = self._S + (x - oldM) * (x - self._M) 22 | 23 | @property 24 | def n(self): 25 | return self._n 26 | 27 | @n.setter 28 | def n(self, n): 29 | self._n = n 30 | 31 | @property 32 | def mean(self): 33 | return self._M 34 | 35 | @mean.setter 36 | def mean(self, M): 37 | self._M = M 38 | 39 | @property 40 | def sum_square(self): 41 | return self._S 42 | 43 | @sum_square.setter 44 | def sum_square(self, S): 45 | self._S = S 46 | 47 | @property 48 | def var(self): 49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 50 | 51 | @property 52 | def std(self): 53 | return np.sqrt(self.var) 54 | 55 | @property 56 | def shape(self): 57 | return self._M.shape 58 | 59 | 60 | class ZFilter: 61 | """ 62 | y = (x-mean)/std 63 | using running estimates of mean,std 64 | """ 65 | 66 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 67 | self.demean = demean 68 | self.destd = destd 69 | self.clip = clip 70 | 71 | self.rs = RunningStat(shape) 72 | 73 | def __call__(self, x, update=True): 74 | if update: self.rs.push(x) 75 | 76 | if self.demean: 77 | x = x - self.rs.mean 78 | 79 | if self.destd: 80 | x = x / (self.rs.std + 1e-8) 81 | 82 | if self.clip: 83 | x = np.clip(x, -self.clip, self.clip) 84 | 85 | return x 86 | -------------------------------------------------------------------------------- /mujoco/ppo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/.DS_Store -------------------------------------------------------------------------------- /mujoco/ppo/__pycache__/hparams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/hparams.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/__pycache__/ppo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/ppo.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/__pycache__/train_model.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/train_model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/logs/.DS_Store -------------------------------------------------------------------------------- /mujoco/ppo/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, num_inputs, num_outputs, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs) 10 | 11 | self.fc3.weight.data.mul_(0.1) 12 | self.fc3.bias.data.mul_(0.0) 13 | 14 | def forward(self, x): 15 | x = torch.tanh(self.fc1(x)) 16 | x = torch.tanh(self.fc2(x)) 17 | mu = self.fc3(x) 18 | logstd = torch.zeros_like(mu) 19 | std = torch.exp(logstd) 20 | return mu, std 21 | 22 | 23 | class Critic(nn.Module): 24 | def __init__(self, num_inputs, args): 25 | super(Critic, self).__init__() 26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 28 | self.fc3 = nn.Linear(args.hidden_size, 1) 29 | 30 | self.fc3.weight.data.mul_(0.1) 31 | self.fc3.bias.data.mul_(0.0) 32 | 33 | def forward(self, x): 34 | x = torch.tanh(self.fc1(x)) 35 | x = torch.tanh(self.fc2(x)) 36 | v = self.fc3(x) 37 | return v -------------------------------------------------------------------------------- /mujoco/ppo/ppo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utils.utils import log_prob_density 4 | 5 | def train_model(actor, critic, memory, actor_optim, critic_optim, args): 6 | memory = np.array(memory) 7 | states = np.vstack(memory[:, 0]) 8 | actions = list(memory[:, 1]) 9 | rewards = list(memory[:, 2]) 10 | masks = list(memory[:, 3]) 11 | 12 | old_values = critic(torch.Tensor(states)) 13 | returns, advants = get_gae(rewards, masks, old_values, args) 14 | 15 | mu, std = actor(torch.Tensor(states)) 16 | old_policy = log_prob_density(torch.Tensor(actions), mu, std) 17 | 18 | criterion = torch.nn.MSELoss() 19 | n = len(states) 20 | arr = np.arange(n) 21 | 22 | for _ in range(args.model_update_num): 23 | np.random.shuffle(arr) 24 | 25 | for i in range(n // args.batch_size): 26 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)] 27 | batch_index = torch.LongTensor(batch_index) 28 | 29 | inputs = torch.Tensor(states)[batch_index] 30 | actions_samples = torch.Tensor(actions)[batch_index] 31 | returns_samples = returns.unsqueeze(1)[batch_index] 32 | advants_samples = advants.unsqueeze(1)[batch_index] 33 | oldvalue_samples = old_values[batch_index].detach() 34 | 35 | values = critic(inputs) 36 | clipped_values = oldvalue_samples + \ 37 | torch.clamp(values - oldvalue_samples, 38 | -args.clip_param, 39 | args.clip_param) 40 | critic_loss1 = criterion(clipped_values, returns_samples) 41 | critic_loss2 = criterion(values, returns_samples) 42 | critic_loss = 
torch.max(critic_loss1, critic_loss2).mean() 43 | 44 | loss, ratio = surrogate_loss(actor, advants_samples, inputs, 45 | old_policy.detach(), actions_samples, 46 | batch_index) 47 | clipped_ratio = torch.clamp(ratio, 48 | 1.0 - args.clip_param, 49 | 1.0 + args.clip_param) 50 | clipped_loss = clipped_ratio * advants_samples 51 | actor_loss = -torch.min(loss, clipped_loss).mean() 52 | 53 | loss = actor_loss + 0.5 * critic_loss 54 | 55 | critic_optim.zero_grad() 56 | loss.backward(retain_graph=True) 57 | critic_optim.step() 58 | 59 | actor_optim.zero_grad() 60 | loss.backward() 61 | actor_optim.step() 62 | 63 | def get_gae(rewards, masks, values, args): 64 | rewards = torch.Tensor(rewards) 65 | masks = torch.Tensor(masks) 66 | returns = torch.zeros_like(rewards) 67 | advants = torch.zeros_like(rewards) 68 | 69 | running_returns = 0 70 | previous_value = 0 71 | running_advants = 0 72 | 73 | for t in reversed(range(0, len(rewards))): 74 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t]) 75 | returns[t] = running_returns 76 | 77 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \ 78 | values.data[t] 79 | previous_value = values.data[t] 80 | 81 | running_advants = running_delta + (args.gamma * args.lamda * \ 82 | running_advants * masks[t]) 83 | advants[t] = running_advants 84 | 85 | advants = (advants - advants.mean()) / advants.std() 86 | return returns, advants 87 | 88 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index): 89 | mu, std = actor(states) 90 | new_policy = log_prob_density(actions, mu, std) 91 | old_policy = old_policy[batch_index] 92 | 93 | ratio = torch.exp(new_policy - old_policy) 94 | surrogate_loss = ratio * advants 95 | 96 | return surrogate_loss, ratio -------------------------------------------------------------------------------- /mujoco/ppo/save_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/save_model/.DS_Store -------------------------------------------------------------------------------- /mujoco/ppo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import argparse 5 | 6 | from model import Actor, Critic 7 | from utils.utils import get_action 8 | from utils.running_state import ZFilter 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', type=str, default="Hopper-v2", 12 | help='name of Mujoco environement') 13 | parser.add_argument('--iter', type=int, default=5, 14 | help='number of episodes to play') 15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar', 16 | help="if you test pretrained file, write filename in save_model folder") 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | if __name__ == "__main__": 22 | env = gym.make(args.env) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.shape[0] 28 | 29 | print("state size: ", num_inputs) 30 | print("action size: ", num_actions) 31 | 32 | actor = Actor(num_inputs, num_actions) 33 | critic = Critic(num_inputs) 34 | 35 | running_state = ZFilter((num_inputs,), clip=5) 36 | 37 | if args.load_model is not None: 38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 39 | 40 | pretrained_model = 
torch.load(pretrained_model_path) 41 | 42 | actor.load_state_dict(pretrained_model['actor']) 43 | critic.load_state_dict(pretrained_model['critic']) 44 | 45 | running_state.rs.n = pretrained_model['z_filter_n'] 46 | running_state.rs.mean = pretrained_model['z_filter_m'] 47 | running_state.rs.sum_square = pretrained_model['z_filter_s'] 48 | 49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n)) 50 | 51 | else: 52 | assert("Should write pretrained filename in save_model folder. ex) python3 test_algo.py --load_model ppo_max.tar") 53 | 54 | 55 | actor.eval(), critic.eval() 56 | for episode in range(args.iter): 57 | state = env.reset() 58 | steps = 0 59 | score = 0 60 | for _ in range(10000): 61 | env.render() 62 | mu, std, _ = actor(torch.Tensor(state).unsqueeze(0)) 63 | action = get_action(mu, std)[0] 64 | 65 | next_state, reward, done, _ = env.step(action) 66 | next_state = running_state(next_state) 67 | 68 | state = next_state 69 | score += reward 70 | 71 | if done: 72 | print("{} cumulative reward: {}".format(episode, score)) 73 | break 74 | -------------------------------------------------------------------------------- /mujoco/ppo/utils/__pycache__/running_state.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/running_state.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/utils/__pycache__/zfilter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/zfilter.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/ppo/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | def get_action(mu, std): 5 | action = torch.normal(mu, std) 6 | action = action.data.numpy() 7 | return action 8 | 9 | def log_prob_density(x, mu, std): 10 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \ 11 | - 0.5 * math.log(2 * math.pi) 12 | return log_prob_density.sum(1, keepdim=True) 13 | 14 | def save_checkpoint(state, filename): 15 | torch.save(state, filename) -------------------------------------------------------------------------------- /mujoco/ppo/utils/zfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # from https://github.com/joschu/modular_rl 4 | # http://www.johndcook.com/blog/standard_deviation/ 5 | 6 | class RunningStat(object): 7 | def __init__(self, shape): 8 | self._n = 0 9 | self._M = np.zeros(shape) 10 | self._S = np.zeros(shape) 11 | 12 | def push(self, x): 13 | x = np.asarray(x) 14 | assert x.shape == self._M.shape 15 | self._n += 1 16 | if self._n == 1: 17 | self._M[...] = x 18 | else: 19 | oldM = self._M.copy() 20 | self._M[...] 
= oldM + (x - oldM) / self._n 21 | self._S[...] = self._S + (x - oldM) * (x - self._M) 22 | 23 | @property 24 | def n(self): 25 | return self._n 26 | 27 | @n.setter 28 | def n(self, n): 29 | self._n = n 30 | 31 | @property 32 | def mean(self): 33 | return self._M 34 | 35 | @mean.setter 36 | def mean(self, M): 37 | self._M = M 38 | 39 | @property 40 | def sum_square(self): 41 | return self._S 42 | 43 | @sum_square.setter 44 | def sum_square(self, S): 45 | self._S = S 46 | 47 | @property 48 | def var(self): 49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 50 | 51 | @property 52 | def std(self): 53 | return np.sqrt(self.var) 54 | 55 | @property 56 | def shape(self): 57 | return self._M.shape 58 | 59 | 60 | class ZFilter: 61 | """ 62 | y = (x-mean)/std 63 | using running estimates of mean,std 64 | """ 65 | 66 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 67 | self.demean = demean 68 | self.destd = destd 69 | self.clip = clip 70 | 71 | self.rs = RunningStat(shape) 72 | 73 | def __call__(self, x, update=True): 74 | if update: self.rs.push(x) 75 | 76 | if self.demean: 77 | x = x - self.rs.mean 78 | 79 | if self.destd: 80 | x = x / (self.rs.std + 1e-8) 81 | 82 | if self.clip: 83 | x = np.clip(x, -self.clip, self.clip) 84 | 85 | return x 86 | -------------------------------------------------------------------------------- /mujoco/tnpg/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/.DS_Store -------------------------------------------------------------------------------- /mujoco/tnpg/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/tnpg/__pycache__/tnpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/tnpg.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/tnpg/__pycache__/trpo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/trpo.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/tnpg/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | self.fc3.weight.data.mul_(0.1) 11 | self.fc3.bias.data.mul_(0.0) 12 | 13 | def forward(self, x): 14 | x = torch.tanh(self.fc1(x)) 15 | x = torch.tanh(self.fc2(x)) 16 | mu = self.fc3(x) 17 | logstd = torch.zeros_like(mu) 18 | std = torch.exp(logstd) 19 | return mu, std 20 | 
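# --- Illustrative note (not from the original module) -----------------------
# The policy is a Gaussian whose mean is produced by the network and whose
# standard deviation is fixed at exp(0) = 1 (logstd is a tensor of zeros), so
# only the mean is learned. A hedged usage sketch, assuming Hopper-v2 sizes
# and an args namespace that provides hidden_size:
#     actor = Actor(state_size=11, action_size=3, args=args)
#     mu, std = actor(torch.Tensor(state).unsqueeze(0))
#     action = torch.normal(mu, std)  # what utils.get_action(mu, std) samples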
-------------------------------------------------------------------------------- /mujoco/tnpg/save_model/24model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/24model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/save_model/40model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/40model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/save_model/67model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/67model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/save_model/76model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/76model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/save_model/79model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/79model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/save_model/86model.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/86model.pth -------------------------------------------------------------------------------- /mujoco/tnpg/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import argparse 5 | 6 | from model import Actor, Critic 7 | from utils.utils import get_action 8 | from utils.running_state import ZFilter 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', type=str, default="Hopper-v2", 12 | help='name of Mujoco environement') 13 | parser.add_argument('--iter', type=int, default=5, 14 | help='number of episodes to play') 15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar', 16 | help="if you test pretrained file, write filename in save_model folder") 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | if __name__ == "__main__": 22 | env = gym.make(args.env) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.shape[0] 28 | 29 | print("state size: ", num_inputs) 30 | print("action size: ", num_actions) 31 | 32 | actor = Actor(num_inputs, num_actions) 33 | critic = Critic(num_inputs) 34 | 35 | running_state = ZFilter((num_inputs,), clip=5) 36 | 37 | if args.load_model is not None: 38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 39 | 40 | pretrained_model = torch.load(pretrained_model_path) 41 | 42 
| actor.load_state_dict(pretrained_model['actor']) 43 | critic.load_state_dict(pretrained_model['critic']) 44 | 45 | running_state.rs.n = pretrained_model['z_filter_n'] 46 | running_state.rs.mean = pretrained_model['z_filter_m'] 47 | running_state.rs.sum_square = pretrained_model['z_filter_s'] 48 | 49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n)) 50 | 51 | else: 52 | assert("Should write pretrained filename in save_model folder. ex) python3 test_algo.py --load_model ppo_max.tar") 53 | 54 | 55 | actor.eval(), critic.eval() 56 | for episode in range(args.iter): 57 | state = env.reset() 58 | steps = 0 59 | score = 0 60 | for _ in range(10000): 61 | env.render() 62 | mu, std, _ = actor(torch.Tensor(state).unsqueeze(0)) 63 | action = get_action(mu, std)[0] 64 | 65 | next_state, reward, done, _ = env.step(action) 66 | next_state = running_state(next_state) 67 | 68 | state = next_state 69 | score += reward 70 | 71 | if done: 72 | print("{} cumulative reward: {}".format(episode, score)) 73 | break 74 | -------------------------------------------------------------------------------- /mujoco/tnpg/tnpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils.utils import * 3 | 4 | def get_returns(rewards, masks, gamma): 5 | rewards = torch.Tensor(rewards) 6 | masks = torch.Tensor(masks) 7 | returns = torch.zeros_like(rewards) 8 | 9 | running_returns = 0 10 | 11 | for t in reversed(range(0, len(rewards))): 12 | running_returns = rewards[t] + gamma * running_returns * masks[t] 13 | returns[t] = running_returns 14 | 15 | returns = (returns - returns.mean()) / returns.std() 16 | return returns 17 | 18 | def get_loss(actor, returns, states, actions): 19 | mu, std = actor(torch.Tensor(states)) 20 | log_policy = log_prob_density(torch.Tensor(actions), mu, std) 21 | returns = returns.unsqueeze(1) 22 | 23 | loss = log_policy * returns 24 | loss = loss.mean() 25 | return loss 26 | 27 | # from openai baseline code 28 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py 29 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10): 30 | x = torch.zeros(b.size()) 31 | r = b.clone() 32 | p = b.clone() 33 | rdotr = torch.dot(r, r) 34 | 35 | for i in range(nsteps): # nsteps=10 36 | f_Ax = hessian_vector_product(actor, states, p, cg_damping=1e-1) 37 | alpha = rdotr / torch.dot(p, f_Ax) 38 | x += alpha * p 39 | r -= alpha * f_Ax 40 | new_rdotr = torch.dot(r, r) 41 | beta = new_rdotr / rdotr 42 | p = r + beta * p 43 | 44 | rdotr = new_rdotr 45 | if rdotr < residual_tol: # residual_tol = 0.0000000001 46 | break 47 | return x 48 | 49 | def train_model(actor, memory, args): 50 | memory = np.array(memory) 51 | states = np.vstack(memory[:, 0]) 52 | actions = list(memory[:, 1]) 53 | rewards = list(memory[:, 2]) 54 | masks = list(memory[:, 3]) 55 | 56 | # ---------------------------- 57 | # step 1: get returns 58 | returns = get_returns(rewards, masks, args.gamma) 59 | 60 | # ---------------------------- 61 | # step 2: get gradient of loss and hessian of kl 62 | loss = get_loss(actor, returns, states, actions) 63 | loss_grad = torch.autograd.grad(loss, actor.parameters()) 64 | loss_grad = flat_grad(loss_grad) 65 | 66 | step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10) 67 | 68 | # ---------------------------- 69 | # step 3: get step direction and step size and update actor 70 | params = flat_params(actor) 71 | new_params = params + 0.5 * step_dir 72 | update_model(actor, new_params) 
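# --- Illustrative addition (not from the original module) -------------------
# train_model() above takes a truncated natural policy gradient step:
#   1. g = gradient of the surrogate E[log pi(a|s) * return]
#   2. x ~= F^-1 g via conjugate gradient, where the Fisher-vector products
#      F p are supplied implicitly by hessian_vector_product()
#   3. params <- params + 0.5 * x (fixed step size, no TRPO line search)
# The sketch below runs the same conjugate-gradient iteration on an explicit
# symmetric positive-definite matrix, with A @ p standing in for the
# Fisher-vector product. It is an illustration only.
import torch

def cg_demo(A, b, nsteps=10, residual_tol=1e-10):
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = A @ p  # stands in for hessian_vector_product(actor, states, p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x  # x ~= A^-1 b

# Example: cg_demo(torch.tensor([[4., 1.], [1., 3.]]), torch.tensor([1., 2.]))
# returns approximately [0.0909, 0.6364], i.e. A^-1 b.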
-------------------------------------------------------------------------------- /mujoco/tnpg/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | from collections import deque 6 | 7 | import torch 8 | import torch.optim as optim 9 | from tensorboardX import SummaryWriter 10 | 11 | from model import Actor 12 | from tnpg import train_model 13 | from utils.utils import get_action 14 | from utils.running_state import ZFilter 15 | 16 | parser = argparse.ArgumentParser(description='PyTorch NPG') 17 | parser.add_argument('--env_name', type=str, default="Hopper-v2") 18 | parser.add_argument('--load_model', type=str, default=None) 19 | parser.add_argument('--save_path', default='./save_model/', help='') 20 | parser.add_argument('--render', action="store_true", default=False) 21 | parser.add_argument('--gamma', type=float, default=0.99) 22 | parser.add_argument('--hidden_size', type=int, default=64) 23 | parser.add_argument('--learning_rate', type=float, default=3e-4) 24 | parser.add_argument('--logdir', type=str, default='logs', 25 | help='tensorboardx logs directory') 26 | args = parser.parse_args() 27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | 29 | if __name__=="__main__": 30 | env = gym.make(args.env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | state_size = env.observation_space.shape[0] 35 | action_size = env.action_space.shape[0] 36 | print('state size:', state_size) 37 | print('action size:', action_size) 38 | 39 | actor = Actor(state_size, action_size, args) 40 | # writer = SummaryWriter(args.logdir) 41 | 42 | if not os.path.isdir(args.save_path): 43 | os.makedirs(args.save_path) 44 | 45 | running_state = ZFilter((state_size,), clip=5) 46 | episodes = 0 47 | 48 | for iter in range(2000): 49 | memory = deque() 50 | scores = [] 51 | steps = 0 52 | 53 | while steps < 2048: 54 | score = 0 55 | episodes += 1 56 | 57 | state = env.reset() 58 | state = running_state(state) 59 | 60 | for _ in range(10000): 61 | if args.render: 62 | env.render() 63 | 64 | steps += 1 65 | 66 | mu, std = actor(torch.Tensor(state).unsqueeze(0)) 67 | action = get_action(mu, std)[0] 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | if done: 71 | mask = 0 72 | else: 73 | mask = 1 74 | 75 | memory.append([state, action, reward, mask]) 76 | 77 | next_state = running_state(next_state) 78 | state = next_state 79 | score += reward 80 | 81 | if done: 82 | break 83 | 84 | scores.append(score) 85 | 86 | score_avg = np.mean(scores) 87 | print('{} episode score is {:.2f}'.format(episodes, score_avg)) 88 | # writer.add_scalar('log/score', float(score_avg), iter) 89 | 90 | actor.train() 91 | train_model(actor, memory, args) 92 | 93 | if iter % 100: 94 | ckpt_path = args.save_path + str(score_avg) + 'model.pth' 95 | torch.save(actor.state_dict(), ckpt_path) -------------------------------------------------------------------------------- /mujoco/tnpg/utils/__pycache__/running_state.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/utils/__pycache__/running_state.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/tnpg/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/tnpg/utils/running_state.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | 5 | 6 | # from https://github.com/joschu/modular_rl 7 | # http://www.johndcook.com/blog/standard_deviation/ 8 | class RunningStat(object): 9 | def __init__(self, shape): # shape = (11,) 10 | self._n = 0 11 | self._M = np.zeros(shape) 12 | self._S = np.zeros(shape) 13 | 14 | def push(self, x): 15 | x = np.asarray(x) 16 | assert x.shape == self._M.shape 17 | self._n += 1 18 | if self._n == 1: # Only the first time 19 | self._M[...] = x 20 | else: # From the second time ~ 21 | oldM = self._M.copy() 22 | self._M[...] = oldM + (x - oldM) / self._n 23 | self._S[...] = self._S + (x - oldM) * (x - self._M) 24 | 25 | @property 26 | def n(self): 27 | return self._n 28 | 29 | @n.setter 30 | def n(self, n): 31 | self._n = n 32 | 33 | @property 34 | def mean(self): 35 | return self._M 36 | 37 | @mean.setter 38 | def mean(self, M): 39 | self._M = M 40 | 41 | @property 42 | def sum_square(self): 43 | return self._S 44 | 45 | @sum_square.setter 46 | def sum_square(self, S): 47 | self._S = S 48 | 49 | @property 50 | def var(self): 51 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 52 | 53 | @property 54 | def std(self): 55 | return np.sqrt(self.var) 56 | 57 | @property 58 | def shape(self): 59 | return self._M.shape 60 | 61 | 62 | class ZFilter: 63 | """ 64 | y = (x-mean)/std 65 | using running estimates of mean,std 66 | """ 67 | 68 | def __init__(self, shape, demean=True, destd=True, clip=10.0): # shape = (11,), clip = 5 69 | self.demean = demean 70 | self.destd = destd 71 | self.clip = clip 72 | 73 | self.rs = RunningStat(shape) 74 | 75 | def __call__(self, x, update=True): 76 | if update: self.rs.push(x) 77 | 78 | if self.demean: 79 | x = x - self.rs.mean 80 | 81 | if self.destd: 82 | x = x / (self.rs.std + 1e-8) 83 | 84 | if self.clip: 85 | x = np.clip(x, -self.clip, self.clip) 86 | 87 | return x 88 | -------------------------------------------------------------------------------- /mujoco/tnpg/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | def get_action(mu, std): 5 | action = torch.normal(mu, std) 6 | action = action.data.numpy() 7 | return action 8 | 9 | # logarithm의 property을 이용하여 ratio를 만들 때 사용하기 위한 10 | # normal distribution의 probability density 11 | def log_prob_density(x, mu, std): 12 | log_density = -(x - mu).pow(2) / (2 * std.pow(2)) \ 13 | - 0.5 * math.log(2 * math.pi) 14 | return log_density.sum(1, keepdim=True) 15 | 16 | 17 | def hessian_vector_product(actor, states, p, cg_damping): 18 | p.detach() 19 | kl = kl_divergence(old_actor=actor, new_actor=actor, states=states) 20 | kl = kl.mean() 21 | 22 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True) 23 | kl_grad = flat_grad(kl_grad) 24 | 25 | kl_grad_p = (kl_grad * p).sum() 26 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters()) 27 | kl_hessian = flat_hessian(kl_hessian) 28 | 29 | return kl_hessian + p * cg_damping # cg_damping = 0.1 30 | 31 | def kl_divergence(old_actor, new_actor, states): 32 | mu, std = new_actor(torch.Tensor(states)) 33 | mu_old, std_old = 
old_actor(torch.Tensor(states)) 34 | mu_old = mu_old.detach() 35 | std_old = std_old.detach() 36 | 37 | # kl divergence between old policy and new policy : D( pi_old || pi_new ) 38 | # pi_old -> mu_old, std_old / pi_new -> mu, std 39 | # be careful of calculating KL-divergence. It is not symmetric metric. 40 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5 41 | return kl.sum(1, keepdim=True) 42 | 43 | def flat_grad(grads): 44 | grad_flatten = [] 45 | for grad in grads: 46 | grad_flatten.append(grad.view(-1)) 47 | grad_flatten = torch.cat(grad_flatten) 48 | return grad_flatten 49 | 50 | def flat_hessian(hessians): 51 | hessians_flatten = [] 52 | for hessian in hessians: 53 | hessians_flatten.append(hessian.contiguous().view(-1)) 54 | hessians_flatten = torch.cat(hessians_flatten).data 55 | return hessians_flatten 56 | 57 | 58 | def flat_params(model): 59 | params = [] 60 | for param in model.parameters(): 61 | params.append(param.data.view(-1)) 62 | params_flatten = torch.cat(params) 63 | return params_flatten 64 | 65 | def update_model(model, new_params): 66 | index = 0 67 | for params in model.parameters(): 68 | params_length = len(params.view(-1)) 69 | new_param = new_params[index: index + params_length] 70 | new_param = new_param.view(params.size()) 71 | params.data.copy_(new_param) 72 | index += params_length -------------------------------------------------------------------------------- /mujoco/trpo/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/trpo/__pycache__/trpo.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/__pycache__/trpo.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/trpo/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | self.fc3.weight.data.mul_(0.1) 11 | self.fc3.bias.data.mul_(0.0) 12 | 13 | def forward(self, x): 14 | x = torch.tanh(self.fc1(x)) 15 | x = torch.tanh(self.fc2(x)) 16 | mu = self.fc3(x) 17 | logstd = torch.zeros_like(mu) 18 | std = torch.exp(logstd) 19 | return mu, std -------------------------------------------------------------------------------- /mujoco/trpo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import argparse 5 | 6 | from model import Actor, Critic 7 | from utils.utils import get_action 8 | from utils.running_state import ZFilter 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', type=str, default="Hopper-v2", 12 | help='name of Mujoco environement') 13 | parser.add_argument('--iter', type=int, default=5, 14 | help='number of episodes to play') 15 | 
parser.add_argument("--load_model", type=str, default='ppo_max.tar', 16 | help="if you test pretrained file, write filename in save_model folder") 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | if __name__ == "__main__": 22 | env = gym.make(args.env) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.shape[0] 28 | 29 | print("state size: ", num_inputs) 30 | print("action size: ", num_actions) 31 | 32 | actor = Actor(num_inputs, num_actions) 33 | critic = Critic(num_inputs) 34 | 35 | running_state = ZFilter((num_inputs,), clip=5) 36 | 37 | if args.load_model is not None: 38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 39 | 40 | pretrained_model = torch.load(pretrained_model_path) 41 | 42 | actor.load_state_dict(pretrained_model['actor']) 43 | critic.load_state_dict(pretrained_model['critic']) 44 | 45 | running_state.rs.n = pretrained_model['z_filter_n'] 46 | running_state.rs.mean = pretrained_model['z_filter_m'] 47 | running_state.rs.sum_square = pretrained_model['z_filter_s'] 48 | 49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n)) 50 | 51 | else: 52 | assert("Should write pretrained filename in save_model folder. ex) python3 test_algo.py --load_model ppo_max.tar") 53 | 54 | 55 | actor.eval(), critic.eval() 56 | for episode in range(args.iter): 57 | state = env.reset() 58 | steps = 0 59 | score = 0 60 | for _ in range(10000): 61 | env.render() 62 | mu, std, _ = actor(torch.Tensor(state).unsqueeze(0)) 63 | action = get_action(mu, std)[0] 64 | 65 | next_state, reward, done, _ = env.step(action) 66 | next_state = running_state(next_state) 67 | 68 | state = next_state 69 | score += reward 70 | 71 | if done: 72 | print("{} cumulative reward: {}".format(episode, score)) 73 | break 74 | -------------------------------------------------------------------------------- /mujoco/trpo/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | from collections import deque 6 | 7 | import torch 8 | import torch.optim as optim 9 | from tensorboardX import SummaryWriter 10 | 11 | from model import Actor 12 | from trpo import train_model 13 | from utils.utils import get_action 14 | from utils.running_state import ZFilter 15 | 16 | parser = argparse.ArgumentParser(description='PyTorch TRPO') 17 | parser.add_argument('--env_name', type=str, default="Hopper-v2") 18 | parser.add_argument('--load_model', type=str, default=None) 19 | parser.add_argument('--save_path', default='./save_model/', help='') 20 | parser.add_argument('--render', action="store_true", default=False) 21 | parser.add_argument('--gamma', type=float, default=0.99) 22 | parser.add_argument('--hidden_size', type=int, default=64) 23 | parser.add_argument('--max_kl', type=float, default=1e-2) 24 | parser.add_argument('--logdir', type=str, default='logs', 25 | help='tensorboardx logs directory') 26 | args = parser.parse_args() 27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | 29 | if __name__=="__main__": 30 | env = gym.make(args.env_name) 31 | env.seed(500) 32 | torch.manual_seed(500) 33 | 34 | state_size = env.observation_space.shape[0] 35 | action_size = env.action_space.shape[0] 36 | print('state size:', state_size) 37 | print('action size:', action_size) 38 | 39 | actor = Actor(state_size, action_size, args) 40 | # writer = SummaryWriter(args.logdir) 41 | 42 | 
if not os.path.isdir(args.save_path): 43 | os.makedirs(args.save_path) 44 | 45 | running_state = ZFilter((state_size,), clip=5) 46 | episodes = 0 47 | 48 | for iter in range(2000): 49 | memory = deque() 50 | scores = [] 51 | steps = 0 52 | 53 | while steps < 2048: 54 | score = 0 55 | episodes += 1 56 | 57 | state = env.reset() 58 | state = running_state(state) 59 | 60 | for _ in range(10000): 61 | if args.render: 62 | env.render() 63 | 64 | steps += 1 65 | 66 | mu, std = actor(torch.Tensor(state).unsqueeze(0)) 67 | action = get_action(mu, std)[0] 68 | next_state, reward, done, _ = env.step(action) 69 | 70 | if done: 71 | mask = 0 72 | else: 73 | mask = 1 74 | 75 | memory.append([state, action, reward, mask]) 76 | 77 | next_state = running_state(next_state) 78 | state = next_state 79 | score += reward 80 | 81 | if done: 82 | break 83 | 84 | scores.append(score) 85 | 86 | score_avg = np.mean(scores) 87 | print('{} episode score is {:.2f}'.format(episodes, score_avg)) 88 | # writer.add_scalar('log/score', float(score_avg), iter) 89 | 90 | actor.train() 91 | train_model(actor, memory, state_size, action_size, args) 92 | 93 | # if iter % 100: 94 | # ckpt_path = args.save_path + str(score_avg) + 'model.pth' 95 | # torch.save(actor.state_dict(), ckpt_path) -------------------------------------------------------------------------------- /mujoco/trpo/trpo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from model import Actor 3 | from utils.utils import * 4 | 5 | def train_model(actor, memory, state_size, action_size, args): 6 | memory = np.array(memory) 7 | states = np.vstack(memory[:, 0]) 8 | actions = list(memory[:, 1]) 9 | rewards = list(memory[:, 2]) 10 | masks = list(memory[:, 3]) 11 | 12 | # ---------------------------- 13 | # step 1: get returns 14 | returns = get_returns(rewards, masks, args.gamma) 15 | 16 | # ---------------------------- 17 | # step 2: get gradient of loss and hessian of kl and step direction 18 | mu, std = actor(torch.Tensor(states)) 19 | old_policy = log_prob_density(torch.Tensor(actions), mu, std) 20 | loss = surrogate_loss(actor, returns, states, old_policy.detach(), actions) 21 | 22 | loss_grad = torch.autograd.grad(loss, actor.parameters()) 23 | loss_grad = flat_grad(loss_grad) 24 | loss = loss.data.numpy() 25 | 26 | step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10) 27 | 28 | # ---------------------------- 29 | # step 3: get step-size alpha and maximal step 30 | sHs = 0.5 * (step_dir * hessian_vector_product(actor, states, step_dir) 31 | ).sum(0, keepdim=True) 32 | step_size = torch.sqrt(2 * args.max_kl / sHs)[0] 33 | maximal_step = step_size * step_dir 34 | 35 | # ---------------------------- 36 | # step 4: perform backtracking line search for n iteration 37 | old_actor = Actor(state_size, action_size, args) 38 | params = flat_params(actor) 39 | update_model(old_actor, params) 40 | 41 | # 구했던 maximal step만큼 parameter space에서 움직였을 때 예상되는 performance 변화 42 | expected_improve = (loss_grad * maximal_step).sum(0, keepdim=True) 43 | expected_improve = expected_improve.data.numpy() 44 | 45 | # Backtracking line search 46 | # see cvx 464p https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf 47 | # additionally, https://en.wikipedia.org/wiki/Backtracking_line_search 48 | flag = False 49 | alpha = 0.5 50 | beta = 0.5 51 | t = 1.0 52 | 53 | for i in range(10): 54 | new_params = params + t * maximal_step 55 | update_model(actor, new_params) 56 | 57 | new_loss = surrogate_loss(actor, returns, 
states, old_policy.detach(), actions) 58 | new_loss = new_loss.data.numpy() 59 | 60 | loss_improve = new_loss - loss 61 | expected_improve *= t 62 | improve_condition = loss_improve / expected_improve 63 | 64 | kl = kl_divergence(old_actor=old_actor, new_actor=actor, states=states) 65 | kl = kl.mean() 66 | 67 | print('kl: {:.4f} | loss_improve: {:.4f} | expected_improve: {:.4f} ' 68 | '| improve_condition: {:.4f} | number of line search: {}' 69 | .format(kl.data.numpy(), loss_improve, expected_improve[0], improve_condition, i)) 70 | 71 | # use the kl-divergence and expected_new_loss_grad to judge whether the update lies inside or outside the trust region 72 | # if inside the trust region, accept the step and exit the loop 73 | # max_kl = 0.01 74 | if kl < args.max_kl and improve_condition > alpha: 75 | flag = True 76 | break 77 | 78 | # if outside the trust region, halve the maximal_step (backtracking) and try again 79 | t *= beta 80 | 81 | if not flag: 82 | params = flat_params(old_actor) 83 | update_model(actor, params) 84 | print('policy update does not improve the surrogate') 85 | 86 | def get_returns(rewards, masks, gamma): 87 | rewards = torch.Tensor(rewards) 88 | masks = torch.Tensor(masks) 89 | returns = torch.zeros_like(rewards) 90 | 91 | running_returns = 0 92 | 93 | for t in reversed(range(0, len(rewards))): 94 | running_returns = rewards[t] + gamma * running_returns * masks[t] 95 | returns[t] = running_returns 96 | 97 | returns = (returns - returns.mean()) / returns.std() 98 | return returns 99 | 100 | def surrogate_loss(actor, returns, states, old_policy, actions): 101 | mu, std = actor(torch.Tensor(states)) 102 | new_policy = log_prob_density(torch.Tensor(actions), mu, std) 103 | returns = returns.unsqueeze(1) 104 | 105 | surrogate = torch.exp(new_policy - old_policy) * returns 106 | surrogate = surrogate.mean() 107 | return surrogate 108 | 109 | # from openai baseline code 110 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py 111 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10): 112 | x = torch.zeros(b.size()) 113 | r = b.clone() 114 | p = b.clone() 115 | rdotr = torch.dot(r, r) 116 | for i in range(nsteps): 117 | _Avp = hessian_vector_product(actor, states, p) 118 | alpha = rdotr / torch.dot(p, _Avp) 119 | x += alpha * p 120 | r -= alpha * _Avp 121 | new_rdotr = torch.dot(r, r) 122 | beta = new_rdotr / rdotr 123 | p = r + beta * p 124 | rdotr = new_rdotr 125 | if rdotr < residual_tol: 126 | break 127 | return x -------------------------------------------------------------------------------- /mujoco/trpo/utils/__pycache__/running_state.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/utils/__pycache__/running_state.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/trpo/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/trpo/utils/running_state.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import numpy as np 4 | 5 | 6 | # from https://github.com/joschu/modular_rl 7 |
http://www.johndcook.com/blog/standard_deviation/ 8 | class RunningStat(object): 9 | def __init__(self, shape): # shape = (11,) 10 | self._n = 0 11 | self._M = np.zeros(shape) 12 | self._S = np.zeros(shape) 13 | 14 | def push(self, x): 15 | x = np.asarray(x) 16 | assert x.shape == self._M.shape 17 | self._n += 1 18 | if self._n == 1: # Only the first time 19 | self._M[...] = x 20 | else: # From the second time ~ 21 | oldM = self._M.copy() 22 | self._M[...] = oldM + (x - oldM) / self._n 23 | self._S[...] = self._S + (x - oldM) * (x - self._M) 24 | 25 | @property 26 | def n(self): 27 | return self._n 28 | 29 | @n.setter 30 | def n(self, n): 31 | self._n = n 32 | 33 | @property 34 | def mean(self): 35 | return self._M 36 | 37 | @mean.setter 38 | def mean(self, M): 39 | self._M = M 40 | 41 | @property 42 | def sum_square(self): 43 | return self._S 44 | 45 | @sum_square.setter 46 | def sum_square(self, S): 47 | self._S = S 48 | 49 | @property 50 | def var(self): 51 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 52 | 53 | @property 54 | def std(self): 55 | return np.sqrt(self.var) 56 | 57 | @property 58 | def shape(self): 59 | return self._M.shape 60 | 61 | 62 | class ZFilter: 63 | """ 64 | y = (x-mean)/std 65 | using running estimates of mean,std 66 | """ 67 | 68 | def __init__(self, shape, demean=True, destd=True, clip=10.0): # shape = (11,), clip = 5 69 | self.demean = demean 70 | self.destd = destd 71 | self.clip = clip 72 | 73 | self.rs = RunningStat(shape) 74 | 75 | def __call__(self, x, update=True): 76 | if update: self.rs.push(x) 77 | 78 | if self.demean: 79 | x = x - self.rs.mean 80 | 81 | if self.destd: 82 | x = x / (self.rs.std + 1e-8) 83 | 84 | if self.clip: 85 | x = np.clip(x, -self.clip, self.clip) 86 | 87 | return x 88 | -------------------------------------------------------------------------------- /mujoco/trpo/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | 4 | def get_action(mu, std): 5 | action = torch.normal(mu, std) 6 | action = action.data.numpy() 7 | return action 8 | 9 | # logarithm의 property을 이용하여 ratio를 만들 때 사용하기 위한 10 | # normal distribution의 probability density 11 | def log_prob_density(x, mu, std): 12 | log_density = -(x - mu).pow(2) / (2 * std.pow(2)) \ 13 | - 0.5 * math.log(2 * math.pi) 14 | return log_density.sum(1, keepdim=True) 15 | 16 | 17 | def hessian_vector_product(actor, states, p): 18 | p.detach() 19 | kl = kl_divergence(old_actor=actor, new_actor=actor, states=states) 20 | kl = kl.mean() 21 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True) 22 | kl_grad = flat_grad(kl_grad) # check kl_grad == 0 23 | 24 | kl_grad_p = (kl_grad * p).sum() 25 | kl_hessian_p = torch.autograd.grad(kl_grad_p, actor.parameters()) 26 | kl_hessian_p = flat_hessian(kl_hessian_p) 27 | 28 | return kl_hessian_p + 0.1 * p 29 | 30 | def kl_divergence(old_actor, new_actor, states): 31 | mu, std = new_actor(torch.Tensor(states)) 32 | mu_old, std_old = old_actor(torch.Tensor(states)) 33 | mu_old = mu_old.detach() 34 | std_old = std_old.detach() 35 | 36 | # kl divergence between old policy and new policy : D( pi_old || pi_new ) 37 | # pi_old -> mu_old, std_old / pi_new -> mu, std 38 | # be careful of calculating KL-divergence. 
It is not symmetric metric 39 | kl = (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5 40 | return kl.sum(1, keepdim=True) 41 | 42 | 43 | def flat_grad(grads): 44 | grad_flatten = [] 45 | for grad in grads: 46 | grad_flatten.append(grad.view(-1)) 47 | grad_flatten = torch.cat(grad_flatten) 48 | return grad_flatten 49 | 50 | def flat_hessian(hessians): 51 | hessians_flatten = [] 52 | for hessian in hessians: 53 | hessians_flatten.append(hessian.contiguous().view(-1)) 54 | hessians_flatten = torch.cat(hessians_flatten).data 55 | return hessians_flatten 56 | 57 | 58 | def flat_params(model): 59 | params = [] 60 | for param in model.parameters(): 61 | params.append(param.data.view(-1)) 62 | params_flatten = torch.cat(params) 63 | return params_flatten 64 | 65 | def update_model(model, new_params): 66 | index = 0 67 | for params in model.parameters(): 68 | params_length = len(params.view(-1)) 69 | new_param = new_params[index: index + params_length] 70 | new_param = new_param.view(params.size()) 71 | params.data.copy_(new_param) 72 | index += params_length 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /mujoco/vail/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/.DS_Store -------------------------------------------------------------------------------- /mujoco/vail/__pycache__/hparams.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/hparams.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/vail/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/vail/__pycache__/train_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/train_model.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/vail/expert_demo/expert_demo.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/expert_demo/expert_demo.p -------------------------------------------------------------------------------- /mujoco/vail/logs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/logs/.DS_Store -------------------------------------------------------------------------------- /mujoco/vail/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, num_inputs, 
num_outputs, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs) 10 | 11 | self.fc3.weight.data.mul_(0.1) 12 | self.fc3.bias.data.mul_(0.0) 13 | 14 | def forward(self, x): 15 | x = torch.tanh(self.fc1(x)) 16 | x = torch.tanh(self.fc2(x)) 17 | mu = self.fc3(x) 18 | logstd = torch.zeros_like(mu) 19 | std = torch.exp(logstd) 20 | return mu, std 21 | 22 | 23 | class Critic(nn.Module): 24 | def __init__(self, num_inputs, args): 25 | super(Critic, self).__init__() 26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 28 | self.fc3 = nn.Linear(args.hidden_size, 1) 29 | 30 | self.fc3.weight.data.mul_(0.1) 31 | self.fc3.bias.data.mul_(0.0) 32 | 33 | def forward(self, x): 34 | x = torch.tanh(self.fc1(x)) 35 | x = torch.tanh(self.fc2(x)) 36 | v = self.fc3(x) 37 | return v 38 | 39 | 40 | class VDB(nn.Module): 41 | def __init__(self, num_inputs, args): 42 | super(VDB, self).__init__() 43 | self.fc1 = nn.Linear(num_inputs, args.hidden_size) 44 | self.fc2 = nn.Linear(args.hidden_size, args.z_size) 45 | self.fc3 = nn.Linear(args.hidden_size, args.z_size) 46 | self.fc4 = nn.Linear(args.z_size, args.hidden_size) 47 | self.fc5 = nn.Linear(args.hidden_size, 1) 48 | 49 | self.fc5.weight.data.mul_(0.1) 50 | self.fc5.bias.data.mul_(0.0) 51 | 52 | def encoder(self, x): 53 | h = torch.tanh(self.fc1(x)) 54 | return self.fc2(h), self.fc3(h) 55 | 56 | def reparameterize(self, mu, logvar): 57 | std = torch.exp(logvar/2) 58 | eps = torch.randn_like(std) 59 | return mu + std * eps 60 | 61 | def discriminator(self, z): 62 | h = torch.tanh(self.fc4(z)) 63 | return torch.sigmoid(self.fc5(h)) 64 | 65 | def forward(self, x): 66 | mu, logvar = self.encoder(x) 67 | z = self.reparameterize(mu, logvar) 68 | prob = self.discriminator(z) 69 | return prob, mu, logvar -------------------------------------------------------------------------------- /mujoco/vail/save_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/save_model/.DS_Store -------------------------------------------------------------------------------- /mujoco/vail/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import argparse 5 | 6 | from model import Actor, Critic 7 | from utils.utils import get_action 8 | from utils.running_state import ZFilter 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', type=str, default="Hopper-v2", 12 | help='name of Mujoco environement') 13 | parser.add_argument('--iter', type=int, default=5, 14 | help='number of episodes to play') 15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar', 16 | help="if you test pretrained file, write filename in save_model folder") 17 | 18 | args = parser.parse_args() 19 | 20 | 21 | if __name__ == "__main__": 22 | env = gym.make(args.env) 23 | env.seed(500) 24 | torch.manual_seed(500) 25 | 26 | num_inputs = env.observation_space.shape[0] 27 | num_actions = env.action_space.shape[0] 28 | 29 | print("state size: ", num_inputs) 30 | print("action size: ", num_actions) 31 | 32 | actor = Actor(num_inputs, num_actions) 33 | critic = Critic(num_inputs) 34 | 35 | running_state = 
ZFilter((num_inputs,), clip=5) 36 | 37 | if args.load_model is not None: 38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 39 | 40 | pretrained_model = torch.load(pretrained_model_path) 41 | 42 | actor.load_state_dict(pretrained_model['actor']) 43 | critic.load_state_dict(pretrained_model['critic']) 44 | 45 | running_state.rs.n = pretrained_model['z_filter_n'] 46 | running_state.rs.mean = pretrained_model['z_filter_m'] 47 | running_state.rs.sum_square = pretrained_model['z_filter_s'] 48 | 49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n)) 50 | 51 | else: 52 | assert("Should write pretrained filename in save_model folder. ex) python3 test_algo.py --load_model ppo_max.tar") 53 | 54 | 55 | actor.eval(), critic.eval() 56 | for episode in range(args.iter): 57 | state = env.reset() 58 | steps = 0 59 | score = 0 60 | for _ in range(10000): 61 | env.render() 62 | mu, std, _ = actor(torch.Tensor(state).unsqueeze(0)) 63 | action = get_action(mu, std)[0] 64 | 65 | next_state, reward, done, _ = env.step(action) 66 | next_state = running_state(next_state) 67 | 68 | state = next_state 69 | score += reward 70 | 71 | if done: 72 | print("{} cumulative reward: {}".format(episode, score)) 73 | break 74 | -------------------------------------------------------------------------------- /mujoco/vail/train_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from utils.utils import * 4 | 5 | def train_vdb(vdb, memory, vdb_optim, demonstrations, beta, args): 6 | memory = np.array(memory) 7 | states = np.vstack(memory[:, 0]) 8 | actions = list(memory[:, 1]) 9 | 10 | states = torch.Tensor(states) 11 | actions = torch.Tensor(actions) 12 | 13 | criterion = torch.nn.BCELoss() 14 | 15 | for _ in range(args.vdb_update_num): 16 | learner, l_mu, l_logvar = vdb(torch.cat([states, actions], dim=1)) 17 | expert, e_mu, e_logvar = vdb(torch.Tensor(demonstrations)) 18 | 19 | l_kld = kl_divergence(l_mu, l_logvar) 20 | l_kld = l_kld.mean() 21 | 22 | e_kld = kl_divergence(e_mu, e_logvar) 23 | e_kld = e_kld.mean() 24 | 25 | kld = 0.5 * (l_kld + e_kld) 26 | bottleneck_loss = kld - args.i_c 27 | 28 | beta = max(0, beta + args.alpha_beta * bottleneck_loss) 29 | 30 | vdb_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \ 31 | criterion(expert, torch.zeros((demonstrations.shape[0], 1))) + \ 32 | beta * bottleneck_loss 33 | 34 | vdb_optim.zero_grad() 35 | vdb_loss.backward(retain_graph=True) 36 | vdb_optim.step() 37 | 38 | 39 | def train_ppo(actor, critic, memory, actor_optim, critic_optim, args): 40 | memory = np.array(memory) 41 | states = np.vstack(memory[:, 0]) 42 | actions = list(memory[:, 1]) 43 | rewards = list(memory[:, 2]) 44 | masks = list(memory[:, 3]) 45 | 46 | old_values = critic(torch.Tensor(states)) 47 | returns, advants = get_gae(rewards, masks, old_values, args) 48 | 49 | mu, std = actor(torch.Tensor(states)) 50 | old_policy = log_prob_density(torch.Tensor(actions), mu, std) 51 | 52 | criterion = torch.nn.MSELoss() 53 | n = len(states) 54 | arr = np.arange(n) 55 | 56 | for _ in range(args.ppo_update_num): 57 | np.random.shuffle(arr) 58 | 59 | for i in range(n // args.batch_size): 60 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)] 61 | batch_index = torch.LongTensor(batch_index) 62 | 63 | inputs = torch.Tensor(states)[batch_index] 64 | actions_samples = torch.Tensor(actions)[batch_index] 65 | returns_samples = returns.unsqueeze(1)[batch_index] 66 | 
advants_samples = advants.unsqueeze(1)[batch_index] 67 | oldvalue_samples = old_values[batch_index].detach() 68 | 69 | values = critic(inputs) 70 | clipped_values = oldvalue_samples + \ 71 | torch.clamp(values - oldvalue_samples, 72 | -args.clip_param, 73 | args.clip_param) 74 | critic_loss1 = criterion(clipped_values, returns_samples) 75 | critic_loss2 = criterion(values, returns_samples) 76 | critic_loss = torch.max(critic_loss1, critic_loss2).mean() 77 | 78 | loss, ratio, entropy = surrogate_loss(actor, advants_samples, inputs, 79 | old_policy.detach(), actions_samples, 80 | batch_index) 81 | clipped_ratio = torch.clamp(ratio, 82 | 1.0 - args.clip_param, 83 | 1.0 + args.clip_param) 84 | clipped_loss = clipped_ratio * advants_samples 85 | actor_loss = -torch.min(loss, clipped_loss).mean() 86 | 87 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy 88 | 89 | critic_optim.zero_grad() 90 | loss.backward(retain_graph=True) 91 | critic_optim.step() 92 | 93 | actor_optim.zero_grad() 94 | loss.backward() 95 | actor_optim.step() 96 | 97 | def get_gae(rewards, masks, values, args): 98 | rewards = torch.Tensor(rewards) 99 | masks = torch.Tensor(masks) 100 | returns = torch.zeros_like(rewards) 101 | advants = torch.zeros_like(rewards) 102 | 103 | running_returns = 0 104 | previous_value = 0 105 | running_advants = 0 106 | 107 | for t in reversed(range(0, len(rewards))): 108 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t]) 109 | returns[t] = running_returns 110 | 111 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \ 112 | values.data[t] 113 | previous_value = values.data[t] 114 | 115 | running_advants = running_delta + (args.gamma * args.lamda * \ 116 | running_advants * masks[t]) 117 | advants[t] = running_advants 118 | 119 | advants = (advants - advants.mean()) / advants.std() 120 | return returns, advants 121 | 122 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index): 123 | mu, std = actor(states) 124 | new_policy = log_prob_density(actions, mu, std) 125 | old_policy = old_policy[batch_index] 126 | 127 | ratio = torch.exp(new_policy - old_policy) 128 | surrogate_loss = ratio * advants 129 | entropy = get_entropy(mu, std) 130 | 131 | return surrogate_loss, ratio, entropy -------------------------------------------------------------------------------- /mujoco/vail/utils/__pycache__/running_state.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/running_state.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/vail/utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /mujoco/vail/utils/__pycache__/zfilter.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/zfilter.cpython-36.pyc -------------------------------------------------------------------------------- 
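Note (added sketch, not part of the repository): in train_vdb() above, the multiplier beta is adapted by dual gradient ascent on the bottleneck constraint E[KL(q(z|x) || N(0, I))] <= args.i_c: beta grows while the average encoder KL exceeds the target and is clipped at zero from below. The snippet below isolates that update rule; the KL closed form matches kl_divergence() in mujoco/vail/utils/utils.py, while the i_c and alpha_beta values are illustrative stand-ins for the hyperparameters read from args.

import torch

def encoder_kl(mu, logvar):
    # KL( N(mu, diag(exp(logvar))) || N(0, I) ), summed over latent dimensions.
    return 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1)

def update_beta(beta, mean_kl, i_c=0.5, alpha_beta=1e-4):
    # Dual gradient ascent on E[KL] <= i_c, clipped at zero from below.
    return max(0.0, beta + alpha_beta * (mean_kl - i_c))

if __name__ == "__main__":
    torch.manual_seed(0)
    mu, logvar = torch.randn(8, 4), torch.randn(8, 4)
    mean_kl = encoder_kl(mu, logvar).mean().item()
    beta = 0.0
    for _ in range(3):
        beta = update_beta(beta, mean_kl)
    print(round(beta, 6))  # grows while the bottleneck constraint is violated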
/mujoco/vail/utils/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | action = torch.normal(mu, std) 7 | action = action.data.numpy() 8 | return action 9 | 10 | def get_entropy(mu, std): 11 | dist = Normal(mu, std) 12 | entropy = dist.entropy().mean() 13 | return entropy 14 | 15 | def log_prob_density(x, mu, std): 16 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \ 17 | - 0.5 * math.log(2 * math.pi) 18 | return log_prob_density.sum(1, keepdim=True) 19 | 20 | def get_reward(vdb, state, action): 21 | state = torch.Tensor(state) 22 | action = torch.Tensor(action) 23 | state_action = torch.cat([state, action]) 24 | with torch.no_grad(): 25 | return -math.log(vdb(state_action)[0].item()) 26 | 27 | def kl_divergence(mu, logvar): 28 | kl_div = 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1) 29 | return kl_div 30 | 31 | def save_checkpoint(state, filename): 32 | torch.save(state, filename) -------------------------------------------------------------------------------- /mujoco/vail/utils/zfilter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # from https://github.com/joschu/modular_rl 4 | # http://www.johndcook.com/blog/standard_deviation/ 5 | 6 | class RunningStat(object): 7 | def __init__(self, shape): 8 | self._n = 0 9 | self._M = np.zeros(shape) 10 | self._S = np.zeros(shape) 11 | 12 | def push(self, x): 13 | x = np.asarray(x) 14 | assert x.shape == self._M.shape 15 | self._n += 1 16 | if self._n == 1: 17 | self._M[...] = x 18 | else: 19 | oldM = self._M.copy() 20 | self._M[...] = oldM + (x - oldM) / self._n 21 | self._S[...] 
= self._S + (x - oldM) * (x - self._M) 22 | 23 | @property 24 | def n(self): 25 | return self._n 26 | 27 | @n.setter 28 | def n(self, n): 29 | self._n = n 30 | 31 | @property 32 | def mean(self): 33 | return self._M 34 | 35 | @mean.setter 36 | def mean(self, M): 37 | self._M = M 38 | 39 | @property 40 | def sum_square(self): 41 | return self._S 42 | 43 | @sum_square.setter 44 | def sum_square(self, S): 45 | self._S = S 46 | 47 | @property 48 | def var(self): 49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M) 50 | 51 | @property 52 | def std(self): 53 | return np.sqrt(self.var) 54 | 55 | @property 56 | def shape(self): 57 | return self._M.shape 58 | 59 | 60 | class ZFilter: 61 | """ 62 | y = (x-mean)/std 63 | using running estimates of mean,std 64 | """ 65 | 66 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 67 | self.demean = demean 68 | self.destd = destd 69 | self.clip = clip 70 | 71 | self.rs = RunningStat(shape) 72 | 73 | def __call__(self, x, update=True): 74 | if update: self.rs.push(x) 75 | 76 | if self.demean: 77 | x = x - self.rs.mean 78 | 79 | if self.destd: 80 | x = x / (self.rs.std + 1e-8) 81 | 82 | if self.clip: 83 | x = np.clip(x, -self.clip, self.clip) 84 | 85 | return x 86 | -------------------------------------------------------------------------------- /pendulum/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/.DS_Store -------------------------------------------------------------------------------- /pendulum/ddpg/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/.DS_Store -------------------------------------------------------------------------------- /pendulum/ddpg/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ddpg/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /pendulum/ddpg/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ddpg/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /pendulum/ddpg/model.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.relu(self.fc1(x)) 13 | x = torch.relu(self.fc2(x)) 14 | policy = self.fc3(x) 15 | 16 | return policy 17 | 18 | class Critic(nn.Module): 19 | def __init__(self, state_size, action_size, args): 20 | super(Critic, self).__init__() 21 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size) 22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 23 | self.fc3 = nn.Linear(args.hidden_size, 1) 24 | 25 | def forward(self, states, actions): 26 | x = torch.cat([states, actions], dim=1) 27 | x = torch.relu(self.fc1(x)) 28 | x = torch.relu(self.fc2(x)) 29 | q_value = self.fc3(x) 30 | 31 | return q_value -------------------------------------------------------------------------------- /pendulum/ddpg/save_model/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/save_model/.DS_Store -------------------------------------------------------------------------------- /pendulum/ddpg/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/ddpg/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from utils import * 9 | from model import Actor, Critic 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 13 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 14 | parser.add_argument('--render', action="store_true", default=True) 15 | parser.add_argument('--hidden_size', type=int, default=64) 16 | parser.add_argument('--theta', type=float, default=0.15) 17 | parser.add_argument('--mu', type=float, default=0.0) 18 | parser.add_argument('--sigma', type=float, default=0.2) 19 | parser.add_argument('--iter', type=int, default=10000) 20 | parser.add_argument('--log_interval', type=int, default=10) 21 | args = parser.parse_args() 22 | 23 | if __name__=="__main__": 24 | env = gym.make(args.env_name) 25 | env.seed(500) 26 | torch.manual_seed(500) 27 | 28 | state_size = env.observation_space.shape[0] 29 | action_size = env.action_space.shape[0] 30 | print('state size:', state_size) 31 | print('action size:', action_size) 32 | 33 | actor = Actor(state_size, action_size, args) 34 | 35 | if args.load_model is not None: 36 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 37 | pretrained_model = torch.load(pretrained_model_path) 38 | actor.load_state_dict(pretrained_model) 39 | 40 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma) 41 | steps = 0 42 | 43 | for episode in range(args.iter): 44 | done = False 45 | 
score = 0 46 | 47 | state = env.reset() 48 | state = np.reshape(state, [1, state_size]) 49 | 50 | while not done: 51 | if args.render: 52 | env.render() 53 | 54 | steps += 1 55 | 56 | policy = actor(torch.Tensor(state)) 57 | action = get_action(policy, ou_noise) 58 | 59 | next_state, reward, done, _ = env.step(action) 60 | 61 | next_state = np.reshape(next_state, [1, state_size]) 62 | state = next_state 63 | score += reward 64 | 65 | if episode % args.log_interval == 0: 66 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/ddpg/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | from collections import deque 7 | 8 | import torch 9 | import torch.optim as optim 10 | 11 | from utils import * 12 | from model import Actor, Critic 13 | from tensorboardX import SummaryWriter 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 17 | parser.add_argument('--load_model', type=str, default=None) 18 | parser.add_argument('--save_path', default='./save_model/', help='') 19 | parser.add_argument('--render', action="store_true", default=False) 20 | parser.add_argument('--gamma', type=float, default=0.99) 21 | parser.add_argument('--hidden_size', type=int, default=64) 22 | parser.add_argument('--batch_size', type=int, default=64) 23 | parser.add_argument('--actor_lr', type=float, default=1e-3) 24 | parser.add_argument('--critic_lr', type=float, default=1e-3) 25 | parser.add_argument('--theta', type=float, default=0.15) 26 | parser.add_argument('--mu', type=float, default=0.0) 27 | parser.add_argument('--sigma', type=float, default=0.2) 28 | parser.add_argument('--tau', type=float, default=0.001) 29 | parser.add_argument('--max_iter_num', type=int, default=1000) 30 | parser.add_argument('--log_interval', type=int, default=10) 31 | parser.add_argument('--goal_score', type=int, default=-300) 32 | parser.add_argument('--logdir', type=str, default='./logs', 33 | help='tensorboardx logs directory') 34 | args = parser.parse_args() 35 | 36 | def train_model(actor, critic, target_actor, target_critic, 37 | actor_optimizer, critic_optimizer, mini_batch): 38 | mini_batch = np.array(mini_batch) 39 | states = np.vstack(mini_batch[:, 0]) 40 | actions = list(mini_batch[:, 1]) 41 | rewards = list(mini_batch[:, 2]) 42 | next_states = np.vstack(mini_batch[:, 3]) 43 | masks = list(mini_batch[:, 4]) 44 | 45 | actions = torch.Tensor(actions).squeeze(1) 46 | rewards = torch.Tensor(rewards).squeeze(1) 47 | masks = torch.Tensor(masks) 48 | 49 | # update critic 50 | criterion = torch.nn.MSELoss() 51 | 52 | # get Q-value 53 | q_value = critic(torch.Tensor(states), actions).squeeze(1) 54 | 55 | # get target 56 | target_next_policy = target_actor(torch.Tensor(next_states)) 57 | target_next_q_value = target_critic(torch.Tensor(next_states), target_next_policy).squeeze(1) 58 | target = rewards + masks * args.gamma * target_next_q_value 59 | 60 | critic_loss = criterion(q_value, target.detach()) 61 | critic_optimizer.zero_grad() 62 | critic_loss.backward() 63 | critic_optimizer.step() 64 | 65 | # update actor 66 | policy = actor(torch.Tensor(states)) 67 | 68 | actor_loss = -critic(torch.Tensor(states), policy).mean() 69 | actor_optimizer.zero_grad() 70 | actor_loss.backward() 71 | actor_optimizer.step() 72 | 73 | 74 | def main(): 75 | env = 
gym.make(args.env_name) 76 | env.seed(500) 77 | torch.manual_seed(500) 78 | 79 | state_size = env.observation_space.shape[0] 80 | action_size = env.action_space.shape[0] 81 | print('state size:', state_size) 82 | print('action size:', action_size) 83 | 84 | actor = Actor(state_size, action_size, args) 85 | target_actor = Actor(state_size, action_size, args) 86 | critic = Critic(state_size, action_size, args) 87 | target_critic = Critic(state_size, action_size, args) 88 | 89 | actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr) 90 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr) 91 | 92 | hard_target_update(actor, critic, target_actor, target_critic) 93 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma) 94 | 95 | writer = SummaryWriter(args.logdir) 96 | 97 | replay_buffer = deque(maxlen=10000) 98 | recent_rewards = deque(maxlen=100) 99 | steps = 0 100 | 101 | for episode in range(args.max_iter_num): 102 | done = False 103 | score = 0 104 | 105 | state = env.reset() 106 | state = np.reshape(state, [1, state_size]) 107 | 108 | while not done: 109 | if args.render: 110 | env.render() 111 | 112 | steps += 1 113 | 114 | policy = actor(torch.Tensor(state)) 115 | action = get_action(policy, ou_noise) 116 | 117 | next_state, reward, done, _ = env.step(action) 118 | 119 | next_state = np.reshape(next_state, [1, state_size]) 120 | mask = 0 if done else 1 121 | 122 | replay_buffer.append((state, action, reward, next_state, mask)) 123 | 124 | state = next_state 125 | score += reward 126 | 127 | if steps > args.batch_size: 128 | mini_batch = random.sample(replay_buffer, args.batch_size) 129 | 130 | actor.train(), critic.train() 131 | target_actor.train(), target_critic.train() 132 | train_model(actor, critic, target_actor, target_critic, 133 | actor_optimizer, critic_optimizer, mini_batch) 134 | 135 | soft_target_update(actor, critic, target_actor, target_critic, args.tau) 136 | 137 | if done: 138 | recent_rewards.append(score) 139 | 140 | if episode % args.log_interval == 0: 141 | print('{} episode | score_avg: {:.2f}'.format(episode, np.mean(recent_rewards))) 142 | writer.add_scalar('log/score', float(score), episode) 143 | 144 | if np.mean(recent_rewards) > args.goal_score: 145 | if not os.path.isdir(args.save_path): 146 | os.makedirs(args.save_path) 147 | 148 | ckpt_path = args.save_path + 'model.pth.tar' 149 | torch.save(actor.state_dict(), ckpt_path) 150 | print('Recent rewards exceed -300. 
So end') 151 | break 152 | 153 | if __name__ == '__main__': 154 | main() -------------------------------------------------------------------------------- /pendulum/ddpg/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | class OUNoise: 5 | def __init__(self, action_size, theta, mu, sigma): 6 | self.action_size = action_size 7 | self.theta = theta 8 | self.mu = mu 9 | self.sigma = sigma 10 | self.X = np.zeros(self.action_size) 11 | 12 | def sample(self): 13 | dx = self.theta * (self.mu - self.X) 14 | dx = dx + self.sigma * np.random.randn(len(self.X)) 15 | self.X = self.X + dx 16 | 17 | return self.X 18 | 19 | def get_action(policy, ou_noise): 20 | action = policy.detach().numpy() + ou_noise.sample() 21 | 22 | return action 23 | 24 | def hard_target_update(actor, critic, target_actor, target_critic): 25 | target_critic.load_state_dict(critic.state_dict()) 26 | target_actor.load_state_dict(actor.state_dict()) 27 | 28 | def soft_target_update(actor, critic, target_actor, target_critic, tau): 29 | soft_update(critic, target_critic, tau) 30 | soft_update(actor, target_actor, tau) 31 | 32 | def soft_update(net, target_net, tau): 33 | for param, target_param in zip(net.parameters(), target_net.parameters()): 34 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) -------------------------------------------------------------------------------- /pendulum/ppo/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ppo/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ppo/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | 15 | mu = self.fc3(x) 16 | log_std = torch.zeros_like(mu) 17 | std = torch.exp(log_std) 18 | 19 | return mu, std 20 | 21 | class Critic(nn.Module): 22 | def __init__(self, state_size, args): 23 | super(Critic, self).__init__() 24 | self.fc1 = nn.Linear(state_size, args.hidden_size) 25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 26 | self.fc3 = nn.Linear(args.hidden_size, 1) 27 | 28 | def forward(self, x): 29 | x = torch.tanh(self.fc1(x)) 30 | x = torch.tanh(self.fc2(x)) 31 | value = self.fc3(x) 32 | 33 | return value 34 | -------------------------------------------------------------------------------- /pendulum/ppo/save_model/model.pth.tar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/ppo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | from utils import * 8 | from model import Actor 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.shape[0] 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | actor = Actor(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | actor.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | mu, std = actor(torch.Tensor(state)) 52 | action = get_action(mu, std) 53 | 54 | next_state, reward, done, _ = env.step(action) 55 | 56 | next_state = np.reshape(next_state, [1, state_size]) 57 | state = next_state 58 | score += reward 59 | 60 | if episode % args.log_interval == 0: 61 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/ppo/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | normal = Normal(mu, std) 7 | action = normal.sample() 8 | 9 | return action.data.numpy() 10 | 11 | def get_returns(rewards, masks, gamma): 12 | returns = torch.zeros_like(rewards) 13 | running_returns = 0 14 | 15 | for t in reversed(range(0, len(rewards))): 16 | running_returns = rewards[t] + masks[t] * gamma * running_returns 17 | returns[t] = running_returns 18 | 19 | return returns 20 | 21 | def get_log_prob(actions, mu, std): 22 | normal = Normal(mu, std) 23 | log_prob = normal.log_prob(actions) 24 | 25 | return log_prob 26 | 27 | def surrogate_loss(actor, values, targets, states, old_policy, actions, batch_index): 28 | mu, std = actor(torch.Tensor(states)) 29 | new_policy = get_log_prob(actions, mu, std) 30 | 31 | old_policy = old_policy[batch_index] 32 | ratio = torch.exp(new_policy - old_policy) 33 | 34 | advantages = targets - values 35 | 36 | surrogate_loss = ratio * advantages 37 | 38 | return surrogate_loss, ratio, advantages -------------------------------------------------------------------------------- 
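Note on the surrogate above: surrogate_loss in pendulum/ppo/utils.py returns the per-sample probability ratio and advantages and leaves clipping to the training script. The sketch below is not part of this repository; the function name clipped_ppo_loss and the clip_param value are illustrative assumptions showing how such a ratio and advantage are typically combined into the clipped PPO objective.

import torch

def clipped_ppo_loss(ratio, advantages, clip_param=0.2):
    # ratio: exp(new_log_prob - old_log_prob) per sample.
    # advantages: detached per-sample advantage estimates.
    # Bound the ratio to [1 - clip_param, 1 + clip_param], take the elementwise
    # minimum of the clipped and unclipped surrogate terms, and negate the mean
    # so the result can be minimized with a standard optimizer.
    clipped_ratio = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param)
    surrogate = torch.min(ratio * advantages, clipped_ratio * advantages)
    return -surrogate.mean()

Minimizing this loss maximizes the clipped surrogate objective; the same pattern applies to the GAE variant in pendulum/ppo_gae/utils.py below, whose surrogate_loss has the same ratio/advantage structure.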
/pendulum/ppo_gae/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ppo_gae/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/ppo_gae/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | 15 | mu = self.fc3(x) 16 | log_std = torch.zeros_like(mu) 17 | std = torch.exp(log_std) 18 | 19 | return mu, std 20 | 21 | class Critic(nn.Module): 22 | def __init__(self, state_size, args): 23 | super(Critic, self).__init__() 24 | self.fc1 = nn.Linear(state_size, args.hidden_size) 25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 26 | self.fc3 = nn.Linear(args.hidden_size, 1) 27 | 28 | def forward(self, x): 29 | x = torch.tanh(self.fc1(x)) 30 | x = torch.tanh(self.fc2(x)) 31 | value = self.fc3(x) 32 | 33 | return value 34 | -------------------------------------------------------------------------------- /pendulum/ppo_gae/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/ppo_gae/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | from utils import * 8 | from model import Actor 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.shape[0] 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | actor = Actor(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = 
torch.load(pretrained_model_path) 34 | actor.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | mu, std = actor(torch.Tensor(state)) 52 | action = get_action(mu, std) 53 | 54 | next_state, reward, done, _ = env.step(action) 55 | 56 | next_state = np.reshape(next_state, [1, state_size]) 57 | state = next_state 58 | score += reward 59 | 60 | if episode % args.log_interval == 0: 61 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/ppo_gae/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | normal = Normal(mu, std) 7 | action = normal.sample() 8 | 9 | return action.data.numpy() 10 | 11 | def get_gae(rewards, masks, values, args): 12 | returns = torch.zeros_like(rewards) 13 | advantages = torch.zeros_like(rewards) 14 | 15 | running_returns = 0 16 | previous_value = 0 17 | running_advants = 0 18 | 19 | for t in reversed(range(0, len(rewards))): 20 | # return 21 | running_returns = rewards[t] + masks[t] * args.gamma * running_returns 22 | returns[t] = running_returns 23 | 24 | # advantage 25 | running_deltas = rewards[t] + masks[t] * args.gamma * previous_value - values.data[t] 26 | running_advants = running_deltas + masks[t] * args.gamma * args.lamda * running_advants 27 | 28 | previous_value = values.data[t] 29 | advantages[t] = running_advants 30 | 31 | advantages = (advantages - advantages.mean()) / advantages.std() 32 | 33 | return returns, advantages 34 | 35 | def get_log_prob(actions, mu, std): 36 | normal = Normal(mu, std) 37 | log_prob = normal.log_prob(actions) 38 | 39 | return log_prob 40 | 41 | def surrogate_loss(actor, advantages, states, old_policy, actions, batch_index): 42 | mu, std = actor(torch.Tensor(states)) 43 | new_policy = get_log_prob(actions, mu, std) 44 | 45 | old_policy = old_policy[batch_index] 46 | 47 | ratio = torch.exp(new_policy - old_policy) 48 | surrogate_loss = ratio * advantages 49 | 50 | return surrogate_loss, ratio -------------------------------------------------------------------------------- /pendulum/sac/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/sac/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/sac/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args, log_std_min=-20, log_std_max=2): 6 | super(Actor, self).__init__() 7 | self.log_std_min = log_std_min 8 | 
self.log_std_max = log_std_max 9 | 10 | self.fc1 = nn.Linear(state_size, args.hidden_size) 11 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 12 | 13 | self.fc3 = nn.Linear(args.hidden_size, action_size) 14 | self.fc4 = nn.Linear(args.hidden_size, action_size) 15 | 16 | def forward(self, x): 17 | x = torch.relu(self.fc1(x)) 18 | x = torch.relu(self.fc2(x)) 19 | 20 | mu = self.fc3(x) 21 | log_std = self.fc4(x) 22 | 23 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max) 24 | std = torch.exp(log_std) 25 | 26 | return mu, std 27 | 28 | class Critic(nn.Module): 29 | def __init__(self, state_size, action_size, args): 30 | super(Critic, self).__init__() 31 | 32 | # Q1 architecture 33 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size) 34 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 35 | self.fc3 = nn.Linear(args.hidden_size, 1) 36 | 37 | # Q2 architecture 38 | self.fc4 = nn.Linear(state_size + action_size, args.hidden_size) 39 | self.fc5 = nn.Linear(args.hidden_size, args.hidden_size) 40 | self.fc6 = nn.Linear(args.hidden_size, 1) 41 | 42 | def forward(self, states, actions): 43 | x = torch.cat([states, actions], dim=1) 44 | 45 | x1 = torch.relu(self.fc1(x)) 46 | x1 = torch.relu(self.fc2(x1)) 47 | q_value1 = self.fc3(x1) 48 | 49 | x2 = torch.relu(self.fc4(x)) 50 | x2 = torch.relu(self.fc5(x2)) 51 | q_value2 = self.fc6(x2) 52 | 53 | return q_value1, q_value2 -------------------------------------------------------------------------------- /pendulum/sac/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/sac/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import random 4 | import argparse 5 | import numpy as np 6 | 7 | import torch 8 | from utils import * 9 | from model import Actor, Critic 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 13 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 14 | parser.add_argument('--render', action="store_true", default=True) 15 | parser.add_argument('--hidden_size', type=int, default=64) 16 | parser.add_argument('--iter', type=int, default=10000) 17 | parser.add_argument('--log_interval', type=int, default=10) 18 | args = parser.parse_args() 19 | 20 | if __name__=="__main__": 21 | env = gym.make(args.env_name) 22 | env.seed(500) 23 | torch.manual_seed(500) 24 | 25 | state_size = env.observation_space.shape[0] 26 | action_size = env.action_space.shape[0] 27 | print('state size:', state_size) 28 | print('action size:', action_size) 29 | 30 | actor = Actor(state_size, action_size, args) 31 | 32 | if args.load_model is not None: 33 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 34 | pretrained_model = torch.load(pretrained_model_path) 35 | actor.load_state_dict(pretrained_model) 36 | 37 | steps = 0 38 | 39 | for episode in range(args.iter): 40 | done = False 41 | score = 0 42 | 43 | state = env.reset() 44 | state = np.reshape(state, [1, state_size]) 45 | 46 | while not done: 47 | if args.render: 48 | env.render() 49 | 50 | steps += 1 51 | 52 | mu, std = actor(torch.Tensor(state)) 53 | action = 
get_action(mu, std) 54 | 55 | next_state, reward, done, _ = env.step(action) 56 | 57 | next_state = np.reshape(next_state, [1, state_size]) 58 | state = next_state 59 | score += reward 60 | 61 | if episode % args.log_interval == 0: 62 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/sac/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.distributions import Normal 3 | 4 | def get_action(mu, std): 5 | normal = Normal(mu, std) 6 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1)) 7 | action = torch.tanh(z) 8 | 9 | return action.data.numpy() 10 | 11 | def eval_action(mu, std, epsilon=1e-6): 12 | normal = Normal(mu, std) 13 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1)) 14 | action = torch.tanh(z) 15 | log_prob = normal.log_prob(z) 16 | 17 | # Enforcing Action Bounds 18 | log_prob -= torch.log(1 - action.pow(2) + epsilon) 19 | log_policy = log_prob.sum(1, keepdim=True) 20 | 21 | return action, log_policy 22 | 23 | def hard_target_update(net, target_net): 24 | target_net.load_state_dict(net.state_dict()) 25 | 26 | def soft_target_update(net, target_net, tau): 27 | for param, target_param in zip(net.parameters(), target_net.parameters()): 28 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data) -------------------------------------------------------------------------------- /pendulum/tnpg/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/tnpg/__pycache__/tnpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/tnpg.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/tnpg/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/tnpg/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | 15 | mu = self.fc3(x) 16 | log_std = torch.zeros_like(mu) 17 | std = torch.exp(log_std) 18 | 19 | return mu, std 20 | 21 | class Critic(nn.Module): 22 | def __init__(self, state_size, args): 23 | super(Critic, self).__init__() 24 | self.fc1 = nn.Linear(state_size, args.hidden_size) 25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 26 | 
self.fc3 = nn.Linear(args.hidden_size, 1) 27 | 28 | def forward(self, x): 29 | x = torch.tanh(self.fc1(x)) 30 | x = torch.tanh(self.fc2(x)) 31 | value = self.fc3(x) 32 | 33 | return value -------------------------------------------------------------------------------- /pendulum/tnpg/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/tnpg/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | from utils import * 8 | from model import Actor 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.shape[0] 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | actor = Actor(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | actor.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | mu, std = actor(torch.Tensor(state)) 52 | action = get_action(mu, std) 53 | 54 | next_state, reward, done, _ = env.step(action) 55 | 56 | next_state = np.reshape(next_state, [1, state_size]) 57 | state = next_state 58 | score += reward 59 | 60 | if episode % args.log_interval == 0: 61 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/tnpg/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | from collections import deque 6 | 7 | import torch 8 | import torch.optim as optim 9 | 10 | from utils import * 11 | from model import Actor, Critic 12 | from tensorboardX import SummaryWriter 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 16 | parser.add_argument('--load_model', type=str, default=None) 17 | parser.add_argument('--save_path', default='./save_model/', help='') 18 | parser.add_argument('--render', action="store_true", default=False) 19 | parser.add_argument('--gamma', type=float, default=0.99) 20 | parser.add_argument('--hidden_size', type=int, default=64) 21 | parser.add_argument('--critic_lr', type=float, default=1e-3) 22 | 
parser.add_argument('--max_kl', type=float, default=1e-2) 23 | parser.add_argument('--max_iter_num', type=int, default=1000) 24 | parser.add_argument('--total_sample_size', type=int, default=2048) 25 | parser.add_argument('--log_interval', type=int, default=5) 26 | parser.add_argument('--goal_score', type=int, default=-300) 27 | parser.add_argument('--logdir', type=str, default='./logs', 28 | help='tensorboardx logs directory') 29 | args = parser.parse_args() 30 | 31 | def train_model(actor, critic, critic_optimizer, trajectories): 32 | trajectories = np.array(trajectories) 33 | states = np.vstack(trajectories[:, 0]) 34 | actions = list(trajectories[:, 1]) 35 | rewards = list(trajectories[:, 2]) 36 | masks = list(trajectories[:, 3]) 37 | 38 | actions = torch.Tensor(actions).squeeze(1) 39 | rewards = torch.Tensor(rewards).squeeze(1) 40 | masks = torch.Tensor(masks) 41 | 42 | # ---------------------------- 43 | # step 1: get returns 44 | returns = get_returns(rewards, masks, args.gamma) 45 | 46 | # ---------------------------- 47 | # step 2: update ciritic 48 | criterion = torch.nn.MSELoss() 49 | 50 | values = critic(torch.Tensor(states)) 51 | targets = returns.unsqueeze(1) 52 | 53 | critic_loss = criterion(values, targets) 54 | critic_optimizer.zero_grad() 55 | critic_loss.backward() 56 | critic_optimizer.step() 57 | 58 | # ---------------------------- 59 | # step 3: get gradient of actor loss 60 | mu, std = actor(torch.Tensor(states)) 61 | log_policy = get_log_prob(actions, mu, std) 62 | actor_loss = get_loss(actor, values, targets, log_policy) 63 | 64 | actor_loss_grad = torch.autograd.grad(actor_loss, actor.parameters()) 65 | actor_loss_grad = flat_grad(actor_loss_grad) 66 | 67 | # ---------------------------- 68 | # step 4: get search direction through conjugate gradient method 69 | search_dir = conjugate_gradient(actor, states, actor_loss_grad.data, nsteps=10) 70 | 71 | # ---------------------------- 72 | # step 5: get step size and maximal step 73 | gHg = (hessian_vector_product(actor, states, search_dir) * search_dir).sum(0, keepdim=True) 74 | step_size = torch.sqrt(2 * args.max_kl / gHg)[0] 75 | maximal_step = step_size * search_dir 76 | 77 | # ---------------------------- 78 | # step 6: update actor 79 | params = flat_params(actor) 80 | 81 | new_params = params + maximal_step 82 | update_model(actor, new_params) 83 | 84 | 85 | def main(): 86 | env = gym.make(args.env_name) 87 | env.seed(500) 88 | torch.manual_seed(500) 89 | 90 | state_size = env.observation_space.shape[0] 91 | action_size = env.action_space.shape[0] 92 | print('state size:', state_size) 93 | print('action size:', action_size) 94 | 95 | actor = Actor(state_size, action_size, args) 96 | critic = Critic(state_size, args) 97 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr) 98 | 99 | writer = SummaryWriter(args.logdir) 100 | 101 | recent_rewards = deque(maxlen=100) 102 | episodes = 0 103 | 104 | for iter in range(args.max_iter_num): 105 | trajectories = deque() 106 | steps = 0 107 | 108 | while steps < args.total_sample_size: 109 | done = False 110 | score = 0 111 | episodes += 1 112 | 113 | state = env.reset() 114 | state = np.reshape(state, [1, state_size]) 115 | 116 | while not done: 117 | if args.render: 118 | env.render() 119 | 120 | steps += 1 121 | 122 | mu, std = actor(torch.Tensor(state)) 123 | action = get_action(mu, std) 124 | 125 | next_state, reward, done, _ = env.step(action) 126 | 127 | mask = 0 if done else 1 128 | 129 | trajectories.append((state, action, reward, mask)) 130 | 131 
| next_state = np.reshape(next_state, [1, state_size]) 132 | state = next_state 133 | score += reward 134 | 135 | if done: 136 | recent_rewards.append(score) 137 | 138 | actor.train() 139 | train_model(actor, critic, critic_optimizer, trajectories) 140 | 141 | writer.add_scalar('log/score', float(score), episodes) 142 | 143 | if iter % args.log_interval == 0: 144 | print('{} iter | {} episode | score_avg: {:.2f}'.format(iter, episodes, np.mean(recent_rewards))) 145 | 146 | if np.mean(recent_rewards) > args.goal_score: 147 | if not os.path.isdir(args.save_path): 148 | os.makedirs(args.save_path) 149 | 150 | ckpt_path = args.save_path + 'model.pth.tar' 151 | torch.save(actor.state_dict(), ckpt_path) 152 | print('Recent rewards exceed -300. So end') 153 | break 154 | 155 | if __name__ == '__main__': 156 | main() -------------------------------------------------------------------------------- /pendulum/tnpg/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | normal = Normal(mu, std) 7 | action = normal.sample() 8 | 9 | return action.data.numpy() 10 | 11 | def get_returns(rewards, masks, gamma): 12 | returns = torch.zeros_like(rewards) 13 | running_returns = 0 14 | 15 | for t in reversed(range(0, len(rewards))): 16 | running_returns = rewards[t] + masks[t] * gamma * running_returns 17 | returns[t] = running_returns 18 | 19 | returns = (returns - returns.mean()) / returns.std() 20 | 21 | return returns 22 | 23 | def get_loss(actor, values, targets, log_policy): 24 | advantages = targets - values 25 | 26 | loss = log_policy * advantages 27 | loss = loss.mean() 28 | 29 | return loss 30 | 31 | def get_log_prob(actions, mu, std): 32 | normal = Normal(mu, std) 33 | log_prob = normal.log_prob(actions) 34 | 35 | return log_prob 36 | 37 | 38 | # from openai baseline code 39 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py 40 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10): 41 | x = torch.zeros(b.size()) 42 | r = b.clone() 43 | p = b.clone() 44 | rdotr = torch.dot(r, r) 45 | 46 | for i in range(nsteps): # nsteps = 10 47 | f_Ax = hessian_vector_product(actor, states, p, cg_damping=1e-1) 48 | alpha = rdotr / torch.dot(p, f_Ax) 49 | 50 | x += alpha * p 51 | r -= alpha * f_Ax 52 | 53 | new_rdotr = torch.dot(r, r) 54 | betta = new_rdotr / rdotr 55 | 56 | p = r + betta * p 57 | rdotr = new_rdotr 58 | 59 | if rdotr < residual_tol: # residual_tol = 0.0000000001 60 | break 61 | 62 | return x 63 | 64 | def hessian_vector_product(actor, states, p, cg_damping=1e-1): 65 | p.detach() 66 | kl = kl_divergence(new_actor=actor, old_actor=actor, states=states) 67 | kl = kl.mean() 68 | 69 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True) 70 | kl_grad = flat_grad(kl_grad) 71 | 72 | kl_grad_p = (kl_grad * p).sum() 73 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters()) 74 | kl_hessian = flat_hessian(kl_hessian) 75 | 76 | return kl_hessian + p * cg_damping # cg_damping = 0.1 77 | 78 | def kl_divergence(new_actor, old_actor, states): 79 | mu, std = new_actor(torch.Tensor(states)) 80 | 81 | mu_old, std_old = old_actor(torch.Tensor(states)) 82 | mu_old = mu_old.detach() 83 | std_old = std_old.detach() 84 | 85 | # kl divergence between old policy and new policy : D( pi_old || pi_new ) 86 | # pi_old -> mu_old, std_old / pi_new -> mu, std 87 | # be careful of calculating KL-divergence. 
It is not symmetric metric. 88 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5 89 | 90 | return kl.sum(1, keepdim=True) 91 | 92 | 93 | def flat_grad(grads): 94 | grad_flatten = [] 95 | for grad in grads: 96 | grad_flatten.append(grad.view(-1)) 97 | grad_flatten = torch.cat(grad_flatten) 98 | 99 | return grad_flatten 100 | 101 | def flat_hessian(hessians): 102 | hessians_flatten = [] 103 | for hessian in hessians: 104 | hessians_flatten.append(hessian.contiguous().view(-1)) 105 | hessians_flatten = torch.cat(hessians_flatten).data 106 | 107 | return hessians_flatten 108 | 109 | 110 | def flat_params(model): 111 | params = [] 112 | for param in model.parameters(): 113 | params.append(param.data.view(-1)) 114 | params_flatten = torch.cat(params) 115 | 116 | return params_flatten 117 | 118 | def update_model(model, new_params): 119 | index = 0 120 | for params in model.parameters(): 121 | params_length = len(params.view(-1)) 122 | new_param = new_params[index: index + params_length] 123 | new_param = new_param.view(params.size()) 124 | params.data.copy_(new_param) 125 | index += params_length -------------------------------------------------------------------------------- /pendulum/trpo/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/.DS_Store -------------------------------------------------------------------------------- /pendulum/trpo/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/trpo/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/trpo/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | 15 | mu = self.fc3(x) 16 | log_std = torch.zeros_like(mu) 17 | std = torch.exp(log_std) 18 | 19 | return mu, std 20 | 21 | class Critic(nn.Module): 22 | def __init__(self, state_size, args): 23 | super(Critic, self).__init__() 24 | self.fc1 = nn.Linear(state_size, args.hidden_size) 25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 26 | self.fc3 = nn.Linear(args.hidden_size, 1) 27 | 28 | def forward(self, x): 29 | x = torch.tanh(self.fc1(x)) 30 | x = torch.tanh(self.fc2(x)) 31 | value = self.fc3(x) 32 | 33 | return value -------------------------------------------------------------------------------- /pendulum/trpo/save_model/model.pth.tar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/trpo/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | from utils import * 8 | from model import Actor 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = env.observation_space.shape[0] 25 | action_size = env.action_space.shape[0] 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | actor = Actor(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | actor.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | mu, std = actor(torch.Tensor(state)) 52 | action = get_action(mu, std) 53 | 54 | next_state, reward, done, _ = env.step(action) 55 | 56 | next_state = np.reshape(next_state, [1, state_size]) 57 | state = next_state 58 | score += reward 59 | 60 | if episode % args.log_interval == 0: 61 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/trpo/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | normal = Normal(mu, std) 7 | action = normal.sample() 8 | 9 | return action.data.numpy() 10 | 11 | def get_returns(rewards, masks, gamma): 12 | returns = torch.zeros_like(rewards) 13 | running_returns = 0 14 | 15 | for t in reversed(range(0, len(rewards))): 16 | running_returns = rewards[t] + masks[t] * gamma * running_returns 17 | returns[t] = running_returns 18 | 19 | returns = (returns - returns.mean()) / returns.std() 20 | 21 | return returns 22 | 23 | def get_log_prob(actions, mu, std): 24 | normal = Normal(mu, std) 25 | log_prob = normal.log_prob(actions) 26 | 27 | return log_prob 28 | 29 | def surrogate_loss(actor, values, targets, states, old_policy, actions): 30 | mu, std = actor(torch.Tensor(states)) 31 | new_policy = get_log_prob(actions, mu, std) 32 | 33 | advantages = targets - values 34 | 35 | surrogate_loss = torch.exp(new_policy - old_policy) * advantages 36 | surrogate_loss = surrogate_loss.mean() 37 | 38 | return surrogate_loss 39 | 40 | 41 | # from openai baseline 
code 42 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py 43 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10): 44 | x = torch.zeros(b.size()) 45 | r = b.clone() 46 | p = b.clone() 47 | rdotr = torch.dot(r, r) 48 | 49 | for i in range(nsteps): # nsteps = 10 50 | Ap = hessian_vector_product(actor, states, p, cg_damping=1e-1) 51 | alpha = rdotr / torch.dot(p, Ap) 52 | 53 | x += alpha * p 54 | r -= alpha * Ap 55 | 56 | new_rdotr = torch.dot(r, r) 57 | betta = new_rdotr / rdotr 58 | 59 | p = r + betta * p 60 | rdotr = new_rdotr 61 | 62 | if rdotr < residual_tol: # residual_tol = 0.0000000001 63 | break 64 | return x 65 | 66 | def hessian_vector_product(actor, states, p, cg_damping=1e-1): 67 | p.detach() 68 | kl = kl_divergence(new_actor=actor, old_actor=actor, states=states) 69 | kl = kl.mean() 70 | 71 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True) 72 | kl_grad = flat_grad(kl_grad) 73 | 74 | kl_grad_p = (kl_grad * p).sum() 75 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters()) 76 | kl_hessian = flat_hessian(kl_hessian) 77 | 78 | return kl_hessian + p * cg_damping 79 | 80 | def kl_divergence(new_actor, old_actor, states): 81 | mu, std = new_actor(torch.Tensor(states)) 82 | 83 | mu_old, std_old = old_actor(torch.Tensor(states)) 84 | mu_old = mu_old.detach() 85 | std_old = std_old.detach() 86 | 87 | # kl divergence between old policy and new policy : D( pi_old || pi_new ) 88 | # pi_old -> mu_old, std_old / pi_new -> mu, std 89 | # be careful of calculating KL-divergence. It is not symmetric metric. 90 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5 91 | return kl.sum(1, keepdim=True) 92 | 93 | 94 | def flat_grad(grads): 95 | grad_flatten = [] 96 | for grad in grads: 97 | grad_flatten.append(grad.view(-1)) 98 | grad_flatten = torch.cat(grad_flatten) 99 | return grad_flatten 100 | 101 | def flat_hessian(hessians): 102 | hessians_flatten = [] 103 | for hessian in hessians: 104 | hessians_flatten.append(hessian.contiguous().view(-1)) 105 | hessians_flatten = torch.cat(hessians_flatten).data 106 | return hessians_flatten 107 | 108 | 109 | def flat_params(model): 110 | params = [] 111 | for param in model.parameters(): 112 | params.append(param.data.view(-1)) 113 | params_flatten = torch.cat(params) 114 | return params_flatten 115 | 116 | def update_model(model, new_params): 117 | index = 0 118 | for params in model.parameters(): 119 | params_length = len(params.view(-1)) 120 | new_param = new_params[index : index + params_length] 121 | new_param = new_param.view(params.size()) 122 | params.data.copy_(new_param) 123 | index += params_length 124 | 125 | 126 | def backtracking_line_search(old_actor, actor, actor_loss, actor_loss_grad, 127 | old_policy, params, maximal_step, max_kl, 128 | values, targets, states, actions): 129 | backtrac_coef = 1.0 130 | alpha = 0.5 131 | beta = 0.5 132 | flag = False 133 | 134 | expected_improve = (actor_loss_grad * maximal_step).sum(0, keepdim=True) 135 | 136 | for i in range(10): 137 | new_params = params + backtrac_coef * maximal_step 138 | update_model(actor, new_params) 139 | 140 | new_actor_loss = surrogate_loss(actor, values, targets, states, old_policy.detach(), actions) 141 | 142 | loss_improve = new_actor_loss - actor_loss 143 | expected_improve *= backtrac_coef 144 | improve_condition = loss_improve / expected_improve 145 | 146 | kl = kl_divergence(new_actor=actor, old_actor=old_actor, states=states) 147 | kl = kl.mean() 
148 | 149 | if kl < max_kl and improve_condition > alpha: 150 | flag = True 151 | break 152 | 153 | backtrac_coef *= beta 154 | 155 | if not flag: 156 | params = flat_params(old_actor) 157 | update_model(actor, params) 158 | print('policy update does not impove the surrogate') -------------------------------------------------------------------------------- /pendulum/trpo_gae/__pycache__/model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/__pycache__/model.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/trpo_gae/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /pendulum/trpo_gae/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class Actor(nn.Module): 5 | def __init__(self, state_size, action_size, args): 6 | super(Actor, self).__init__() 7 | self.fc1 = nn.Linear(state_size, args.hidden_size) 8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 9 | self.fc3 = nn.Linear(args.hidden_size, action_size) 10 | 11 | def forward(self, x): 12 | x = torch.tanh(self.fc1(x)) 13 | x = torch.tanh(self.fc2(x)) 14 | 15 | mu = self.fc3(x) 16 | log_std = torch.zeros_like(mu) 17 | std = torch.exp(log_std) 18 | 19 | return mu, std 20 | 21 | class Critic(nn.Module): 22 | def __init__(self, state_size, args): 23 | super(Critic, self).__init__() 24 | self.fc1 = nn.Linear(state_size, args.hidden_size) 25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size) 26 | self.fc3 = nn.Linear(args.hidden_size, 1) 27 | 28 | def forward(self, x): 29 | x = torch.tanh(self.fc1(x)) 30 | x = torch.tanh(self.fc2(x)) 31 | value = self.fc3(x) 32 | 33 | return value 34 | -------------------------------------------------------------------------------- /pendulum/trpo_gae/save_model/model.pth.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/save_model/model.pth.tar -------------------------------------------------------------------------------- /pendulum/trpo_gae/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import argparse 4 | import numpy as np 5 | 6 | import torch 7 | from utils import * 8 | from model import Actor 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0") 12 | parser.add_argument("--load_model", type=str, default='model.pth.tar') 13 | parser.add_argument('--render', action="store_true", default=True) 14 | parser.add_argument('--hidden_size', type=int, default=64) 15 | parser.add_argument('--iter', type=int, default=10000) 16 | parser.add_argument('--log_interval', type=int, default=10) 17 | args = parser.parse_args() 18 | 19 | if __name__=="__main__": 20 | env = gym.make(args.env_name) 21 | env.seed(500) 22 | torch.manual_seed(500) 23 | 24 | state_size = 
env.observation_space.shape[0] 25 | action_size = env.action_space.shape[0] 26 | print('state size:', state_size) 27 | print('action size:', action_size) 28 | 29 | actor = Actor(state_size, action_size, args) 30 | 31 | if args.load_model is not None: 32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model)) 33 | pretrained_model = torch.load(pretrained_model_path) 34 | actor.load_state_dict(pretrained_model) 35 | 36 | steps = 0 37 | 38 | for episode in range(args.iter): 39 | done = False 40 | score = 0 41 | 42 | state = env.reset() 43 | state = np.reshape(state, [1, state_size]) 44 | 45 | while not done: 46 | if args.render: 47 | env.render() 48 | 49 | steps += 1 50 | 51 | mu, std = actor(torch.Tensor(state)) 52 | action = get_action(mu, std) 53 | 54 | next_state, reward, done, _ = env.step(action) 55 | 56 | next_state = np.reshape(next_state, [1, state_size]) 57 | state = next_state 58 | score += reward 59 | 60 | if episode % args.log_interval == 0: 61 | print('{} episode | score: {:.2f}'.format(episode, score[0])) -------------------------------------------------------------------------------- /pendulum/trpo_gae/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | from torch.distributions import Normal 4 | 5 | def get_action(mu, std): 6 | normal = Normal(mu, std) 7 | action = normal.sample() 8 | 9 | return action.data.numpy() 10 | 11 | def get_gae(rewards, masks, values, args): 12 | returns = torch.zeros_like(rewards) 13 | advantages = torch.zeros_like(rewards) 14 | 15 | running_returns = 0 16 | previous_value = 0 17 | running_advants = 0 18 | 19 | for t in reversed(range(0, len(rewards))): 20 | # return 21 | running_returns = rewards[t] + masks[t] * args.gamma * running_returns 22 | returns[t] = running_returns 23 | 24 | # advantage 25 | running_deltas = rewards[t] + masks[t] * args.gamma * previous_value - values.data[t] 26 | running_advants = running_deltas + masks[t] * args.gamma * args.lamda * running_advants 27 | 28 | previous_value = values.data[t] 29 | advantages[t] = running_advants 30 | 31 | advantages = (advantages - advantages.mean()) / advantages.std() 32 | 33 | return returns, advantages 34 | 35 | def get_log_prob(actions, mu, std): 36 | normal = Normal(mu, std) 37 | log_prob = normal.log_prob(actions) 38 | 39 | return log_prob 40 | 41 | def surrogate_loss(actor, advantages, states, old_policy, actions): 42 | mu, std = actor(torch.Tensor(states)) 43 | new_policy = get_log_prob(actions, mu, std) 44 | 45 | advantages = advantages.unsqueeze(1) 46 | 47 | surrogate_loss = torch.exp(new_policy - old_policy) * advantages 48 | surrogate_loss = surrogate_loss.mean() 49 | 50 | return surrogate_loss 51 | 52 | 53 | # from openai baseline code 54 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py 55 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10): 56 | x = torch.zeros(b.size()) 57 | r = b.clone() 58 | p = b.clone() 59 | rdotr = torch.dot(r, r) 60 | 61 | for i in range(nsteps): # nsteps = 10 62 | Ap = hessian_vector_product(actor, states, p, cg_damping=1e-1) 63 | alpha = rdotr / torch.dot(p, Ap) 64 | 65 | x += alpha * p 66 | r -= alpha * Ap 67 | 68 | new_rdotr = torch.dot(r, r) 69 | betta = new_rdotr / rdotr 70 | 71 | p = r + betta * p 72 | rdotr = new_rdotr 73 | 74 | if rdotr < residual_tol: # residual_tol = 0.0000000001 75 | break 76 | return x 77 | 78 | def hessian_vector_product(actor, states, p, cg_damping=1e-1): 79 | p.detach() 
80 |     kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
81 |     kl = kl.mean()
82 | 
83 |     kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
84 |     kl_grad = flat_grad(kl_grad)
85 | 
86 |     kl_grad_p = (kl_grad * p).sum()
87 |     kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters())
88 |     kl_hessian = flat_hessian(kl_hessian)
89 | 
90 |     return kl_hessian + p * cg_damping  # cg_damping = 0.1
91 | 
92 | def kl_divergence(new_actor, old_actor, states):
93 |     mu, std = new_actor(torch.Tensor(states))
94 | 
95 |     mu_old, std_old = old_actor(torch.Tensor(states))
96 |     mu_old = mu_old.detach()
97 |     std_old = std_old.detach()
98 | 
99 |     # kl divergence between old policy and new policy : D( pi_old || pi_new )
100 |     # pi_old -> mu_old, std_old / pi_new -> mu, std
101 |     # Be careful when computing the KL-divergence: it is not a symmetric metric.
102 |     kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
103 |     return kl.sum(1, keepdim=True)
104 | 
105 | 
106 | def flat_grad(grads):
107 |     grad_flatten = []
108 |     for grad in grads:
109 |         grad_flatten.append(grad.view(-1))
110 |     grad_flatten = torch.cat(grad_flatten)
111 |     return grad_flatten
112 | 
113 | def flat_hessian(hessians):
114 |     hessians_flatten = []
115 |     for hessian in hessians:
116 |         hessians_flatten.append(hessian.contiguous().view(-1))
117 |     hessians_flatten = torch.cat(hessians_flatten).data
118 |     return hessians_flatten
119 | 
120 | 
121 | def flat_params(model):
122 |     params = []
123 |     for param in model.parameters():
124 |         params.append(param.data.view(-1))
125 |     params_flatten = torch.cat(params)
126 |     return params_flatten
127 | 
128 | def update_model(model, new_params):
129 |     index = 0
130 |     for params in model.parameters():
131 |         params_length = len(params.view(-1))
132 |         new_param = new_params[index: index + params_length]
133 |         new_param = new_param.view(params.size())
134 |         params.data.copy_(new_param)
135 |         index += params_length
136 | 
137 | 
138 | def backtracking_line_search(old_actor, actor, actor_loss, actor_loss_grad,
139 |                              old_policy, params, maximal_step, max_kl,
140 |                              advantages, states, actions):
141 |     backtrac_coef = 1.0
142 |     alpha = 0.5
143 |     beta = 0.5
144 |     flag = False
145 | 
146 |     expected_improve = (actor_loss_grad * maximal_step).sum(0, keepdim=True)
147 | 
148 |     for i in range(10):
149 |         new_params = params + backtrac_coef * maximal_step
150 |         update_model(actor, new_params)
151 | 
152 |         new_actor_loss = surrogate_loss(actor, advantages, states, old_policy.detach(), actions)
153 | 
154 |         loss_improve = new_actor_loss - actor_loss
155 |         expected_improve *= backtrac_coef
156 |         improve_condition = loss_improve / expected_improve
157 | 
158 |         kl = kl_divergence(new_actor=actor, old_actor=old_actor, states=states)
159 |         kl = kl.mean()
160 | 
161 |         if kl < max_kl and improve_condition > alpha:
162 |             flag = True
163 |             break
164 | 
165 |         backtrac_coef *= beta
166 | 
167 |     if not flag:
168 |         params = flat_params(old_actor)
169 |         update_model(actor, params)
170 |         print('policy update does not improve the surrogate')
--------------------------------------------------------------------------------
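The comment in kl_divergence is worth spelling out. For the diagonal Gaussian policies used in these pendulum implementations, the expression assigned to kl is the closed-form divergence

D_{KL}(\pi_{old} \,\|\, \pi_{new}) = \sum_i \Big[ \log\frac{\sigma_i}{\sigma_{old,i}} + \frac{\sigma_{old,i}^2 + (\mu_{old,i} - \mu_i)^2}{2\sigma_i^2} - \frac{1}{2} \Big]

summed over action dimensions, where (mu_old, std_old) come from the detached old actor and (mu, std) from the current one. Because the KL divergence is not symmetric, swapping the two policies would change both this value and the Hessian-vector products built on it in hessian_vector_product.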