├── .DS_Store
├── LICENSE
├── README.md
├── cartpole
│   ├── .DS_Store
│   ├── a2c
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   └── model.cpython-36.pyc
│   │   ├── model.py
│   │   ├── save_model
│   │   │   └── model.pth.tar
│   │   ├── test.py
│   │   └── train.py
│   ├── ddqn
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   └── model.cpython-36.pyc
│   │   ├── model.py
│   │   ├── save_model
│   │   │   └── model.pth.tar
│   │   ├── test.py
│   │   └── train.py
│   └── dqn
│       ├── __pycache__
│       │   ├── model.cpython-36.pyc
│       │   └── model.cpython-37.pyc
│       ├── logs
│       │   └── .DS_Store
│       ├── model.py
│       ├── save_model
│       │   └── model.pth.tar
│       ├── test.py
│       └── train.py
├── img
│   ├── .DS_Store
│   ├── cartpole.png
│   └── pendulum.png
├── mountaincar
│   ├── .DS_Store
│   ├── app
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── app.cpython-36.pyc
│   │   │   └── train.cpython-36.pyc
│   │   ├── app.py
│   │   ├── expert_demo
│   │   │   ├── .DS_Store
│   │   │   ├── expert_demo.npy
│   │   │   └── make_expert.py
│   │   ├── learning_curves
│   │   │   ├── .DS_Store
│   │   │   └── app_eps_60000.png
│   │   ├── results
│   │   │   ├── .DS_Store
│   │   │   ├── app_q_table.npy
│   │   │   └── test_rendering_60000.gif
│   │   ├── test.py
│   │   └── train.py
│   ├── ddpg
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   └── utils.cpython-36.pyc
│   │   ├── model.py
│   │   ├── save_model
│   │   │   └── model.pth.tar
│   │   ├── test.py
│   │   ├── train.py
│   │   └── utils.py
│   ├── maxent
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── maxent.cpython-36.pyc
│   │   │   ├── maxent_train.cpython-36.pyc
│   │   │   └── train.cpython-36.pyc
│   │   ├── expert_demo
│   │   │   ├── .DS_Store
│   │   │   ├── expert_demo.npy
│   │   │   └── make_expert.py
│   │   ├── learning_curves
│   │   │   ├── .DS_Store
│   │   │   └── maxent_eps_30000.png
│   │   ├── maxent.py
│   │   ├── results
│   │   │   ├── .DS_Store
│   │   │   ├── maxent_q_table.npy
│   │   │   └── test_rendering_30000.gif
│   │   ├── test.py
│   │   └── train.py
│   └── sac
│       ├── .DS_Store
│       ├── __pycache__
│       │   ├── model.cpython-36.pyc
│       │   └── utils.cpython-36.pyc
│       ├── model.py
│       ├── test.py
│       ├── train.py
│       └── utils.py
├── mujoco
│   ├── .DS_Store
│   ├── gail
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── hparams.cpython-36.pyc
│   │   │   ├── model.cpython-36.pyc
│   │   │   └── train_model.cpython-36.pyc
│   │   ├── expert_demo
│   │   │   └── expert_demo.p
│   │   ├── logs
│   │   │   └── .DS_Store
│   │   ├── main.py
│   │   ├── model.py
│   │   ├── save_model
│   │   │   └── .DS_Store
│   │   ├── test.py
│   │   ├── train_model.py
│   │   └── utils
│   │       ├── __pycache__
│   │       │   ├── running_state.cpython-36.pyc
│   │       │   ├── utils.cpython-36.pyc
│   │       │   └── zfilter.cpython-36.pyc
│   │       ├── utils.py
│   │       └── zfilter.py
│   ├── ppo
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── hparams.cpython-36.pyc
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── ppo.cpython-36.pyc
│   │   │   └── train_model.cpython-36.pyc
│   │   ├── logs
│   │   │   └── .DS_Store
│   │   ├── main.py
│   │   ├── model.py
│   │   ├── ppo.py
│   │   ├── save_model
│   │   │   └── .DS_Store
│   │   ├── test.py
│   │   └── utils
│   │       ├── __pycache__
│   │       │   ├── running_state.cpython-36.pyc
│   │       │   ├── utils.cpython-36.pyc
│   │       │   └── zfilter.cpython-36.pyc
│   │       ├── utils.py
│   │       └── zfilter.py
│   ├── tnpg
│   │   ├── .DS_Store
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   ├── tnpg.cpython-36.pyc
│   │   │   └── trpo.cpython-36.pyc
│   │   ├── model.py
│   │   ├── save_model
│   │   │   ├── 24model.pth
│   │   │   ├── 40model.pth
│   │   │   ├── 67model.pth
│   │   │   ├── 76model.pth
│   │   │   ├── 79model.pth
│   │   │   └── 86model.pth
│   │   ├── test.py
│   │   ├── tnpg.py
│   │   ├── train.py
│   │   └── utils
│   │       ├── __pycache__
│   │       │   ├── running_state.cpython-36.pyc
│   │       │   └── utils.cpython-36.pyc
│   │       ├── running_state.py
│   │       └── utils.py
│   ├── trpo
│   │   ├── __pycache__
│   │   │   ├── model.cpython-36.pyc
│   │   │   └── trpo.cpython-36.pyc
│   │   ├── model.py
│   │   ├── test.py
│   │   ├── train.py
│   │   ├── trpo.py
│   │   └── utils
│   │       ├── __pycache__
│   │       │   ├── running_state.cpython-36.pyc
│   │       │   └── utils.cpython-36.pyc
│   │       ├── running_state.py
│   │       └── utils.py
│   └── vail
│       ├── .DS_Store
│       ├── __pycache__
│       │   ├── hparams.cpython-36.pyc
│       │   ├── model.cpython-36.pyc
│       │   └── train_model.cpython-36.pyc
│       ├── expert_demo
│       │   └── expert_demo.p
│       ├── logs
│       │   └── .DS_Store
│       ├── main.py
│       ├── model.py
│       ├── save_model
│       │   └── .DS_Store
│       ├── test.py
│       ├── train_model.py
│       └── utils
│           ├── __pycache__
│           │   ├── running_state.cpython-36.pyc
│           │   ├── utils.cpython-36.pyc
│           │   └── zfilter.cpython-36.pyc
│           ├── utils.py
│           └── zfilter.py
└── pendulum
    ├── .DS_Store
    ├── ddpg
    │   ├── .DS_Store
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   ├── model.cpython-37.pyc
    │   │   ├── utils.cpython-36.pyc
    │   │   └── utils.cpython-37.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   ├── .DS_Store
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── ppo
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   └── utils.cpython-36.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── ppo_gae
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   └── utils.cpython-36.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── sac
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   └── utils.cpython-36.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── tnpg
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   ├── tnpg.cpython-36.pyc
    │   │   └── utils.cpython-36.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    ├── trpo
    │   ├── .DS_Store
    │   ├── __pycache__
    │   │   ├── model.cpython-36.pyc
    │   │   └── utils.cpython-36.pyc
    │   ├── model.py
    │   ├── save_model
    │   │   └── model.pth.tar
    │   ├── test.py
    │   ├── train.py
    │   └── utils.py
    └── trpo_gae
        ├── __pycache__
        │   ├── model.cpython-36.pyc
        │   └── utils.cpython-36.pyc
        ├── model.py
        ├── save_model
        │   └── model.pth.tar
        ├── test.py
        ├── train.py
        └── utils.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/.DS_Store
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Dongmin Lee
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement Learning Code with PyTorch
2 |
3 | ## Papers
4 |
5 | - [Deep Q-Network (DQN)](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
6 | - [Double DQN (DDQN)](https://arxiv.org/pdf/1509.06461.pdf)
7 | - [Advantage Actor-Critic (A2C)](http://incompleteideas.net/book/RLbook2018.pdf)
8 | - [Asynchronous Advantage Actor-Critic (A3C)](https://arxiv.org/pdf/1602.01783.pdf)
9 | - [Deep Deterministic Policy Gradient (DDPG)](https://arxiv.org/pdf/1509.02971.pdf)
10 | - [Truncated Natural Policy Gradient (TNPG)](https://papers.nips.cc/paper/2073-a-natural-policy-gradient.pdf)
11 | - [Trust Region Policy Optimization (TRPO)](https://arxiv.org/pdf/1502.05477.pdf)
12 | - [Generalized Advantage Estimator (GAE)](https://arxiv.org/pdf/1506.02438.pdf)
13 | - [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf)
14 | - [Soft Actor-Critic (SAC)](https://arxiv.org/pdf/1812.05905.pdf)
15 | - [Apprenticeship Learning via Inverse Reinforcement Learning (APP)](http://people.eecs.berkeley.edu/~russell/classes/cs294/s11/readings/Abbeel+Ng:2004.pdf)
16 | - [Maximum Entropy Inverse Reinforcement Learning (MaxEnt)](http://new.aaai.org/Papers/AAAI/2008/AAAI08-227.pdf)
17 | - [Generative Adversarial Imitation Learning (GAIL)](https://papers.nips.cc/paper/6391-generative-adversarial-imitation-learning.pdf)
18 | - [Variational Adversarial Imitation Learning (VAIL)](https://arxiv.org/pdf/1810.00821.pdf)
19 |
20 | ## Algorithms
21 |
22 | ### 01. Model-Free Reinforcement Learning
23 |
24 | #### Deep Q-Network (DQN)
25 |
26 | - [CartPole(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/dqn)
27 |
28 | #### Double DQN (DDQN)
29 |
30 | - [CartPole(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/ddqn)
31 |
32 | #### Advantage Actor-Critic (A2C)
33 |
34 | - [CartPole(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/cartpole/a2c)
35 |
36 | #### Asynchronous Advantage Actor-Critic (A3C)
37 |
38 | - [CartPole(Classic control)]()
39 |
40 | #### Deep Deterministic Policy Gradient (DDPG)
41 |
42 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ddpg)
43 |
44 | #### Truncated Natural Policy Gradient (TNPG)
45 |
46 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/tnpg)
47 | - [Hopper(MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/tnpg)
48 |
49 | #### Trust Region Policy Optimization (TRPO)
50 |
51 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/trpo)
52 |
53 | #### TRPO + Generalized Advantage Estimator (GAE)
54 |
55 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/trpo_gae)
56 | - [Hopper(MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/trpo)
57 |
58 | #### Proximal Policy Optimization (PPO)
59 |
60 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ppo)
61 |
62 | #### PPO + Generalized Advantage Estimator (GAE)
63 |
64 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/ppo_gae)
65 | - [Hopper(MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/ppo)
66 |
67 | #### Soft Actor-Critic (SAC)
68 |
69 | - [Pendulum(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/pendulum/sac)
70 | - [Hopper(MuJoCo)]()
71 |
72 | ---
73 |
74 | ### 02. Inverse Reinforcement Learning
75 |
76 | #### Apprenticeship Learning via Inverse Reinforcement Learning (APP)
77 |
78 | - [MountainCar(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mountaincar/app)
79 |
80 | #### Maximum Entropy Inverse Reinforcement Learning (MaxEnt)
81 |
82 | - [MountainCar(Classic control)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mountaincar/maxent)
83 |
84 | #### Generative Adversarial Imitation Learning (GAIL)
85 |
86 | - [Hopper(MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/gail)
87 |
88 | #### Variational Adversarial Imitation Learning (VAIL)
89 |
90 | - [Hopper(MuJoCo)](https://github.com/dongminleeai/Reinforcement-Learning-Code/tree/master/mujoco/vail)
91 |
92 | ---
93 |
94 | ## Learning curve
95 |
96 | ### CartPole
97 |
98 |
99 |
100 | ### Pendulum
101 |
102 |
103 |
104 | ### Hopper
105 |
106 | ---
107 |
108 | ## Reference
109 |
110 | - [Minimal and Clean Reinforcement Learning Examples in PyTorch](https://github.com/reinforcement-learning-kr/reinforcement-learning-pytorch)
111 | - [Pytorch implementation for Policy Gradient algorithms (REINFORCE, NPG, TRPO, PPO)](https://github.com/reinforcement-learning-kr/pg_travel)
112 | - [Implementation of APP](https://github.com/jangirrishabh/toyCarIRL)
113 | - [Implementation of MaxEnt](https://github.com/MatthewJA/Inverse-Reinforcement-Learning)
114 | - [Pytorch implementation of GAIL](https://github.com/Khrylx/PyTorch-RL)
115 | - [Pytorch implementation of SAC1](https://github.com/vitchyr/rlkit/tree/master/rlkit/torch/sac)
116 | - [Pytorch implementation of SAC2](https://github.com/pranz24/pytorch-soft-actor-critic)
117 |
--------------------------------------------------------------------------------
/cartpole/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/.DS_Store
--------------------------------------------------------------------------------
/cartpole/a2c/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/.DS_Store
--------------------------------------------------------------------------------
/cartpole/a2c/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/cartpole/a2c/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 | policies = torch.softmax(self.fc3(x), dim=1)
15 |
16 | return policies
17 |
18 | class Critic(nn.Module):
19 | def __init__(self, state_size, args):
20 | super(Critic, self).__init__()
21 | self.fc1 = nn.Linear(state_size, args.hidden_size)
22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
23 | self.fc3 = nn.Linear(args.hidden_size, 1)
24 |
25 | def forward(self, x):
26 | x = torch.tanh(self.fc1(x))
27 | x = torch.tanh(self.fc2(x))
28 | value = self.fc3(x)
29 |
30 | return value
--------------------------------------------------------------------------------
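
A minimal usage sketch for these two modules (not part of the repository; the real entry points are the train.py and test.py files below). The args object only needs a hidden_size attribute, and the sizes assume CartPole-v1 (4-dimensional observation, 2 discrete actions):

    import torch
    from types import SimpleNamespace
    from model import Actor, Critic        # assumes the working directory is cartpole/a2c

    args = SimpleNamespace(hidden_size=64)  # only args.hidden_size is read by the modules
    actor = Actor(state_size=4, action_size=2, args=args)
    critic = Critic(state_size=4, args=args)

    state = torch.zeros(1, 4)               # one observation as a (1, 4) batch
    policies = actor(state)                 # action probabilities, shape (1, 2)
    value = critic(state)                   # state-value estimate, shape (1, 1)
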
/cartpole/a2c/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/a2c/save_model/model.pth.tar
--------------------------------------------------------------------------------
/cartpole/a2c/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from model import Actor, Critic
9 | from torch.distributions import Categorical
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
13 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
14 | parser.add_argument('--render', action="store_true", default=True)
15 | parser.add_argument('--hidden_size', type=int, default=64)
16 | parser.add_argument('--iter', type=int, default=10000)
17 | parser.add_argument('--log_interval', type=int, default=10)
18 | args = parser.parse_args()
19 |
20 | def get_action(policies):
21 | m = Categorical(policies)
22 | action = m.sample()
23 | action = action.data.numpy()[0]
24 | return action
25 |
26 | if __name__=="__main__":
27 | env = gym.make(args.env_name)
28 | env.seed(500)
29 | torch.manual_seed(500)
30 |
31 | state_size = env.observation_space.shape[0]
32 | action_size = env.action_space.n
33 | print('state size:', state_size)
34 | print('action size:', action_size)
35 |
36 | actor = Actor(state_size, action_size, args)
37 |
38 | if args.load_model is not None:
39 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
40 | pretrained_model = torch.load(pretrained_model_path)
41 | actor.load_state_dict(pretrained_model)
42 |
43 | steps = 0
44 |
45 | for episode in range(args.iter):
46 | done = False
47 | score = 0
48 |
49 | state = env.reset()
50 | state = np.reshape(state, [1, state_size])
51 |
52 | while not done:
53 | if args.render:
54 | env.render()
55 |
56 | steps += 1
57 | policies = actor(torch.Tensor(state))
58 | action = get_action(policies)
59 |
60 | next_state, reward, done, _ = env.step(action)
61 |
62 | next_state = np.reshape(next_state, [1, state_size])
63 | reward = reward if not done or score == 499 else -1
64 |
65 | state = next_state
66 | score += reward
67 |
68 | if episode % args.log_interval == 0:
69 | print('{} episode | score: {:.2f}'.format(episode, score))
--------------------------------------------------------------------------------
/cartpole/a2c/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | import torch.optim as optim
8 | from torch.distributions import Categorical
9 |
10 | from model import Actor, Critic
11 | from tensorboardX import SummaryWriter
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
15 | parser.add_argument('--load_model', type=str, default=None)
16 | parser.add_argument('--save_path', default='./save_model/', help='')
17 | parser.add_argument('--render', action="store_true", default=False)
18 | parser.add_argument('--gamma', type=float, default=0.99)
19 | parser.add_argument('--hidden_size', type=int, default=64)
20 | parser.add_argument('--actor_lr', type=float, default=1e-4)
21 | parser.add_argument('--critic_lr', type=float, default=1e-3)
22 | parser.add_argument('--ent_coef', type=float, default=0.1)
23 | parser.add_argument('--max_iter_num', type=int, default=1000)
24 | parser.add_argument('--log_interval', type=int, default=10)
25 | parser.add_argument('--goal_score', type=int, default=400)
26 | parser.add_argument('--logdir', type=str, default='./logs',
27 | help='tensorboardx logs directory')
28 | args = parser.parse_args()
29 |
30 | def train_model(actor, critic, actor_optimizer, critic_optimizer, transition, policies):
31 | state, action, reward, next_state, mask = transition
32 |
33 | # update critic
34 | criterion = torch.nn.MSELoss()
35 |
36 | value = critic(torch.Tensor(state)).squeeze(1)
37 |
38 | next_value = critic(torch.Tensor(next_state)).squeeze(1)
39 | target = reward + mask * args.gamma * next_value
40 |
41 | critic_loss = criterion(value, target.detach())
42 | critic_optimizer.zero_grad()
43 | critic_loss.backward()
44 | critic_optimizer.step()
45 |
46 | # update actor
47 | categorical = Categorical(policies)
48 | log_policy = categorical.log_prob(torch.Tensor([action]))
49 | entropy = categorical.entropy()
50 |
51 | advantage = target - value
52 |
53 | actor_loss = -log_policy * advantage.item() + args.ent_coef * entropy
54 | actor_optimizer.zero_grad()
55 | actor_loss.backward()
56 | actor_optimizer.step()
57 |
58 | def get_action(policies):
59 | categorical = Categorical(policies)
60 | action = categorical.sample()
61 | action = action.data.numpy()[0]
62 |
63 | return action
64 |
65 |
66 | def main():
67 | env = gym.make(args.env_name)
68 | env.seed(500)
69 | torch.manual_seed(500)
70 |
71 | state_size = env.observation_space.shape[0]
72 | action_size = env.action_space.n
73 | print('state size:', state_size)
74 | print('action size:', action_size)
75 |
76 | actor = Actor(state_size, action_size, args)
77 | critic = Critic(state_size, args)
78 |
79 | actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
80 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)
81 |
82 | writer = SummaryWriter(args.logdir)
83 |
84 | running_score = 0
85 |
86 | for episode in range(args.max_iter_num):
87 | done = False
88 | score = 0
89 |
90 | state = env.reset()
91 | state = np.reshape(state, [1, state_size])
92 |
93 | while not done:
94 | if args.render:
95 | env.render()
96 |
97 | policies = actor(torch.Tensor(state))
98 | action = get_action(policies)
99 |
100 | next_state, reward, done, _ = env.step(action)
101 |
102 | next_state = np.reshape(next_state, [1, state_size])
103 | reward = reward if not done or score == 499 else -1
104 | mask = 0 if done else 1
105 |
106 | transition = [state, action, reward, next_state, mask]
107 |
108 | actor.train(), critic.train()
109 | train_model(actor, critic, actor_optimizer, critic_optimizer,
110 | transition, policies)
111 |
112 | state = next_state
113 | score += reward
114 |
115 | score = score if score == 500.0 else score + 1
116 | running_score = 0.99 * running_score + 0.01 * score
117 |
118 | if episode % args.log_interval == 0:
119 | print('{} episode | running_score: {:.2f}'.format(episode, running_score))
120 | writer.add_scalar('log/score', float(score), episode)
121 |
122 | if running_score > args.goal_score:
123 | if not os.path.isdir(args.save_path):
124 | os.makedirs(args.save_path)
125 |
126 | ckpt_path = args.save_path + 'model.pth.tar'
127 | torch.save(actor.state_dict(), ckpt_path)
128 | print('Running score exceeds the goal score. Stopping training.')
129 | break
130 |
131 | if __name__=="__main__":
132 | main()
133 |
--------------------------------------------------------------------------------
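
Written out, the one-step actor-critic update implemented in train_model above is (a summary of the code, not additional repository text):

    y        = r + gamma * mask * V(s')     # mask = 0 at episode end
    A        = y - V(s)
    L_critic = (V(s) - y)^2                  # y is detached, so the critic regresses toward a fixed target
    L_actor  = -log pi(a|s) * A + ent_coef * H(pi(.|s))

The advantage enters the actor loss as a plain number (advantage.item()), so no gradient flows into the critic through the actor update.
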
/cartpole/ddqn/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/.DS_Store
--------------------------------------------------------------------------------
/cartpole/ddqn/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/cartpole/ddqn/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class QNet(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(QNet, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, action_size)
9 |
10 | def forward(self, x):
11 | x = torch.tanh(self.fc1(x))
12 | q_values = self.fc2(x)
13 | return q_values
--------------------------------------------------------------------------------
/cartpole/ddqn/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/ddqn/save_model/model.pth.tar
--------------------------------------------------------------------------------
/cartpole/ddqn/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from model import QNet
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.n
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | q_net = QNet(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | q_net.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | q_value = q_net(torch.Tensor(state))
52 | _, action = torch.max(q_value, 1)
53 | action = action.numpy()[0]
54 |
55 | next_state, reward, done, _ = env.step(action)
56 |
57 | next_state = np.reshape(next_state, [1, state_size])
58 | reward = reward if not done or score == 499 else -1
59 |
60 | score += reward
61 | state = next_state
62 |
63 | if episode % args.log_interval == 0:
64 | print('{} episode | score: {:.2f}'.format(episode, score))
--------------------------------------------------------------------------------
/cartpole/ddqn/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 | from collections import deque
7 |
8 | import torch
9 | import torch.optim as optim
10 |
11 | from model import QNet
12 | from tensorboardX import SummaryWriter
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
16 | parser.add_argument('--load_model', type=str, default=None)
17 | parser.add_argument('--save_path', default='./save_model/', help='')
18 | parser.add_argument('--render', action="store_true", default=False)
19 | parser.add_argument('--gamma', type=float, default=0.99)
20 | parser.add_argument('--hidden_size', type=int, default=64)
21 | parser.add_argument('--batch_size', type=int, default=32)
22 | parser.add_argument('--initial_exploration', type=int, default=1000)
23 | parser.add_argument('--epsilon', type=float, default=1.0)
24 | parser.add_argument('--epsilon_decay', type=float, default=0.00005)
25 | parser.add_argument('--update_target', type=int, default=100)
26 | parser.add_argument('--max_iter_num', type=int, default=1000)
27 | parser.add_argument('--log_interval', type=int, default=10)
28 | parser.add_argument('--goal_score', type=int, default=400)
29 | parser.add_argument('--logdir', type=str, default='./logs',
30 | help='tensorboardx logs directory')
31 | args = parser.parse_args()
32 |
33 | def train_model(q_net, target_q_net, optimizer, mini_batch):
34 | mini_batch = np.array(mini_batch)
35 | states = np.vstack(mini_batch[:, 0])
36 | actions = list(mini_batch[:, 1])
37 | rewards = list(mini_batch[:, 2])
38 | next_states = np.vstack(mini_batch[:, 3])
39 | masks = list(mini_batch[:, 4])
40 |
41 | actions = torch.LongTensor(actions)
42 | rewards = torch.Tensor(rewards)
43 | masks = torch.Tensor(masks)
44 |
45 | criterion = torch.nn.MSELoss()
46 |
47 | # get Q-value
48 | q_values = q_net(torch.Tensor(states))
49 | q_value = q_values.gather(1, actions.unsqueeze(1)).view(-1)
50 |
51 | # get target
52 | next_q_values = q_net(torch.Tensor(next_states))
53 | next_q_value_index = next_q_values.max(1)[1]
54 |
55 | target_next_q_values = target_q_net(torch.Tensor(next_states))
56 | target_next_q_value = target_next_q_values.gather(1, next_q_value_index.unsqueeze(1)).view(-1)
57 | target = rewards + masks * args.gamma * target_next_q_value
58 |
59 | loss = criterion(q_value, target.detach())
60 | optimizer.zero_grad()
61 | loss.backward()
62 | optimizer.step()
63 |
64 | def get_action(q_values, action_size, epsilon):
65 | if np.random.rand() <= epsilon:
66 | return random.randrange(action_size)
67 | else:
68 | _, action = torch.max(q_values, 1)
69 | return action.numpy()[0]
70 |
71 | def update_target_model(net, target_q_net):
72 | target_q_net.load_state_dict(net.state_dict())
73 |
74 |
75 | def main():
76 | env = gym.make(args.env_name)
77 | env.seed(500)
78 | torch.manual_seed(500)
79 |
80 | state_size = env.observation_space.shape[0]
81 | action_size = env.action_space.n
82 | print('state size:', state_size)
83 | print('action size:', action_size)
84 |
85 | q_net = QNet(state_size, action_size, args)
86 | target_q_net = QNet(state_size, action_size, args)
87 | optimizer = optim.Adam(q_net.parameters(), lr=0.001)
88 |
89 | update_target_model(q_net, target_q_net)
90 |
91 | writer = SummaryWriter(args.logdir)
92 |
93 | replay_buffer = deque(maxlen=10000)
94 | running_score = 0
95 | steps = 0
96 |
97 | for episode in range(args.max_iter_num):
98 | done = False
99 | score = 0
100 |
101 | state = env.reset()
102 | state = np.reshape(state, [1, state_size])
103 |
104 | while not done:
105 | if args.render:
106 | env.render()
107 |
108 | steps += 1
109 |
110 | q_values = q_net(torch.Tensor(state))
111 | action = get_action(q_values, action_size, args.epsilon)
112 |
113 | next_state, reward, done, _ = env.step(action)
114 |
115 | next_state = np.reshape(next_state, [1, state_size])
116 | reward = reward if not done or score == 499 else -1
117 | mask = 0 if done else 1
118 |
119 | replay_buffer.append((state, action, reward, next_state, mask))
120 |
121 | state = next_state
122 | score += reward
123 |
124 | if steps > args.initial_exploration:
125 | args.epsilon -= args.epsilon_decay
126 | args.epsilon = max(args.epsilon, 0.1)
127 |
128 | mini_batch = random.sample(replay_buffer, args.batch_size)
129 |
130 | q_net.train(), target_q_net.train()
131 | train_model(q_net, target_q_net, optimizer, mini_batch)
132 |
133 | if steps % args.update_target == 0:
134 | update_target_model(q_net, target_q_net)
135 |
136 | score = score if score == 500.0 else score + 1
137 | running_score = 0.99 * running_score + 0.01 * score
138 |
139 | if episode % args.log_interval == 0:
140 | print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
141 | episode, running_score, args.epsilon))
142 | writer.add_scalar('log/score', float(score), episode)
143 |
144 | if running_score > args.goal_score:
145 | if not os.path.isdir(args.save_path):
146 | os.makedirs(args.save_path)
147 |
148 | ckpt_path = args.save_path + 'model.pth.tar'
149 | torch.save(q_net.state_dict(), ckpt_path)
150 | print('Running score exceeds the goal score. Stopping training.')
151 | break
152 |
153 | if __name__ == '__main__':
154 | main()
--------------------------------------------------------------------------------
/cartpole/dqn/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/cartpole/dqn/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/cartpole/dqn/logs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/logs/.DS_Store
--------------------------------------------------------------------------------
/cartpole/dqn/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class QNet(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(QNet, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, action_size)
9 |
10 | def forward(self, x):
11 | x = torch.tanh(self.fc1(x))
12 | q_values = self.fc2(x)
13 | return q_values
--------------------------------------------------------------------------------
/cartpole/dqn/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/cartpole/dqn/save_model/model.pth.tar
--------------------------------------------------------------------------------
/cartpole/dqn/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from model import QNet
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.n
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | q_net = QNet(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | q_net.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | q_value = q_net(torch.Tensor(state))
52 | _, action = torch.max(q_value, 1)
53 | action = action.numpy()[0]
54 |
55 | next_state, reward, done, _ = env.step(action)
56 |
57 | next_state = np.reshape(next_state, [1, state_size])
58 | reward = reward if not done or score == 499 else -1
59 |
60 | score += reward
61 | state = next_state
62 |
63 | if episode % args.log_interval == 0:
64 | print('{} episode | score: {:.2f}'.format(episode, score))
--------------------------------------------------------------------------------
/cartpole/dqn/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 | from collections import deque
7 |
8 | import torch
9 | import torch.optim as optim
10 |
11 | from model import QNet
12 | from tensorboardX import SummaryWriter
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--env_name', type=str, default="CartPole-v1")
16 | parser.add_argument('--load_model', type=str, default=None)
17 | parser.add_argument('--save_path', default='./save_model/', help='')
18 | parser.add_argument('--render', action="store_true", default=False)
19 | parser.add_argument('--gamma', type=float, default=0.99)
20 | parser.add_argument('--hidden_size', type=int, default=64)
21 | parser.add_argument('--batch_size', type=int, default=32)
22 | parser.add_argument('--initial_exploration', type=int, default=1000)
23 | parser.add_argument('--epsilon', type=float, default=1.0)
24 | parser.add_argument('--epsilon_decay', type=float, default=0.00005)
25 | parser.add_argument('--update_target', type=int, default=100)
26 | parser.add_argument('--max_iter_num', type=int, default=1000)
27 | parser.add_argument('--log_interval', type=int, default=10)
28 | parser.add_argument('--goal_score', type=int, default=400)
29 | parser.add_argument('--logdir', type=str, default='./logs',
30 | help='tensorboardx logs directory')
31 | args = parser.parse_args()
32 |
33 | def train_model(q_net, target_q_net, optimizer, mini_batch):
34 | mini_batch = np.array(mini_batch)
35 | states = np.vstack(mini_batch[:, 0])
36 | actions = list(mini_batch[:, 1])
37 | rewards = list(mini_batch[:, 2])
38 | next_states = np.vstack(mini_batch[:, 3])
39 | masks = list(mini_batch[:, 4])
40 |
41 | actions = torch.LongTensor(actions)
42 | rewards = torch.Tensor(rewards)
43 | masks = torch.Tensor(masks)
44 |
45 | criterion = torch.nn.MSELoss()
46 |
47 | # get Q-value
48 | q_values = q_net(torch.Tensor(states))
49 | q_value = q_values.gather(1, actions.unsqueeze(1)).view(-1)
50 |
51 | # get target
52 | target_next_q_values = target_q_net(torch.Tensor(next_states))
53 | target = rewards + masks * args.gamma * target_next_q_values.max(1)[0]
54 |
55 | loss = criterion(q_value, target.detach())
56 | optimizer.zero_grad()
57 | loss.backward()
58 | optimizer.step()
59 |
60 | def get_action(q_values, action_size, epsilon):
61 | if np.random.rand() <= epsilon:
62 | return random.randrange(action_size)
63 | else:
64 | _, action = torch.max(q_values, 1)
65 | return action.numpy()[0]
66 |
67 | def update_target_model(q_net, target_q_net):
68 | target_q_net.load_state_dict(q_net.state_dict())
69 |
70 |
71 | def main():
72 | env = gym.make(args.env_name)
73 | env.seed(500)
74 | torch.manual_seed(500)
75 |
76 | state_size = env.observation_space.shape[0]
77 | action_size = env.action_space.n
78 | print('state size:', state_size)
79 | print('action size:', action_size)
80 |
81 | q_net = QNet(state_size, action_size, args)
82 | target_q_net = QNet(state_size, action_size, args)
83 | optimizer = optim.Adam(q_net.parameters(), lr=0.001)
84 |
85 | update_target_model(q_net, target_q_net)
86 |
87 | writer = SummaryWriter(args.logdir)
88 |
89 | replay_buffer = deque(maxlen=10000)
90 | running_score = 0
91 | steps = 0
92 |
93 | for episode in range(args.max_iter_num):
94 | done = False
95 | score = 0
96 |
97 | state = env.reset()
98 | state = np.reshape(state, [1, state_size])
99 |
100 | while not done:
101 | if args.render:
102 | env.render()
103 |
104 | steps += 1
105 |
106 | q_values = q_net(torch.Tensor(state))
107 | action = get_action(q_values, action_size, args.epsilon)
108 |
109 | next_state, reward, done, _ = env.step(action)
110 |
111 | next_state = np.reshape(next_state, [1, state_size])
112 | reward = reward if not done or score == 499 else -1
113 | mask = 0 if done else 1
114 |
115 | replay_buffer.append((state, action, reward, next_state, mask))
116 |
117 | state = next_state
118 | score += reward
119 |
120 | if steps > args.initial_exploration:
121 | args.epsilon -= args.epsilon_decay
122 | args.epsilon = max(args.epsilon, 0.1)
123 |
124 | mini_batch = random.sample(replay_buffer, args.batch_size)
125 |
126 | q_net.train(), target_q_net.train()
127 | train_model(q_net, target_q_net, optimizer, mini_batch)
128 |
129 | if steps % args.update_target == 0:
130 | update_target_model(q_net, target_q_net)
131 |
132 | score = score if score == 500.0 else score + 1
133 | running_score = 0.99 * running_score + 0.01 * score
134 |
135 | if episode % args.log_interval == 0:
136 | print('{} episode | running_score: {:.2f} | epsilon: {:.2f}'.format(
137 | episode, running_score, args.epsilon))
138 | writer.add_scalar('log/score', float(score), episode)
139 |
140 | if running_score > args.goal_score:
141 | if not os.path.isdir(args.save_path):
142 | os.makedirs(args.save_path)
143 |
144 | ckpt_path = args.save_path + 'model.pth.tar'
145 | torch.save(q_net.state_dict(), ckpt_path)
146 | print('Running score exceeds the goal score. Stopping training.')
147 | break
148 |
149 | if __name__ == '__main__':
150 | main()
--------------------------------------------------------------------------------
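
Comparing the two value-based CartPole trainers above: the only substantive difference between dqn/train.py and ddqn/train.py is how the bootstrap target is built. A side-by-side sketch paraphrasing the repository's own train_model functions (tensor conversions omitted):

    # DQN: the target network both selects and evaluates the next action
    target = rewards + masks * gamma * target_q_net(next_states).max(1)[0]

    # DDQN: the online network selects the action, the target network evaluates it,
    # which is the double Q-learning trick for reducing overestimation bias
    best_actions = q_net(next_states).max(1)[1]
    target = rewards + masks * gamma * target_q_net(next_states).gather(1, best_actions.unsqueeze(1)).view(-1)
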
/img/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/.DS_Store
--------------------------------------------------------------------------------
/img/cartpole.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/cartpole.png
--------------------------------------------------------------------------------
/img/pendulum.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/img/pendulum.png
--------------------------------------------------------------------------------
/mountaincar/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/app/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/app/__pycache__/app.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/__pycache__/app.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/app/__pycache__/train.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/__pycache__/train.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/app/app.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cvxpy as cp
3 | from train import idx_state
4 |
5 | class FeatureEstimate:
6 | def __init__(self, feature_num, env):
7 | self.env = env
8 | self.feature_num = feature_num
9 | self.feature = np.ones(self.feature_num)
10 |
11 | def gaussian_function(self, x, mu):
12 | return np.exp(-np.power(x - mu, 2.) / (2 * np.power(1., 2.)))
13 |
14 | def get_features(self, state):
15 | env_low = self.env.observation_space.low
16 | env_high = self.env.observation_space.high
17 | env_distance = (env_high - env_low) / (self.feature_num - 1)
18 |
19 | for i in range(int(self.feature_num/2)):
20 | # position
21 | self.feature[i] = self.gaussian_function(state[0],
22 | env_low[0] + i * env_distance[0])
23 | # velocity
24 | self.feature[i+int(self.feature_num/2)] = self.gaussian_function(state[1],
25 | env_low[1] + i * env_distance[1])
26 |
27 | return self.feature
28 |
29 |
30 | def calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env):
31 | feature_estimate = FeatureEstimate(feature_num, env)
32 | feature_expectations = np.zeros(feature_num)
33 | demo_num = len(demonstrations)
34 |
35 | for _ in range(demo_num):
36 | state = env.reset()
37 | demo_length = 0
38 | done = False
39 |
40 | while not done:
41 | demo_length += 1
42 |
43 | state_idx = idx_state(env, state)
44 | action = np.argmax(q_table[state_idx])
45 | next_state, reward, done, _ = env.step(action)
46 |
47 | features = feature_estimate.get_features(next_state)
48 | feature_expectations += (gamma**(demo_length)) * np.array(features)
49 |
50 | state = next_state
51 |
52 | feature_expectations = feature_expectations/ demo_num
53 |
54 | return feature_expectations
55 |
56 | def expert_feature_expectation(feature_num, gamma, demonstrations, env):
57 | feature_estimate = FeatureEstimate(feature_num, env)
58 | feature_expectations = np.zeros(feature_num)
59 |
60 | for demo_num in range(len(demonstrations)):
61 | for demo_length in range(len(demonstrations[0])):
62 | state = demonstrations[demo_num][demo_length]
63 | features = feature_estimate.get_features(state)
64 | feature_expectations += (gamma**(demo_length)) * np.array(features)
65 |
66 | feature_expectations = feature_expectations / len(demonstrations)
67 |
68 | return feature_expectations
69 |
70 | def QP_optimizer(feature_num, learner, expert):
71 | w = cp.Variable(feature_num)
72 |
73 | obj_func = cp.Minimize(cp.norm(w))
74 | constraints = [(expert-learner) * w >= 2]
75 |
76 | prob = cp.Problem(obj_func, constraints)
77 | prob.solve()
78 |
79 | if prob.status == "optimal":
80 | print("status:", prob.status)
81 | print("optimal value", prob.value)
82 |
83 | weights = np.squeeze(np.asarray(w.value))
84 | return weights, prob.status
85 | else:
86 | print("status:", prob.status)
87 |
88 | weights = np.zeros(feature_num)
89 | return weights, prob.status
90 |
91 |
92 | def add_feature_expectation(learner, temp_learner):
93 | # save new feature expectation to list after RL step
94 | learner = np.vstack([learner, temp_learner])
95 | return learner
96 |
97 | def subtract_feature_expectation(learner):
98 | # if status is infeasible, subtract first feature expectation
99 | learner = learner[1:][:]
100 | return learner
--------------------------------------------------------------------------------
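
In apprenticeship-learning terms, QP_optimizer above solves a max-margin problem over the reward weights w (a summary of the code; the margin of 2 is hard-coded in the constraint): given the expert feature expectation mu_E and the learner feature expectations mu_j accumulated so far,

    minimize    ||w||
    subject to  w . (mu_E - mu_j) >= 2   for every learner feature expectation mu_j

The recovered w then defines the IRL reward used in train.py as r(s) = w . phi(s), where phi(s) are the Gaussian radial features produced by FeatureEstimate.get_features.
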
/mountaincar/app/expert_demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/expert_demo/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/app/expert_demo/expert_demo.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/expert_demo/expert_demo.npy
--------------------------------------------------------------------------------
/mountaincar/app/expert_demo/make_expert.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import readchar
3 | import numpy as np
4 |
5 | # MACROS
6 | Push_Left = 0
7 | No_Push = 1
8 | Push_Right = 2
9 |
10 | # Key mapping
11 | arrow_keys = {
12 | '\x1b[D': Push_Left,
13 | '\x1b[B': No_Push,
14 | '\x1b[C': Push_Right}
15 |
16 | env = gym.make('MountainCar-v0')
17 |
18 | trajectories = []
19 | episode_step = 0
20 |
21 | for episode in range(20): # n_trajectories : 20
22 | trajectory = []
23 | step = 0
24 |
25 | env.reset()
26 | print("episode_step", episode_step)
27 |
28 | while True:
29 | env.render()
30 | print("step", step)
31 |
32 | key = readchar.readkey()
33 | if key not in arrow_keys.keys():
34 | break
35 |
36 | action = arrow_keys[key]
37 | state, reward, done, _ = env.step(action)
38 |
39 | if state[0] >= env.env.goal_position and step > 129: # trajectory_length : 130
40 | break
41 |
42 | trajectory.append((state[0], state[1], action))
43 | step += 1
44 |
45 | trajectory_numpy = np.array(trajectory, float)
46 | print("trajectory_numpy.shape", trajectory_numpy.shape)
47 | episode_step += 1
48 | trajectories.append(trajectory)
49 |
50 | np_trajectories = np.array(trajectories, float)
51 | print("np_trajectories.shape", np_trajectories.shape)
52 |
53 | np.save("expert_demo", arr=np_trajectories)
--------------------------------------------------------------------------------
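
A note on the file this script produces (an observation about the code above, not repository documentation): expert_demo.npy holds 20 keyboard-played MountainCar trajectories, each a sequence of (position, velocity, action) triples, which is why expert_feature_expectation in app.py can index it as demonstrations[episode][step] and read the first two entries of each triple as a state. A quick way to inspect it:

    import numpy as np
    demos = np.load("expert_demo.npy")      # roughly (20, steps_per_episode, 3)
    position, velocity, action = demos[0][0]
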
/mountaincar/app/learning_curves/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/learning_curves/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/app/learning_curves/app_eps_60000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/learning_curves/app_eps_60000.png
--------------------------------------------------------------------------------
/mountaincar/app/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/app/results/app_q_table.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/app_q_table.npy
--------------------------------------------------------------------------------
/mountaincar/app/results/test_rendering_60000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/app/results/test_rendering_60000.gif
--------------------------------------------------------------------------------
/mountaincar/app/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym
3 | import random
4 | import sys
5 | import cvxpy as cp
6 |
7 | N_idx = 20
8 | F_idx = 4
9 | GAMMA = 0.99
10 |
11 | def idx_to_state(env, state):
12 | env_low = env.observation_space.low
13 | env_high = env.observation_space.high
14 | env_distance = (env_high - env_low) / N_idx
15 | position_idx = int((state[0] - env_low[0]) / env_distance[0])
16 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
17 | state_idx = position_idx + velocity_idx * N_idx
18 | return state_idx
19 |
20 |
21 | if __name__ == '__main__':
22 | print(":: Testing APP-learning.\n")
23 |
24 | # Load the agent
25 | n_states = N_idx**2 # position - 20, velocity - 20
26 | n_actions = 3
27 | q_table = np.load(file="results/app_q_table.npy")
28 |
29 | # Create a new game instance.
30 | env = gym.make('MountainCar-v0')
31 | n_episode = 10 # test the agent 10 times
32 | scores = []
33 |
34 | for ep in range(n_episode):
35 | state = env.reset()
36 | score = 0
37 |
38 | while True:
39 | # Render the play
40 | env.render()
41 |
42 | state_idx = idx_to_state(env, state)
43 |
44 | action = np.argmax(q_table[state_idx])
45 |
46 | next_state, reward, done, _ = env.step(action)
47 | next_state_idx = idx_to_state(env, next_state)
48 |
49 | score += reward
50 | state = next_state
51 |
52 | if done:
53 | print('{} episode | score: {:.1f}'.format(ep + 1, score))
54 |
55 | break
56 |
57 | env.close()
58 | sys.exit()
--------------------------------------------------------------------------------
/mountaincar/app/train.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import gym
3 | import pylab
4 | import numpy as np
5 |
6 | from app import *
7 |
8 | n_states = 400 # position - 20, velocity - 20
9 | n_actions = 3
10 | one_feature = 20 # number of discrete values per state feature (position, velocity)
11 | feature_num = 4
12 | q_table = np.zeros((n_states, n_actions)) # (400, 3)
13 |
14 | gamma = 0.99
15 | q_learning_rate = 0.03
16 |
17 | def idx_state(env, state):
18 | env_low = env.observation_space.low
19 | env_high = env.observation_space.high
20 | env_distance = (env_high - env_low) / one_feature
21 | position_idx = int((state[0] - env_low[0]) / env_distance[0])
22 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
23 | state_idx = position_idx + velocity_idx * one_feature
24 | return state_idx
25 |
26 | def update_q_table(state, action, reward, next_state):
27 | q_1 = q_table[state][action]
28 | q_2 = reward + gamma * max(q_table[next_state])
29 | q_table[state][action] += q_learning_rate * (q_2 - q_1)
30 |
31 |
32 | def main():
33 | env = gym.make('MountainCar-v0')
34 | demonstrations = np.load(file="expert_demo/expert_demo.npy")
35 |
36 | feature_estimate = FeatureEstimate(feature_num, env)
37 |
38 | learner = calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env)
39 | learner = np.matrix([learner])
40 |
41 | expert = expert_feature_expectation(feature_num, gamma, demonstrations, env)
42 | expert = np.matrix([expert])
43 |
44 | w, status = QP_optimizer(feature_num, learner, expert)
45 |
46 |
47 | episodes, scores = [], []
48 |
49 | for episode in range(60000):
50 | state = env.reset()
51 | score = 0
52 |
53 | while True:
54 | state_idx = idx_state(env, state)
55 | action = np.argmax(q_table[state_idx])
56 | next_state, reward, done, _ = env.step(action)
57 |
58 | features = feature_estimate.get_features(state)
59 | irl_reward = np.dot(w, features)
60 |
61 | next_state_idx = idx_state(env, next_state)
62 | update_q_table(state_idx, action, irl_reward, next_state_idx)
63 |
64 | score += reward
65 | state = next_state
66 |
67 | if done:
68 | scores.append(score)
69 | episodes.append(episode)
70 | break
71 |
72 | if episode % 1000 == 0:
73 | score_avg = np.mean(scores)
74 | print('{} episode score is {:.2f}'.format(episode, score_avg))
75 | # pylab.plot(episodes, scores, 'b')
76 | # pylab.savefig("./learning_curves/app_eps_60000.png")
77 | # np.save("./results/app_q_table", arr=q_table)
78 |
79 | if episode % 5000 == 0:
80 | # optimize weight per 5000 episode
81 | status = "infeasible"
82 | temp_learner = calc_feature_expectation(feature_num, gamma, q_table, demonstrations, env)
83 | learner = add_feature_expectation(learner, temp_learner)
84 |
85 | while status=="infeasible":
86 | w, status = QP_optimizer(feature_num, learner, expert)
87 | if status=="infeasible":
88 | learner = subtract_feature_expectation(learner)
89 |
90 | if __name__ == '__main__':
91 | main()
--------------------------------------------------------------------------------
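
The tabular update in update_q_table above, written out (a summary of the code): with learning rate alpha = q_learning_rate = 0.03 and the IRL reward standing in for the environment reward,

    Q(s, a) <- Q(s, a) + alpha * ( w . phi(s) + gamma * max_a' Q(s', a') - Q(s, a) )

where s and s' are the discretised (position, velocity) indices from idx_state, and w is re-fit by QP_optimizer every 5000 episodes from the updated learner feature expectations.
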
/mountaincar/ddpg/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/ddpg/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/ddpg/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.relu(self.fc1(x))
13 | x = torch.relu(self.fc2(x))
14 | policy = self.fc3(x)
15 |
16 | return policy
17 |
18 | class Critic(nn.Module):
19 | def __init__(self, state_size, action_size, args):
20 | super(Critic, self).__init__()
21 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size)
22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
23 | self.fc3 = nn.Linear(args.hidden_size, 1)
24 |
25 | def forward(self, states, actions):
26 | x = torch.cat([states, actions], dim=1)
27 | x = torch.relu(self.fc1(x))
28 | x = torch.relu(self.fc2(x))
29 | q_value = self.fc3(x)
30 |
31 | return q_value
--------------------------------------------------------------------------------
/mountaincar/ddpg/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/ddpg/save_model/model.pth.tar
--------------------------------------------------------------------------------
/mountaincar/ddpg/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from utils import *
9 | from model import Actor, Critic
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--env_name', type=str, default="MountainCarContinuous-v0")
13 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
14 | parser.add_argument('--render', action="store_true", default=True)
15 | parser.add_argument('--hidden_size', type=int, default=64)
16 | parser.add_argument('--theta', type=float, default=0.15)
17 | parser.add_argument('--mu', type=float, default=0.0)
18 | parser.add_argument('--sigma', type=float, default=0.2)
19 | parser.add_argument('--iter', type=int, default=10000)
20 | parser.add_argument('--log_interval', type=int, default=10)
21 | args = parser.parse_args()
22 |
23 | if __name__=="__main__":
24 | env = gym.make(args.env_name)
25 | env.seed(500)
26 | torch.manual_seed(500)
27 |
28 | state_size = env.observation_space.shape[0]
29 | action_size = env.action_space.shape[0]
30 | print('state size:', state_size)
31 | print('action size:', action_size)
32 |
33 | actor = Actor(state_size, action_size, args)
34 |
35 | if args.load_model is not None:
36 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
37 | pretrained_model = torch.load(pretrained_model_path)
38 | actor.load_state_dict(pretrained_model)
39 |
40 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma)
41 | steps = 0
42 |
43 | for episode in range(args.iter):
44 | done = False
45 | score = 0
46 |
47 | state = env.reset()
48 | state = np.reshape(state, [1, state_size])
49 |
50 | while not done:
51 | if args.render:
52 | env.render()
53 |
54 | steps += 1
55 |
56 | policy = actor(torch.Tensor(state))
57 | action = get_action(policy, ou_noise)
58 |
59 | next_state, reward, done, _ = env.step(action)
60 |
61 | next_state = np.reshape(next_state, [1, state_size])
62 | state = next_state
63 | score += reward
64 |
65 | if episode % args.log_interval == 0:
66 | print('{} episode | score: {:.2f}'.format(episode, score))
--------------------------------------------------------------------------------
/mountaincar/ddpg/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | class OUNoise:
5 | def __init__(self, action_size, theta, mu, sigma):
6 | self.action_size = action_size
7 | self.theta = theta
8 | self.mu = mu
9 | self.sigma = sigma
10 | self.X = np.zeros(self.action_size)
11 |
12 | def sample(self):
13 | dx = self.theta * (self.mu - self.X)
14 | dx = dx + self.sigma * np.random.randn(len(self.X))
15 | self.X = self.X + dx
16 |
17 | return self.X
18 |
19 | def get_action(policy, ou_noise):
20 | action = policy.detach().numpy() + ou_noise.sample()
21 |
22 | return action
23 |
24 | def hard_target_update(actor, critic, target_actor, target_critic):
25 | target_critic.load_state_dict(critic.state_dict())
26 | target_actor.load_state_dict(actor.state_dict())
27 |
28 | def soft_target_update(actor, critic, target_actor, target_critic, tau):
29 | soft_update(critic, target_critic, tau)
30 | soft_update(actor, target_actor, tau)
31 |
32 | def soft_update(net, target_net, tau):
33 | for param, target_param in zip(net.parameters(), target_net.parameters()):
34 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
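
A short sketch of how these helpers fit together during DDPG training; tau and the noise parameters are illustrative, and the snippet assumes it is run from this directory so utils.py resolves:

import torch.nn as nn
from utils import OUNoise, soft_update

# Ornstein-Uhlenbeck noise: temporally correlated and mean-reverting toward mu
ou_noise = OUNoise(action_size=1, theta=0.15, mu=0.0, sigma=0.2)
for _ in range(3):
    print(ou_noise.sample())  # added to the actor's output for exploration

# Polyak averaging of target networks: target <- tau * online + (1 - tau) * target
net, target_net = nn.Linear(2, 1), nn.Linear(2, 1)
soft_update(net, target_net, tau=0.001)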
--------------------------------------------------------------------------------
/mountaincar/maxent/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/maxent/__pycache__/maxent.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/maxent.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/maxent/__pycache__/maxent_train.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/maxent_train.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/maxent/__pycache__/train.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/__pycache__/train.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/maxent/expert_demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/expert_demo/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/maxent/expert_demo/expert_demo.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/expert_demo/expert_demo.npy
--------------------------------------------------------------------------------
/mountaincar/maxent/expert_demo/make_expert.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import readchar
3 | import numpy as np
4 |
5 | # Action macros
6 | Push_Left = 0
7 | No_Push = 1
8 | Push_Right = 2
9 |
10 | # Key mapping
11 | arrow_keys = {
12 | '\x1b[D': Push_Left,
13 | '\x1b[B': No_Push,
14 | '\x1b[C': Push_Right}
15 |
16 | env = gym.make('MountainCar-v0')
17 |
18 | trajectories = []
19 | episode_step = 0
20 |
21 | for episode in range(20): # n_trajectories : 20
22 | trajectory = []
23 | step = 0
24 |
25 | env.reset()
26 | print("episode_step", episode_step)
27 |
28 | while True:
29 | env.render()
30 | print("step", step)
31 |
32 | key = readchar.readkey()
33 | if key not in arrow_keys.keys():
34 | break
35 |
36 | action = arrow_keys[key]
37 | state, reward, done, _ = env.step(action)
38 |
39 | if state[0] >= env.env.goal_position and step > 129: # trajectory_length : 130
40 | break
41 |
42 | trajectory.append((state[0], state[1], action))
43 | step += 1
44 |
45 | # trajectory_numpy = np.array(trajectory, float)
46 | # print("trajectory_numpy.shape", trajectory_numpy.shape)
47 | # episode_step += 1
48 | # trajectories.append(trajectory)
49 |
50 | # np_trajectories = np.array(trajectories, float)
51 | # print("np_trajectories.shape", np_trajectories.shape)
52 |
53 | # np.save("expert_trajectories", arr=np_trajectories)
--------------------------------------------------------------------------------
/mountaincar/maxent/learning_curves/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/learning_curves/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/maxent/learning_curves/maxent_eps_30000.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/learning_curves/maxent_eps_30000.png
--------------------------------------------------------------------------------
/mountaincar/maxent/maxent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def get_reward(feature_matrix, theta, n_states, state_idx):
4 | irl_rewards = feature_matrix.dot(theta).reshape((n_states,))
5 | return irl_rewards[state_idx]
6 |
7 |
8 | def expert_feature_expectations(feature_matrix, demonstrations):
9 | feature_expectations = np.zeros(feature_matrix.shape[0])
10 |
11 | for demonstration in demonstrations:
12 | for state_idx, _, _ in demonstration:
13 | feature_expectations += feature_matrix[int(state_idx)]
14 |
15 | feature_expectations /= demonstrations.shape[0]
16 | return feature_expectations
17 |
18 | def maxent_irl(expert, learner, theta, learning_rate):
19 | gradient = expert - learner
20 | theta += learning_rate * gradient
21 |
22 | # Clip theta
23 | for j in range(len(theta)):
24 | if theta[j] > 0:
25 | theta[j] = 0
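
A toy run of the update above with made-up feature expectations: theta moves toward the expert's visitation frequencies and is then clipped so every learned reward stays non-positive. Run from this directory so maxent.py is importable; all numbers are illustrative.

import numpy as np
from maxent import maxent_irl, get_reward

n_states = 4
feature_matrix = np.eye(n_states)             # one-hot state features, as in train.py
theta = -np.random.uniform(size=n_states)     # non-positive initial rewards

expert = np.array([0.7, 0.2, 0.1, 0.0])       # toy expert feature expectations
learner = np.array([0.4, 0.3, 0.2, 0.1])      # toy learner feature expectations

maxent_irl(expert, learner, theta, learning_rate=0.05)  # theta += lr * (expert - learner), then clip to <= 0
print(get_reward(feature_matrix, theta, n_states, state_idx=0))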
--------------------------------------------------------------------------------
/mountaincar/maxent/results/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/maxent/results/maxent_q_table.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/maxent_q_table.npy
--------------------------------------------------------------------------------
/mountaincar/maxent/results/test_rendering_30000.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/maxent/results/test_rendering_30000.gif
--------------------------------------------------------------------------------
/mountaincar/maxent/test.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import pylab
3 | import numpy as np
4 |
5 | q_table = np.load(file="results/maxent_q_table.npy")  # (400, 3), saved by train.py
6 | one_feature = 20 # number of state per one feature
7 |
8 | def idx_to_state(env, state):
9 | """Convert the mountain car's (position, velocity) observation into a discrete state index."""
10 | env_low = env.observation_space.low
11 | env_high = env.observation_space.high
12 | env_distance = (env_high - env_low) / one_feature
13 | position_idx = int((state[0] - env_low[0]) / env_distance[0])
14 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
15 | state_idx = position_idx + velocity_idx * one_feature
16 | return state_idx
17 |
18 | def main():
19 | env = gym.make('MountainCar-v0')
20 |
21 | episodes, scores = [], []
22 |
23 | for episode in range(10):
24 | state = env.reset()
25 | score = 0
26 |
27 | while True:
28 | env.render()
29 | state_idx = idx_to_state(env, state)
30 | action = np.argmax(q_table[state_idx])
31 | next_state, reward, done, _ = env.step(action)
32 |
33 | score += reward
34 | state = next_state
35 |
36 | if done:
37 | scores.append(score)
38 | episodes.append(episode)
39 | pylab.plot(episodes, scores, 'b')
40 | pylab.savefig("./learning_curves/maxent_test.png")
41 | break
42 |
43 | if episode % 1 == 0:
44 | print('{} episode score is {:.2f}'.format(episode, score))
45 |
46 | if __name__ == '__main__':
47 | main()
48 |
--------------------------------------------------------------------------------
/mountaincar/maxent/train.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import pylab
3 | import numpy as np
4 |
5 | from maxent import *
6 |
7 | n_states = 400 # position - 20, velocity - 20
8 | n_actions = 3
9 | one_feature = 20 # number of state per one feature
10 | q_table = np.zeros((n_states, n_actions)) # (400, 3)
11 | feature_matrix = np.eye((n_states)) # (400, 400)
12 |
13 | gamma = 0.99
14 | q_learning_rate = 0.03
15 | theta_learning_rate = 0.05
16 |
17 | np.random.seed(1)
18 |
19 | def idx_demo(env, one_feature):
20 | env_low = env.observation_space.low
21 | env_high = env.observation_space.high
22 | env_distance = (env_high - env_low) / one_feature
23 |
24 | raw_demo = np.load(file="expert_demo/expert_demo.npy")
25 | demonstrations = np.zeros((len(raw_demo), len(raw_demo[0]), 3))
26 |
27 | for x in range(len(raw_demo)):
28 | for y in range(len(raw_demo[0])):
29 | position_idx = int((raw_demo[x][y][0] - env_low[0]) / env_distance[0])
30 | velocity_idx = int((raw_demo[x][y][1] - env_low[1]) / env_distance[1])
31 | state_idx = position_idx + velocity_idx * one_feature
32 |
33 | demonstrations[x][y][0] = state_idx
34 | demonstrations[x][y][1] = raw_demo[x][y][2]
35 |
36 | return demonstrations
37 |
38 | def idx_state(env, state):
39 | env_low = env.observation_space.low
40 | env_high = env.observation_space.high
41 | env_distance = (env_high - env_low) / one_feature
42 | position_idx = int((state[0] - env_low[0]) / env_distance[0])
43 | velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
44 | state_idx = position_idx + velocity_idx * one_feature
45 | return state_idx
46 |
47 | def update_q_table(state, action, reward, next_state):
48 | q_1 = q_table[state][action]
49 | q_2 = reward + gamma * max(q_table[next_state])
50 | q_table[state][action] += q_learning_rate * (q_2 - q_1)
51 |
52 |
53 | def main():
54 | env = gym.make('MountainCar-v0')
55 | demonstrations = idx_demo(env, one_feature)
56 |
57 | learner_feature_expectations = np.zeros(n_states)
58 |
59 | theta = -(np.random.uniform(size=(n_states,)))
60 |
61 | episodes, scores = [], []
62 |
63 | for episode in range(30000):
64 | state = env.reset()
65 | score = 0
66 |
67 | if episode == 10000 or (episode > 10000 and episode % 5000 == 0):
68 | expert = expert_feature_expectations(feature_matrix, demonstrations)
69 | learner = learner_feature_expectations / episode
70 | maxent_irl(expert, learner, theta, theta_learning_rate)
71 |
72 | while True:
73 | state_idx = idx_state(env, state)
74 | action = np.argmax(q_table[state_idx])
75 | next_state, reward, done, _ = env.step(action)
76 |
77 | irl_reward = get_reward(feature_matrix, theta, n_states, state_idx)
78 | next_state_idx = idx_state(env, next_state)
79 | update_q_table(state_idx, action, irl_reward, next_state_idx)
80 |
81 | learner_feature_expectations += feature_matrix[int(state_idx)]
82 |
83 | score += reward
84 | state = next_state
85 |
86 | if done:
87 | scores.append(score)
88 | episodes.append(episode)
89 | break
90 |
91 | if episode % 1000 == 0:
92 | score_avg = np.mean(scores)
93 | print('{} episode score is {:.2f}'.format(episode, score_avg))
94 | pylab.plot(episodes, scores, 'b')
95 | pylab.savefig("./learning_curves/maxent_30000.png")
96 | np.save("./results/maxent_q_table", arr=q_table)
97 |
98 | if __name__ == '__main__':
99 | main()
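
The idx_state discretization maps a continuous (position, velocity) observation onto one of the 400 table rows. A standalone sketch using the MountainCar-v0 bounds (the exact bounds are taken from the Gym spec and should be treated as an assumption):

import numpy as np

env_low = np.array([-1.2, -0.07])   # position / velocity lower bounds
env_high = np.array([0.6, 0.07])    # position / velocity upper bounds
one_feature = 20                    # 20 bins per dimension -> 400 discrete states

def idx_state(state):
    env_distance = (env_high - env_low) / one_feature
    position_idx = int((state[0] - env_low[0]) / env_distance[0])
    velocity_idx = int((state[1] - env_low[1]) / env_distance[1])
    return position_idx + velocity_idx * one_feature

print(idx_state([-0.5, 0.033]))  # position bin 7, velocity bin 14 -> state index 287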
--------------------------------------------------------------------------------
/mountaincar/sac/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/.DS_Store
--------------------------------------------------------------------------------
/mountaincar/sac/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/sac/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mountaincar/sac/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mountaincar/sac/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args, log_std_min=-20, log_std_max=2):
6 | super(Actor, self).__init__()
7 | self.log_std_min = log_std_min
8 | self.log_std_max = log_std_max
9 |
10 | self.fc1 = nn.Linear(state_size, args.hidden_size)
11 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
12 |
13 | self.fc3 = nn.Linear(args.hidden_size, action_size)
14 | self.fc4 = nn.Linear(args.hidden_size, action_size)
15 |
16 | def forward(self, x):
17 | x = torch.relu(self.fc1(x))
18 | x = torch.relu(self.fc2(x))
19 |
20 | mu = self.fc3(x)
21 | log_std = self.fc4(x)
22 |
23 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max)
24 | std = torch.exp(log_std)
25 |
26 | return mu, std
27 |
28 | class Critic(nn.Module):
29 | def __init__(self, state_size, action_size, args):
30 | super(Critic, self).__init__()
31 |
32 | # Q1 architecture
33 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size)
34 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
35 | self.fc3 = nn.Linear(args.hidden_size, 1)
36 |
37 | # Q2 architecture
38 | self.fc4 = nn.Linear(state_size + action_size, args.hidden_size)
39 | self.fc5 = nn.Linear(args.hidden_size, args.hidden_size)
40 | self.fc6 = nn.Linear(args.hidden_size, 1)
41 |
42 | def forward(self, states, actions):
43 | x = torch.cat([states, actions], dim=1)
44 |
45 | x1 = torch.relu(self.fc1(x))
46 | x1 = torch.relu(self.fc2(x1))
47 | q_value1 = self.fc3(x1)
48 |
49 | x2 = torch.relu(self.fc4(x))
50 | x2 = torch.relu(self.fc5(x2))
51 | q_value2 = self.fc6(x2)
52 |
53 | return q_value1, q_value2
--------------------------------------------------------------------------------
/mountaincar/sac/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from utils import *
9 | from model import Actor, Critic
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--env_name', type=str, default="MountainCarContinuous-v0")
13 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
14 | parser.add_argument('--render', action="store_true", default=True)
15 | parser.add_argument('--hidden_size', type=int, default=64)
16 | parser.add_argument('--iter', type=int, default=10000)
17 | parser.add_argument('--log_interval', type=int, default=10)
18 | args = parser.parse_args()
19 |
20 | if __name__=="__main__":
21 | env = gym.make(args.env_name)
22 | env.seed(500)
23 | torch.manual_seed(500)
24 |
25 | state_size = env.observation_space.shape[0]
26 | action_size = env.action_space.shape[0]
27 | print('state size:', state_size)
28 | print('action size:', action_size)
29 |
30 | actor = Actor(state_size, action_size, args)
31 |
32 | if args.load_model is not None:
33 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
34 | pretrained_model = torch.load(pretrained_model_path)
35 | actor.load_state_dict(pretrained_model)
36 |
37 | steps = 0
38 |
39 | for episode in range(args.iter):
40 | done = False
41 | score = 0
42 |
43 | state = env.reset()
44 | state = np.reshape(state, [1, state_size])
45 |
46 | while not done:
47 | if args.render:
48 | env.render()
49 |
50 | steps += 1
51 |
52 | mu, std = actor(torch.Tensor(state))
53 | action = get_action(mu, std)
54 |
55 | next_state, reward, done, _ = env.step(action)
56 |
57 | next_state = np.reshape(next_state, [1, state_size])
58 | state = next_state
59 | score += reward
60 |
61 | if episode % args.log_interval == 0:
62 | print('{} episode | score: {:.2f}'.format(episode, score))
--------------------------------------------------------------------------------
/mountaincar/sac/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions import Normal
3 |
4 | def get_action(mu, std):
5 | normal = Normal(mu, std)
6 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1))
7 | action = torch.tanh(z)
8 |
9 | return action.data.numpy()
10 |
11 | def eval_action(mu, std, epsilon=1e-6):
12 | normal = Normal(mu, std)
13 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1))
14 | action = torch.tanh(z)
15 | log_prob = normal.log_prob(z)
16 |
17 | # Enforcing Action Bounds
18 | log_prob -= torch.log(1 - action.pow(2) + epsilon)
19 | log_policy = log_prob.sum(1, keepdim=True)
20 |
21 | return action, log_policy
22 |
23 | def hard_target_update(net, target_net):
24 | target_net.load_state_dict(net.state_dict())
25 |
26 | def soft_target_update(net, target_net, tau):
27 | for param, target_param in zip(net.parameters(), target_net.parameters()):
28 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
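
A minimal sketch of the squashed-Gaussian helpers above with toy policy outputs; run from this directory so utils.py resolves:

import torch
from utils import get_action, eval_action

mu = torch.zeros(1, 1)        # toy mean for a single 1-D action
std = 0.5 * torch.ones(1, 1)  # toy standard deviation

action = get_action(mu, std)             # tanh-squashed sample, always in (-1, 1)
action_t, log_pi = eval_action(mu, std)  # sample plus its log-probability with the tanh correction
print(action, log_pi.shape)              # log_pi has shape (1, 1)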
--------------------------------------------------------------------------------
/mujoco/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/.DS_Store
--------------------------------------------------------------------------------
/mujoco/gail/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/.DS_Store
--------------------------------------------------------------------------------
/mujoco/gail/__pycache__/hparams.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/hparams.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/__pycache__/train_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/__pycache__/train_model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/expert_demo/expert_demo.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/expert_demo/expert_demo.p
--------------------------------------------------------------------------------
/mujoco/gail/logs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/logs/.DS_Store
--------------------------------------------------------------------------------
/mujoco/gail/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, num_inputs, num_outputs, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs)
10 |
11 | self.fc3.weight.data.mul_(0.1)
12 | self.fc3.bias.data.mul_(0.0)
13 |
14 | def forward(self, x):
15 | x = torch.tanh(self.fc1(x))
16 | x = torch.tanh(self.fc2(x))
17 | mu = self.fc3(x)
18 | logstd = torch.zeros_like(mu)
19 | std = torch.exp(logstd)
20 | return mu, std
21 |
22 |
23 | class Critic(nn.Module):
24 | def __init__(self, num_inputs, args):
25 | super(Critic, self).__init__()
26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
28 | self.fc3 = nn.Linear(args.hidden_size, 1)
29 |
30 | self.fc3.weight.data.mul_(0.1)
31 | self.fc3.bias.data.mul_(0.0)
32 |
33 | def forward(self, x):
34 | x = torch.tanh(self.fc1(x))
35 | x = torch.tanh(self.fc2(x))
36 | v = self.fc3(x)
37 | return v
38 |
39 |
40 | class Discriminator(nn.Module):
41 | def __init__(self, num_inputs, args):
42 | super(Discriminator, self).__init__()
43 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
44 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
45 | self.fc3 = nn.Linear(args.hidden_size, 1)
46 |
47 | self.fc3.weight.data.mul_(0.1)
48 | self.fc3.bias.data.mul_(0.0)
49 |
50 | def forward(self, x):
51 | x = torch.tanh(self.fc1(x))
52 | x = torch.tanh(self.fc2(x))
53 | prob = torch.sigmoid(self.fc3(x))
54 | return prob
--------------------------------------------------------------------------------
/mujoco/gail/save_model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/save_model/.DS_Store
--------------------------------------------------------------------------------
/mujoco/gail/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import torch
4 | import argparse
5 |
6 | from model import Actor, Critic
7 | from utils.utils import get_action
8 | from utils.zfilter import ZFilter
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', type=str, default="Hopper-v2",
12 | help='name of the MuJoCo environment')
13 | parser.add_argument('--iter', type=int, default=5,
14 | help='number of episodes to play')
15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar',
16 | help="if you test pretrained file, write filename in save_model folder")
17 | parser.add_argument('--hidden_size', type=int, default=64)  # assumed default; must match the trained model
18 | args = parser.parse_args()
19 |
20 |
21 | if __name__ == "__main__":
22 | env = gym.make(args.env)
23 | env.seed(500)
24 | torch.manual_seed(500)
25 |
26 | num_inputs = env.observation_space.shape[0]
27 | num_actions = env.action_space.shape[0]
28 |
29 | print("state size: ", num_inputs)
30 | print("action size: ", num_actions)
31 |
32 | actor = Actor(num_inputs, num_actions, args)
33 | critic = Critic(num_inputs, args)
34 |
35 | running_state = ZFilter((num_inputs,), clip=5)
36 |
37 | if args.load_model is not None:
38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
39 |
40 | pretrained_model = torch.load(pretrained_model_path)
41 |
42 | actor.load_state_dict(pretrained_model['actor'])
43 | critic.load_state_dict(pretrained_model['critic'])
44 |
45 | running_state.rs.n = pretrained_model['z_filter_n']
46 | running_state.rs.mean = pretrained_model['z_filter_m']
47 | running_state.rs.sum_square = pretrained_model['z_filter_s']
48 |
49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n))
50 |
51 | else:
52 | raise AssertionError("Should write pretrained filename in save_model folder. ex) python3 test.py --load_model ppo_max.tar")
53 |
54 |
55 | actor.eval(), critic.eval()
56 | for episode in range(args.iter):
57 | state = env.reset()
58 | steps = 0
59 | score = 0
60 | for _ in range(10000):
61 | env.render()
62 | mu, std = actor(torch.Tensor(state).unsqueeze(0))
63 | action = get_action(mu, std)[0]
64 |
65 | next_state, reward, done, _ = env.step(action)
66 | next_state = running_state(next_state)
67 |
68 | state = next_state
69 | score += reward
70 |
71 | if done:
72 | print("{} cumulative reward: {}".format(episode, score))
73 | break
74 |
--------------------------------------------------------------------------------
/mujoco/gail/train_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from utils.utils import get_entropy, log_prob_density
4 |
5 | def train_discrim(discrim, memory, discrim_optim, demonstrations, args):
6 | memory = np.array(memory)
7 | states = np.vstack(memory[:, 0])
8 | actions = list(memory[:, 1])
9 |
10 | states = torch.Tensor(states)
11 | actions = torch.Tensor(actions)
12 |
13 | criterion = torch.nn.BCELoss()
14 |
15 | for _ in range(args.discrim_update_num):
16 | learner = discrim(torch.cat([states, actions], dim=1))
17 | expert = discrim(torch.Tensor(demonstrations))
18 |
19 | discrim_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \
20 | criterion(expert, torch.zeros((demonstrations.shape[0], 1)))
21 |
22 | discrim_optim.zero_grad()
23 | discrim_loss.backward()
24 | discrim_optim.step()
25 |
26 |
27 | def train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args):
28 | memory = np.array(memory)
29 | states = np.vstack(memory[:, 0])
30 | actions = list(memory[:, 1])
31 | rewards = list(memory[:, 2])
32 | masks = list(memory[:, 3])
33 |
34 | old_values = critic(torch.Tensor(states))
35 | returns, advants = get_gae(rewards, masks, old_values, args)
36 |
37 | mu, std = actor(torch.Tensor(states))
38 | old_policy = log_prob_density(torch.Tensor(actions), mu, std)
39 |
40 | criterion = torch.nn.MSELoss()
41 | n = len(states)
42 | arr = np.arange(n)
43 |
44 | for _ in range(args.actor_critic_update_num):
45 | np.random.shuffle(arr)
46 |
47 | for i in range(n // args.batch_size):
48 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)]
49 | batch_index = torch.LongTensor(batch_index)
50 |
51 | inputs = torch.Tensor(states)[batch_index]
52 | actions_samples = torch.Tensor(actions)[batch_index]
53 | returns_samples = returns.unsqueeze(1)[batch_index]
54 | advants_samples = advants.unsqueeze(1)[batch_index]
55 | oldvalue_samples = old_values[batch_index].detach()
56 |
57 | values = critic(inputs)
58 | clipped_values = oldvalue_samples + \
59 | torch.clamp(values - oldvalue_samples,
60 | -args.clip_param,
61 | args.clip_param)
62 | critic_loss1 = criterion(clipped_values, returns_samples)
63 | critic_loss2 = criterion(values, returns_samples)
64 | critic_loss = torch.max(critic_loss1, critic_loss2).mean()
65 |
66 | loss, ratio, entropy = surrogate_loss(actor, advants_samples, inputs,
67 | old_policy.detach(), actions_samples,
68 | batch_index)
69 | clipped_ratio = torch.clamp(ratio,
70 | 1.0 - args.clip_param,
71 | 1.0 + args.clip_param)
72 | clipped_loss = clipped_ratio * advants_samples
73 | actor_loss = -torch.min(loss, clipped_loss).mean()
74 |
75 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
76 |
77 | critic_optim.zero_grad()
78 | loss.backward(retain_graph=True)
79 | critic_optim.step()
80 |
81 | actor_optim.zero_grad()
82 | loss.backward()
83 | actor_optim.step()
84 |
85 | def get_gae(rewards, masks, values, args):
86 | rewards = torch.Tensor(rewards)
87 | masks = torch.Tensor(masks)
88 | returns = torch.zeros_like(rewards)
89 | advants = torch.zeros_like(rewards)
90 |
91 | running_returns = 0
92 | previous_value = 0
93 | running_advants = 0
94 |
95 | for t in reversed(range(0, len(rewards))):
96 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t])
97 | returns[t] = running_returns
98 |
99 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \
100 | values.data[t]
101 | previous_value = values.data[t]
102 |
103 | running_advants = running_delta + (args.gamma * args.lamda * \
104 | running_advants * masks[t])
105 | advants[t] = running_advants
106 |
107 | advants = (advants - advants.mean()) / advants.std()
108 | return returns, advants
109 |
110 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index):
111 | mu, std = actor(states)
112 | new_policy = log_prob_density(actions, mu, std)
113 | old_policy = old_policy[batch_index]
114 |
115 | ratio = torch.exp(new_policy - old_policy)
116 | surrogate_loss = ratio * advants
117 | entropy = get_entropy(mu, std)
118 |
119 | return surrogate_loss, ratio, entropy
--------------------------------------------------------------------------------
/mujoco/gail/utils/__pycache__/running_state.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/running_state.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/utils/__pycache__/zfilter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/gail/utils/__pycache__/zfilter.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/gail/utils/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | action = torch.normal(mu, std)
7 | action = action.data.numpy()
8 | return action
9 |
10 | def get_entropy(mu, std):
11 | dist = Normal(mu, std)
12 | entropy = dist.entropy().mean()
13 | return entropy
14 |
15 | def log_prob_density(x, mu, std):
16 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
17 | - 0.5 * math.log(2 * math.pi)
18 | return log_prob_density.sum(1, keepdim=True)
19 |
20 | def get_reward(discrim, state, action):
21 | state = torch.Tensor(state)
22 | action = torch.Tensor(action)
23 | state_action = torch.cat([state, action])
24 | with torch.no_grad():
25 | return -math.log(discrim(state_action)[0].item())
26 |
27 | def save_checkpoint(state, filename):
28 | torch.save(state, filename)
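
A toy use of get_reward with a freshly initialized Discriminator (hidden_size=64 is an assumption here). Under this repo's labelling (expert -> 0, learner -> 1), the -log D(s, a) reward grows as a state-action pair looks more expert-like to the discriminator. Run from the gail directory so both imports resolve.

import torch
from argparse import Namespace
from model import Discriminator
from utils.utils import get_reward, log_prob_density

discrim = Discriminator(num_inputs=3, args=Namespace(hidden_size=64))  # state dim + action dim = 3 here
print(get_reward(discrim, state=[0.1, -0.2], action=[0.5]))            # -log D(s, a)

# Gaussian log-density used for the policy ratio, summed over action dimensions
mu, std = torch.zeros(4, 1), torch.ones(4, 1)
print(log_prob_density(torch.zeros(4, 1), mu, std).shape)              # torch.Size([4, 1])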
--------------------------------------------------------------------------------
/mujoco/gail/utils/zfilter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # from https://github.com/joschu/modular_rl
4 | # http://www.johndcook.com/blog/standard_deviation/
5 |
6 | class RunningStat(object):
7 | def __init__(self, shape):
8 | self._n = 0
9 | self._M = np.zeros(shape)
10 | self._S = np.zeros(shape)
11 |
12 | def push(self, x):
13 | x = np.asarray(x)
14 | assert x.shape == self._M.shape
15 | self._n += 1
16 | if self._n == 1:
17 | self._M[...] = x
18 | else:
19 | oldM = self._M.copy()
20 | self._M[...] = oldM + (x - oldM) / self._n
21 | self._S[...] = self._S + (x - oldM) * (x - self._M)
22 |
23 | @property
24 | def n(self):
25 | return self._n
26 |
27 | @n.setter
28 | def n(self, n):
29 | self._n = n
30 |
31 | @property
32 | def mean(self):
33 | return self._M
34 |
35 | @mean.setter
36 | def mean(self, M):
37 | self._M = M
38 |
39 | @property
40 | def sum_square(self):
41 | return self._S
42 |
43 | @sum_square.setter
44 | def sum_square(self, S):
45 | self._S = S
46 |
47 | @property
48 | def var(self):
49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
50 |
51 | @property
52 | def std(self):
53 | return np.sqrt(self.var)
54 |
55 | @property
56 | def shape(self):
57 | return self._M.shape
58 |
59 |
60 | class ZFilter:
61 | """
62 | y = (x-mean)/std
63 | using running estimates of mean,std
64 | """
65 |
66 | def __init__(self, shape, demean=True, destd=True, clip=10.0):
67 | self.demean = demean
68 | self.destd = destd
69 | self.clip = clip
70 |
71 | self.rs = RunningStat(shape)
72 |
73 | def __call__(self, x, update=True):
74 | if update: self.rs.push(x)
75 |
76 | if self.demean:
77 | x = x - self.rs.mean
78 |
79 | if self.destd:
80 | x = x / (self.rs.std + 1e-8)
81 |
82 | if self.clip:
83 | x = np.clip(x, -self.clip, self.clip)
84 |
85 | return x
86 |
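A short usage sketch of ZFilter as an online observation normalizer; run from the gail directory so the utils package resolves:

import numpy as np
from utils.zfilter import ZFilter

running_state = ZFilter((3,), clip=5)
for _ in range(1000):
    obs = 5.0 + 10.0 * np.random.randn(3)  # raw observations with mean ~5 and std ~10
    norm_obs = running_state(obs)          # pushes obs into the running stats, then standardizes and clips

print(running_state.rs.mean, running_state.rs.std)  # running estimates approach (5, 10)
print(norm_obs)                                     # roughly zero-mean, unit-variance, clipped to [-5, 5]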
--------------------------------------------------------------------------------
/mujoco/ppo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/.DS_Store
--------------------------------------------------------------------------------
/mujoco/ppo/__pycache__/hparams.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/hparams.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/__pycache__/ppo.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/ppo.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/__pycache__/train_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/__pycache__/train_model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/logs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/logs/.DS_Store
--------------------------------------------------------------------------------
/mujoco/ppo/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, num_inputs, num_outputs, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs)
10 |
11 | self.fc3.weight.data.mul_(0.1)
12 | self.fc3.bias.data.mul_(0.0)
13 |
14 | def forward(self, x):
15 | x = torch.tanh(self.fc1(x))
16 | x = torch.tanh(self.fc2(x))
17 | mu = self.fc3(x)
18 | logstd = torch.zeros_like(mu)
19 | std = torch.exp(logstd)
20 | return mu, std
21 |
22 |
23 | class Critic(nn.Module):
24 | def __init__(self, num_inputs, args):
25 | super(Critic, self).__init__()
26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
28 | self.fc3 = nn.Linear(args.hidden_size, 1)
29 |
30 | self.fc3.weight.data.mul_(0.1)
31 | self.fc3.bias.data.mul_(0.0)
32 |
33 | def forward(self, x):
34 | x = torch.tanh(self.fc1(x))
35 | x = torch.tanh(self.fc2(x))
36 | v = self.fc3(x)
37 | return v
--------------------------------------------------------------------------------
/mujoco/ppo/ppo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from utils.utils import log_prob_density
4 |
5 | def train_model(actor, critic, memory, actor_optim, critic_optim, args):
6 | memory = np.array(memory)
7 | states = np.vstack(memory[:, 0])
8 | actions = list(memory[:, 1])
9 | rewards = list(memory[:, 2])
10 | masks = list(memory[:, 3])
11 |
12 | old_values = critic(torch.Tensor(states))
13 | returns, advants = get_gae(rewards, masks, old_values, args)
14 |
15 | mu, std = actor(torch.Tensor(states))
16 | old_policy = log_prob_density(torch.Tensor(actions), mu, std)
17 |
18 | criterion = torch.nn.MSELoss()
19 | n = len(states)
20 | arr = np.arange(n)
21 |
22 | for _ in range(args.model_update_num):
23 | np.random.shuffle(arr)
24 |
25 | for i in range(n // args.batch_size):
26 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)]
27 | batch_index = torch.LongTensor(batch_index)
28 |
29 | inputs = torch.Tensor(states)[batch_index]
30 | actions_samples = torch.Tensor(actions)[batch_index]
31 | returns_samples = returns.unsqueeze(1)[batch_index]
32 | advants_samples = advants.unsqueeze(1)[batch_index]
33 | oldvalue_samples = old_values[batch_index].detach()
34 |
35 | values = critic(inputs)
36 | clipped_values = oldvalue_samples + \
37 | torch.clamp(values - oldvalue_samples,
38 | -args.clip_param,
39 | args.clip_param)
40 | critic_loss1 = criterion(clipped_values, returns_samples)
41 | critic_loss2 = criterion(values, returns_samples)
42 | critic_loss = torch.max(critic_loss1, critic_loss2).mean()
43 |
44 | loss, ratio = surrogate_loss(actor, advants_samples, inputs,
45 | old_policy.detach(), actions_samples,
46 | batch_index)
47 | clipped_ratio = torch.clamp(ratio,
48 | 1.0 - args.clip_param,
49 | 1.0 + args.clip_param)
50 | clipped_loss = clipped_ratio * advants_samples
51 | actor_loss = -torch.min(loss, clipped_loss).mean()
52 |
53 | loss = actor_loss + 0.5 * critic_loss
54 |
55 | critic_optim.zero_grad()
56 | loss.backward(retain_graph=True)
57 | critic_optim.step()
58 |
59 | actor_optim.zero_grad()
60 | loss.backward()
61 | actor_optim.step()
62 |
63 | def get_gae(rewards, masks, values, args):
64 | rewards = torch.Tensor(rewards)
65 | masks = torch.Tensor(masks)
66 | returns = torch.zeros_like(rewards)
67 | advants = torch.zeros_like(rewards)
68 |
69 | running_returns = 0
70 | previous_value = 0
71 | running_advants = 0
72 |
73 | for t in reversed(range(0, len(rewards))):
74 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t])
75 | returns[t] = running_returns
76 |
77 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \
78 | values.data[t]
79 | previous_value = values.data[t]
80 |
81 | running_advants = running_delta + (args.gamma * args.lamda * \
82 | running_advants * masks[t])
83 | advants[t] = running_advants
84 |
85 | advants = (advants - advants.mean()) / advants.std()
86 | return returns, advants
87 |
88 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index):
89 | mu, std = actor(states)
90 | new_policy = log_prob_density(actions, mu, std)
91 | old_policy = old_policy[batch_index]
92 |
93 | ratio = torch.exp(new_policy - old_policy)
94 | surrogate_loss = ratio * advants
95 |
96 | return surrogate_loss, ratio
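
A toy call to get_gae on a four-step trajectory; the gamma/lamda values are illustrative (lamda matches the code's spelling) and the stand-in critic values replace an actual network. Run from the ppo directory so ppo.py is importable.

import torch
from argparse import Namespace
from ppo import get_gae

args = Namespace(gamma=0.99, lamda=0.98)
rewards = [1.0, 1.0, 1.0, 0.0]
masks = [1, 1, 1, 0]                                 # 0 marks the terminal step
values = torch.tensor([[0.5], [0.4], [0.3], [0.2]])  # stand-in critic outputs, shape (4, 1)

returns, advants = get_gae(rewards, masks, values, args)
print(returns)  # discounted returns, accumulated backwards through the trajectory
print(advants)  # GAE advantages, normalized to zero mean and unit variance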
--------------------------------------------------------------------------------
/mujoco/ppo/save_model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/save_model/.DS_Store
--------------------------------------------------------------------------------
/mujoco/ppo/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import torch
4 | import argparse
5 |
6 | from model import Actor, Critic
7 | from utils.utils import get_action
8 | from utils.zfilter import ZFilter
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', type=str, default="Hopper-v2",
12 | help='name of the MuJoCo environment')
13 | parser.add_argument('--iter', type=int, default=5,
14 | help='number of episodes to play')
15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar',
16 | help="if you test pretrained file, write filename in save_model folder")
17 | parser.add_argument('--hidden_size', type=int, default=64)  # assumed default; must match the trained model
18 | args = parser.parse_args()
19 |
20 |
21 | if __name__ == "__main__":
22 | env = gym.make(args.env)
23 | env.seed(500)
24 | torch.manual_seed(500)
25 |
26 | num_inputs = env.observation_space.shape[0]
27 | num_actions = env.action_space.shape[0]
28 |
29 | print("state size: ", num_inputs)
30 | print("action size: ", num_actions)
31 |
32 | actor = Actor(num_inputs, num_actions, args)
33 | critic = Critic(num_inputs, args)
34 |
35 | running_state = ZFilter((num_inputs,), clip=5)
36 |
37 | if args.load_model is not None:
38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
39 |
40 | pretrained_model = torch.load(pretrained_model_path)
41 |
42 | actor.load_state_dict(pretrained_model['actor'])
43 | critic.load_state_dict(pretrained_model['critic'])
44 |
45 | running_state.rs.n = pretrained_model['z_filter_n']
46 | running_state.rs.mean = pretrained_model['z_filter_m']
47 | running_state.rs.sum_square = pretrained_model['z_filter_s']
48 |
49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n))
50 |
51 | else:
52 | raise AssertionError("Should write pretrained filename in save_model folder. ex) python3 test.py --load_model ppo_max.tar")
53 |
54 |
55 | actor.eval(), critic.eval()
56 | for episode in range(args.iter):
57 | state = env.reset()
58 | steps = 0
59 | score = 0
60 | for _ in range(10000):
61 | env.render()
62 | mu, std = actor(torch.Tensor(state).unsqueeze(0))
63 | action = get_action(mu, std)[0]
64 |
65 | next_state, reward, done, _ = env.step(action)
66 | next_state = running_state(next_state)
67 |
68 | state = next_state
69 | score += reward
70 |
71 | if done:
72 | print("{} cumulative reward: {}".format(episode, score))
73 | break
74 |
--------------------------------------------------------------------------------
/mujoco/ppo/utils/__pycache__/running_state.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/running_state.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/utils/__pycache__/zfilter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/ppo/utils/__pycache__/zfilter.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/ppo/utils/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 | def get_action(mu, std):
5 | action = torch.normal(mu, std)
6 | action = action.data.numpy()
7 | return action
8 |
9 | def log_prob_density(x, mu, std):
10 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
11 | - 0.5 * math.log(2 * math.pi)
12 | return log_prob_density.sum(1, keepdim=True)
13 |
14 | def save_checkpoint(state, filename):
15 | torch.save(state, filename)
--------------------------------------------------------------------------------
/mujoco/ppo/utils/zfilter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # from https://github.com/joschu/modular_rl
4 | # http://www.johndcook.com/blog/standard_deviation/
5 |
6 | class RunningStat(object):
7 | def __init__(self, shape):
8 | self._n = 0
9 | self._M = np.zeros(shape)
10 | self._S = np.zeros(shape)
11 |
12 | def push(self, x):
13 | x = np.asarray(x)
14 | assert x.shape == self._M.shape
15 | self._n += 1
16 | if self._n == 1:
17 | self._M[...] = x
18 | else:
19 | oldM = self._M.copy()
20 | self._M[...] = oldM + (x - oldM) / self._n
21 | self._S[...] = self._S + (x - oldM) * (x - self._M)
22 |
23 | @property
24 | def n(self):
25 | return self._n
26 |
27 | @n.setter
28 | def n(self, n):
29 | self._n = n
30 |
31 | @property
32 | def mean(self):
33 | return self._M
34 |
35 | @mean.setter
36 | def mean(self, M):
37 | self._M = M
38 |
39 | @property
40 | def sum_square(self):
41 | return self._S
42 |
43 | @sum_square.setter
44 | def sum_square(self, S):
45 | self._S = S
46 |
47 | @property
48 | def var(self):
49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
50 |
51 | @property
52 | def std(self):
53 | return np.sqrt(self.var)
54 |
55 | @property
56 | def shape(self):
57 | return self._M.shape
58 |
59 |
60 | class ZFilter:
61 | """
62 | y = (x-mean)/std
63 | using running estimates of mean,std
64 | """
65 |
66 | def __init__(self, shape, demean=True, destd=True, clip=10.0):
67 | self.demean = demean
68 | self.destd = destd
69 | self.clip = clip
70 |
71 | self.rs = RunningStat(shape)
72 |
73 | def __call__(self, x, update=True):
74 | if update: self.rs.push(x)
75 |
76 | if self.demean:
77 | x = x - self.rs.mean
78 |
79 | if self.destd:
80 | x = x / (self.rs.std + 1e-8)
81 |
82 | if self.clip:
83 | x = np.clip(x, -self.clip, self.clip)
84 |
85 | return x
86 |
--------------------------------------------------------------------------------
/mujoco/tnpg/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/.DS_Store
--------------------------------------------------------------------------------
/mujoco/tnpg/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/tnpg/__pycache__/tnpg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/tnpg.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/tnpg/__pycache__/trpo.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/__pycache__/trpo.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/tnpg/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 | self.fc3.weight.data.mul_(0.1)
11 | self.fc3.bias.data.mul_(0.0)
12 |
13 | def forward(self, x):
14 | x = torch.tanh(self.fc1(x))
15 | x = torch.tanh(self.fc2(x))
16 | mu = self.fc3(x)
17 | logstd = torch.zeros_like(mu)
18 | std = torch.exp(logstd)
19 | return mu, std
20 |
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/24model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/24model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/40model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/40model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/67model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/67model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/76model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/76model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/79model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/79model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/save_model/86model.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/save_model/86model.pth
--------------------------------------------------------------------------------
/mujoco/tnpg/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import torch
4 | import argparse
5 |
6 | from model import Actor, Critic
7 | from utils.utils import get_action
8 | from utils.running_state import ZFilter
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', type=str, default="Hopper-v2",
12 | help='name of the MuJoCo environment')
13 | parser.add_argument('--iter', type=int, default=5,
14 | help='number of episodes to play')
15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar',
16 | help="if you test pretrained file, write filename in save_model folder")
17 |
18 | args = parser.parse_args()
19 |
20 |
21 | if __name__ == "__main__":
22 | env = gym.make(args.env)
23 | env.seed(500)
24 | torch.manual_seed(500)
25 |
26 | num_inputs = env.observation_space.shape[0]
27 | num_actions = env.action_space.shape[0]
28 |
29 | print("state size: ", num_inputs)
30 | print("action size: ", num_actions)
31 |
32 | actor = Actor(num_inputs, num_actions)
33 | critic = Critic(num_inputs)
34 |
35 | running_state = ZFilter((num_inputs,), clip=5)
36 |
37 | if args.load_model is not None:
38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
39 |
40 | pretrained_model = torch.load(pretrained_model_path)
41 |
42 | actor.load_state_dict(pretrained_model['actor'])
43 | critic.load_state_dict(pretrained_model['critic'])
44 |
45 | running_state.rs.n = pretrained_model['z_filter_n']
46 | running_state.rs.mean = pretrained_model['z_filter_m']
47 | running_state.rs.sum_square = pretrained_model['z_filter_s']
48 |
49 | print("Loaded OK ex. ZFilter N {}".format(running_state.rs.n))
50 |
51 | else:
52 | raise AssertionError("Should write pretrained filename in save_model folder. ex) python3 test.py --load_model ppo_max.tar")
53 |
54 |
55 | actor.eval(), critic.eval()
56 | for episode in range(args.iter):
57 | state = env.reset()
58 | steps = 0
59 | score = 0
60 | for _ in range(10000):
61 | env.render()
62 |             mu, std = actor(torch.Tensor(state).unsqueeze(0))
63 | action = get_action(mu, std)[0]
64 |
65 | next_state, reward, done, _ = env.step(action)
66 | next_state = running_state(next_state)
67 |
68 | state = next_state
69 | score += reward
70 |
71 | if done:
72 | print("{} cumulative reward: {}".format(episode, score))
73 | break
74 |
--------------------------------------------------------------------------------
/mujoco/tnpg/tnpg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from utils.utils import *
3 |
4 | def get_returns(rewards, masks, gamma):
5 | rewards = torch.Tensor(rewards)
6 | masks = torch.Tensor(masks)
7 | returns = torch.zeros_like(rewards)
8 |
9 | running_returns = 0
10 |
11 | for t in reversed(range(0, len(rewards))):
12 | running_returns = rewards[t] + gamma * running_returns * masks[t]
13 | returns[t] = running_returns
14 |
15 | returns = (returns - returns.mean()) / returns.std()
16 | return returns
17 |
18 | def get_loss(actor, returns, states, actions):
19 | mu, std = actor(torch.Tensor(states))
20 | log_policy = log_prob_density(torch.Tensor(actions), mu, std)
21 | returns = returns.unsqueeze(1)
22 |
23 | loss = log_policy * returns
24 | loss = loss.mean()
25 | return loss
26 |
27 | # from openai baseline code
28 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py
29 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
30 | x = torch.zeros(b.size())
31 | r = b.clone()
32 | p = b.clone()
33 | rdotr = torch.dot(r, r)
34 |
35 | for i in range(nsteps): # nsteps=10
36 | f_Ax = hessian_vector_product(actor, states, p, cg_damping=1e-1)
37 | alpha = rdotr / torch.dot(p, f_Ax)
38 | x += alpha * p
39 | r -= alpha * f_Ax
40 | new_rdotr = torch.dot(r, r)
41 | beta = new_rdotr / rdotr
42 | p = r + beta * p
43 |
44 | rdotr = new_rdotr
45 | if rdotr < residual_tol: # residual_tol = 0.0000000001
46 | break
47 | return x
48 |
49 | def train_model(actor, memory, args):
50 | memory = np.array(memory)
51 | states = np.vstack(memory[:, 0])
52 | actions = list(memory[:, 1])
53 | rewards = list(memory[:, 2])
54 | masks = list(memory[:, 3])
55 |
56 | # ----------------------------
57 | # step 1: get returns
58 | returns = get_returns(rewards, masks, args.gamma)
59 |
60 | # ----------------------------
61 | # step 2: get gradient of loss and hessian of kl
62 | loss = get_loss(actor, returns, states, actions)
63 | loss_grad = torch.autograd.grad(loss, actor.parameters())
64 | loss_grad = flat_grad(loss_grad)
65 |
66 | step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10)
67 |
68 | # ----------------------------
69 | # step 3: get step direction and step size and update actor
70 | params = flat_params(actor)
71 | new_params = params + 0.5 * step_dir
72 | update_model(actor, new_params)
--------------------------------------------------------------------------------
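
Note: conjugate_gradient() above solves H x = g using only Hessian-vector products, so the KL Hessian H is never formed explicitly. A minimal standalone sketch of the same iteration (not from the repository; an explicit 2x2 matrix stands in for hessian_vector_product):

import torch

def cg(Avp, b, nsteps=10, residual_tol=1e-10):
    # identical update rule to conjugate_gradient() in tnpg.py,
    # with Avp as a generic matrix-vector product
    x = torch.zeros_like(b)
    r = b.clone()
    p = b.clone()
    rdotr = torch.dot(r, r)
    for _ in range(nsteps):
        Ap = Avp(p)
        alpha = rdotr / torch.dot(p, Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = torch.dot(r, r)
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
        if rdotr < residual_tol:
            break
    return x

A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])   # symmetric positive definite
b = torch.tensor([1.0, 2.0])
x = cg(lambda v: A @ v, b)
print(torch.allclose(A @ x, b, atol=1e-5))   # True: x approximates H^-1 g

In train_model() the resulting step_dir approximates H^-1 g and is applied with a fixed step size of 0.5.
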
/mujoco/tnpg/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 | from collections import deque
6 |
7 | import torch
8 | import torch.optim as optim
9 | from tensorboardX import SummaryWriter
10 |
11 | from model import Actor
12 | from tnpg import train_model
13 | from utils.utils import get_action
14 | from utils.running_state import ZFilter
15 |
16 | parser = argparse.ArgumentParser(description='PyTorch NPG')
17 | parser.add_argument('--env_name', type=str, default="Hopper-v2")
18 | parser.add_argument('--load_model', type=str, default=None)
19 | parser.add_argument('--save_path', default='./save_model/', help='')
20 | parser.add_argument('--render', action="store_true", default=False)
21 | parser.add_argument('--gamma', type=float, default=0.99)
22 | parser.add_argument('--hidden_size', type=int, default=64)
23 | parser.add_argument('--learning_rate', type=float, default=3e-4)
24 | parser.add_argument('--logdir', type=str, default='logs',
25 | help='tensorboardx logs directory')
26 | args = parser.parse_args()
27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28 |
29 | if __name__=="__main__":
30 | env = gym.make(args.env_name)
31 | env.seed(500)
32 | torch.manual_seed(500)
33 |
34 | state_size = env.observation_space.shape[0]
35 | action_size = env.action_space.shape[0]
36 | print('state size:', state_size)
37 | print('action size:', action_size)
38 |
39 | actor = Actor(state_size, action_size, args)
40 | # writer = SummaryWriter(args.logdir)
41 |
42 | if not os.path.isdir(args.save_path):
43 | os.makedirs(args.save_path)
44 |
45 | running_state = ZFilter((state_size,), clip=5)
46 | episodes = 0
47 |
48 | for iter in range(2000):
49 | memory = deque()
50 | scores = []
51 | steps = 0
52 |
53 | while steps < 2048:
54 | score = 0
55 | episodes += 1
56 |
57 | state = env.reset()
58 | state = running_state(state)
59 |
60 | for _ in range(10000):
61 | if args.render:
62 | env.render()
63 |
64 | steps += 1
65 |
66 | mu, std = actor(torch.Tensor(state).unsqueeze(0))
67 | action = get_action(mu, std)[0]
68 | next_state, reward, done, _ = env.step(action)
69 |
70 | if done:
71 | mask = 0
72 | else:
73 | mask = 1
74 |
75 | memory.append([state, action, reward, mask])
76 |
77 | next_state = running_state(next_state)
78 | state = next_state
79 | score += reward
80 |
81 | if done:
82 | break
83 |
84 | scores.append(score)
85 |
86 | score_avg = np.mean(scores)
87 | print('{} episode score is {:.2f}'.format(episodes, score_avg))
88 | # writer.add_scalar('log/score', float(score_avg), iter)
89 |
90 | actor.train()
91 | train_model(actor, memory, args)
92 |
93 |         if iter % 100 == 0:
94 | ckpt_path = args.save_path + str(score_avg) + 'model.pth'
95 | torch.save(actor.state_dict(), ckpt_path)
--------------------------------------------------------------------------------
/mujoco/tnpg/utils/__pycache__/running_state.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/utils/__pycache__/running_state.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/tnpg/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/tnpg/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/tnpg/utils/running_state.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 |
3 | import numpy as np
4 |
5 |
6 | # from https://github.com/joschu/modular_rl
7 | # http://www.johndcook.com/blog/standard_deviation/
8 | class RunningStat(object):
9 | def __init__(self, shape): # shape = (11,)
10 | self._n = 0
11 | self._M = np.zeros(shape)
12 | self._S = np.zeros(shape)
13 |
14 | def push(self, x):
15 | x = np.asarray(x)
16 | assert x.shape == self._M.shape
17 | self._n += 1
18 | if self._n == 1: # Only the first time
19 | self._M[...] = x
20 | else: # From the second time ~
21 | oldM = self._M.copy()
22 | self._M[...] = oldM + (x - oldM) / self._n
23 | self._S[...] = self._S + (x - oldM) * (x - self._M)
24 |
25 | @property
26 | def n(self):
27 | return self._n
28 |
29 | @n.setter
30 | def n(self, n):
31 | self._n = n
32 |
33 | @property
34 | def mean(self):
35 | return self._M
36 |
37 | @mean.setter
38 | def mean(self, M):
39 | self._M = M
40 |
41 | @property
42 | def sum_square(self):
43 | return self._S
44 |
45 | @sum_square.setter
46 | def sum_square(self, S):
47 | self._S = S
48 |
49 | @property
50 | def var(self):
51 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
52 |
53 | @property
54 | def std(self):
55 | return np.sqrt(self.var)
56 |
57 | @property
58 | def shape(self):
59 | return self._M.shape
60 |
61 |
62 | class ZFilter:
63 | """
64 | y = (x-mean)/std
65 | using running estimates of mean,std
66 | """
67 |
68 | def __init__(self, shape, demean=True, destd=True, clip=10.0): # shape = (11,), clip = 5
69 | self.demean = demean
70 | self.destd = destd
71 | self.clip = clip
72 |
73 | self.rs = RunningStat(shape)
74 |
75 | def __call__(self, x, update=True):
76 | if update: self.rs.push(x)
77 |
78 | if self.demean:
79 | x = x - self.rs.mean
80 |
81 | if self.destd:
82 | x = x / (self.rs.std + 1e-8)
83 |
84 | if self.clip:
85 | x = np.clip(x, -self.clip, self.clip)
86 |
87 | return x
88 |
--------------------------------------------------------------------------------
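
Note: RunningStat.push() above is Welford's online algorithm for the running mean and the sum of squared deviations. A quick standalone check (not from the repository) that the streaming statistics match the batch statistics:

import numpy as np

xs = np.random.randn(100, 11)                 # 100 observations of an 11-dim state
n, M, S = 0, np.zeros(11), np.zeros(11)
for x in xs:
    n += 1
    if n == 1:
        M[...] = x
    else:
        oldM = M.copy()
        M[...] = oldM + (x - oldM) / n
        S[...] = S + (x - oldM) * (x - M)

print(np.allclose(M, xs.mean(axis=0)))                    # True
print(np.allclose(S / (n - 1), xs.var(axis=0, ddof=1)))   # True
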
/mujoco/tnpg/utils/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 | def get_action(mu, std):
5 | action = torch.normal(mu, std)
6 | action = action.data.numpy()
7 | return action
8 |
9 | # probability density of a normal distribution; the log form is used
10 | # when building the policy ratio via the properties of the logarithm
11 | def log_prob_density(x, mu, std):
12 | log_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
13 | - 0.5 * math.log(2 * math.pi)
14 | return log_density.sum(1, keepdim=True)
15 |
16 |
17 | def hessian_vector_product(actor, states, p, cg_damping):
18 | p.detach()
19 | kl = kl_divergence(old_actor=actor, new_actor=actor, states=states)
20 | kl = kl.mean()
21 |
22 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
23 | kl_grad = flat_grad(kl_grad)
24 |
25 | kl_grad_p = (kl_grad * p).sum()
26 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters())
27 | kl_hessian = flat_hessian(kl_hessian)
28 |
29 | return kl_hessian + p * cg_damping # cg_damping = 0.1
30 |
31 | def kl_divergence(old_actor, new_actor, states):
32 | mu, std = new_actor(torch.Tensor(states))
33 | mu_old, std_old = old_actor(torch.Tensor(states))
34 | mu_old = mu_old.detach()
35 | std_old = std_old.detach()
36 |
37 | # kl divergence between old policy and new policy : D( pi_old || pi_new )
38 | # pi_old -> mu_old, std_old / pi_new -> mu, std
39 | # be careful of calculating KL-divergence. It is not symmetric metric.
40 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
41 | return kl.sum(1, keepdim=True)
42 |
43 | def flat_grad(grads):
44 | grad_flatten = []
45 | for grad in grads:
46 | grad_flatten.append(grad.view(-1))
47 | grad_flatten = torch.cat(grad_flatten)
48 | return grad_flatten
49 |
50 | def flat_hessian(hessians):
51 | hessians_flatten = []
52 | for hessian in hessians:
53 | hessians_flatten.append(hessian.contiguous().view(-1))
54 | hessians_flatten = torch.cat(hessians_flatten).data
55 | return hessians_flatten
56 |
57 |
58 | def flat_params(model):
59 | params = []
60 | for param in model.parameters():
61 | params.append(param.data.view(-1))
62 | params_flatten = torch.cat(params)
63 | return params_flatten
64 |
65 | def update_model(model, new_params):
66 | index = 0
67 | for params in model.parameters():
68 | params_length = len(params.view(-1))
69 | new_param = new_params[index: index + params_length]
70 | new_param = new_param.view(params.size())
71 | params.data.copy_(new_param)
72 | index += params_length
--------------------------------------------------------------------------------
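
Note: hessian_vector_product() above uses the double-backward trick: differentiate the KL once with create_graph=True, dot the gradient with p, then differentiate again. The Hessian itself is never materialized. A minimal standalone sketch (not from the repository) on a quadratic whose Hessian is known:

import torch

w = torch.tensor([1.0, 2.0], requires_grad=True)
A = torch.tensor([[3.0, 1.0], [1.0, 2.0]])    # symmetric, so the Hessian of f is A
p = torch.tensor([1.0, -1.0])

f = 0.5 * w @ A @ w
grad = torch.autograd.grad(f, w, create_graph=True)[0]   # first backward, keep the graph
hvp = torch.autograd.grad((grad * p).sum(), w)[0]        # second backward gives A @ p
print(torch.allclose(hvp, A @ p))                        # True
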
/mujoco/trpo/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/trpo/__pycache__/trpo.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/__pycache__/trpo.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/trpo/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 | self.fc3.weight.data.mul_(0.1)
11 | self.fc3.bias.data.mul_(0.0)
12 |
13 | def forward(self, x):
14 | x = torch.tanh(self.fc1(x))
15 | x = torch.tanh(self.fc2(x))
16 | mu = self.fc3(x)
17 | logstd = torch.zeros_like(mu)
18 | std = torch.exp(logstd)
19 | return mu, std
--------------------------------------------------------------------------------
/mujoco/trpo/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import torch
4 | import argparse
5 |
6 | from model import Actor, Critic
7 | from utils.utils import get_action
8 | from utils.running_state import ZFilter
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', type=str, default="Hopper-v2",
12 |                     help='name of Mujoco environment')
13 | parser.add_argument('--iter', type=int, default=5,
14 | help='number of episodes to play')
15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar',
16 | help="if you test pretrained file, write filename in save_model folder")
17 |
18 | args = parser.parse_args()
19 |
20 |
21 | if __name__ == "__main__":
22 | env = gym.make(args.env)
23 | env.seed(500)
24 | torch.manual_seed(500)
25 |
26 | num_inputs = env.observation_space.shape[0]
27 | num_actions = env.action_space.shape[0]
28 |
29 | print("state size: ", num_inputs)
30 | print("action size: ", num_actions)
31 |
32 | actor = Actor(num_inputs, num_actions)
33 | critic = Critic(num_inputs)
34 |
35 | running_state = ZFilter((num_inputs,), clip=5)
36 |
37 | if args.load_model is not None:
38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
39 |
40 | pretrained_model = torch.load(pretrained_model_path)
41 |
42 | actor.load_state_dict(pretrained_model['actor'])
43 | critic.load_state_dict(pretrained_model['critic'])
44 |
45 | running_state.rs.n = pretrained_model['z_filter_n']
46 | running_state.rs.mean = pretrained_model['z_filter_m']
47 | running_state.rs.sum_square = pretrained_model['z_filter_s']
48 |
49 |         print("Loaded model. ZFilter N: {}".format(running_state.rs.n))
50 | 
51 |     else:
52 |         raise ValueError("Specify a pretrained model filename in the save_model folder, e.g. python3 test.py --load_model ppo_max.tar")
53 |
54 |
55 | actor.eval(), critic.eval()
56 | for episode in range(args.iter):
57 | state = env.reset()
58 | steps = 0
59 | score = 0
60 | for _ in range(10000):
61 | env.render()
62 |             mu, std = actor(torch.Tensor(state).unsqueeze(0))
63 | action = get_action(mu, std)[0]
64 |
65 | next_state, reward, done, _ = env.step(action)
66 | next_state = running_state(next_state)
67 |
68 | state = next_state
69 | score += reward
70 |
71 | if done:
72 | print("{} cumulative reward: {}".format(episode, score))
73 | break
74 |
--------------------------------------------------------------------------------
/mujoco/trpo/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 | from collections import deque
6 |
7 | import torch
8 | import torch.optim as optim
9 | from tensorboardX import SummaryWriter
10 |
11 | from model import Actor
12 | from trpo import train_model
13 | from utils.utils import get_action
14 | from utils.running_state import ZFilter
15 |
16 | parser = argparse.ArgumentParser(description='PyTorch TRPO')
17 | parser.add_argument('--env_name', type=str, default="Hopper-v2")
18 | parser.add_argument('--load_model', type=str, default=None)
19 | parser.add_argument('--save_path', default='./save_model/', help='')
20 | parser.add_argument('--render', action="store_true", default=False)
21 | parser.add_argument('--gamma', type=float, default=0.99)
22 | parser.add_argument('--hidden_size', type=int, default=64)
23 | parser.add_argument('--max_kl', type=float, default=1e-2)
24 | parser.add_argument('--logdir', type=str, default='logs',
25 | help='tensorboardx logs directory')
26 | args = parser.parse_args()
27 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
28 |
29 | if __name__=="__main__":
30 | env = gym.make(args.env_name)
31 | env.seed(500)
32 | torch.manual_seed(500)
33 |
34 | state_size = env.observation_space.shape[0]
35 | action_size = env.action_space.shape[0]
36 | print('state size:', state_size)
37 | print('action size:', action_size)
38 |
39 | actor = Actor(state_size, action_size, args)
40 | # writer = SummaryWriter(args.logdir)
41 |
42 | if not os.path.isdir(args.save_path):
43 | os.makedirs(args.save_path)
44 |
45 | running_state = ZFilter((state_size,), clip=5)
46 | episodes = 0
47 |
48 | for iter in range(2000):
49 | memory = deque()
50 | scores = []
51 | steps = 0
52 |
53 | while steps < 2048:
54 | score = 0
55 | episodes += 1
56 |
57 | state = env.reset()
58 | state = running_state(state)
59 |
60 | for _ in range(10000):
61 | if args.render:
62 | env.render()
63 |
64 | steps += 1
65 |
66 | mu, std = actor(torch.Tensor(state).unsqueeze(0))
67 | action = get_action(mu, std)[0]
68 | next_state, reward, done, _ = env.step(action)
69 |
70 | if done:
71 | mask = 0
72 | else:
73 | mask = 1
74 |
75 | memory.append([state, action, reward, mask])
76 |
77 | next_state = running_state(next_state)
78 | state = next_state
79 | score += reward
80 |
81 | if done:
82 | break
83 |
84 | scores.append(score)
85 |
86 | score_avg = np.mean(scores)
87 | print('{} episode score is {:.2f}'.format(episodes, score_avg))
88 | # writer.add_scalar('log/score', float(score_avg), iter)
89 |
90 | actor.train()
91 | train_model(actor, memory, state_size, action_size, args)
92 |
93 | # if iter % 100:
94 | # ckpt_path = args.save_path + str(score_avg) + 'model.pth'
95 | # torch.save(actor.state_dict(), ckpt_path)
--------------------------------------------------------------------------------
/mujoco/trpo/trpo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from model import Actor
3 | from utils.utils import *
4 |
5 | def train_model(actor, memory, state_size, action_size, args):
6 | memory = np.array(memory)
7 | states = np.vstack(memory[:, 0])
8 | actions = list(memory[:, 1])
9 | rewards = list(memory[:, 2])
10 | masks = list(memory[:, 3])
11 |
12 | # ----------------------------
13 | # step 1: get returns
14 | returns = get_returns(rewards, masks, args.gamma)
15 |
16 | # ----------------------------
17 | # step 2: get gradient of loss and hessian of kl and step direction
18 | mu, std = actor(torch.Tensor(states))
19 | old_policy = log_prob_density(torch.Tensor(actions), mu, std)
20 | loss = surrogate_loss(actor, returns, states, old_policy.detach(), actions)
21 |
22 | loss_grad = torch.autograd.grad(loss, actor.parameters())
23 | loss_grad = flat_grad(loss_grad)
24 | loss = loss.data.numpy()
25 |
26 | step_dir = conjugate_gradient(actor, states, loss_grad.data, nsteps=10)
27 |
28 | # ----------------------------
29 | # step 3: get step-size alpha and maximal step
30 | sHs = 0.5 * (step_dir * hessian_vector_product(actor, states, step_dir)
31 | ).sum(0, keepdim=True)
32 | step_size = torch.sqrt(2 * args.max_kl / sHs)[0]
33 | maximal_step = step_size * step_dir
34 |
35 | # ----------------------------
36 | # step 4: perform backtracking line search for n iteration
37 | old_actor = Actor(state_size, action_size, args)
38 | params = flat_params(actor)
39 | update_model(old_actor, params)
40 |
41 |     # expected change in performance when moving by the computed maximal step in parameter space
42 | expected_improve = (loss_grad * maximal_step).sum(0, keepdim=True)
43 | expected_improve = expected_improve.data.numpy()
44 |
45 | # Backtracking line search
46 | # see cvx 464p https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf
47 | # additionally, https://en.wikipedia.org/wiki/Backtracking_line_search
48 | flag = False
49 | alpha = 0.5
50 | beta = 0.5
51 | t = 1.0
52 |
53 | for i in range(10):
54 | new_params = params + t * maximal_step
55 | update_model(actor, new_params)
56 |
57 | new_loss = surrogate_loss(actor, returns, states, old_policy.detach(), actions)
58 | new_loss = new_loss.data.numpy()
59 |
60 | loss_improve = new_loss - loss
61 | expected_improve *= t
62 | improve_condition = loss_improve / expected_improve
63 |
64 | kl = kl_divergence(old_actor=old_actor, new_actor=actor, states=states)
65 | kl = kl.mean()
66 |
67 | print('kl: {:.4f} | loss_improve: {:.4f} | expected_improve: {:.4f} '
68 | '| improve_condition: {:.4f} | number of line search: {}'
69 | .format(kl.data.numpy(), loss_improve, expected_improve[0], improve_condition, i))
70 |
71 |         # use the KL-divergence and the expected improvement to judge whether the
72 |         # update is inside or outside the trust region; if inside, exit the loop
73 |         # max_kl = 0.01
74 | if kl < args.max_kl and improve_condition > alpha:
75 | flag = True
76 | break
77 |
78 |         # if outside the trust region, halve maximal_step and try again
79 | t *= beta
80 |
81 | if not flag:
82 | params = flat_params(old_actor)
83 | update_model(actor, params)
84 |         print('policy update does not improve the surrogate')
85 |
86 | def get_returns(rewards, masks, gamma):
87 | rewards = torch.Tensor(rewards)
88 | masks = torch.Tensor(masks)
89 | returns = torch.zeros_like(rewards)
90 |
91 | running_returns = 0
92 |
93 | for t in reversed(range(0, len(rewards))):
94 | running_returns = rewards[t] + gamma * running_returns * masks[t]
95 | returns[t] = running_returns
96 |
97 | returns = (returns - returns.mean()) / returns.std()
98 | return returns
99 |
100 | def surrogate_loss(actor, returns, states, old_policy, actions):
101 | mu, std = actor(torch.Tensor(states))
102 | new_policy = log_prob_density(torch.Tensor(actions), mu, std)
103 | returns = returns.unsqueeze(1)
104 |
105 | surrogate = torch.exp(new_policy - old_policy) * returns
106 | surrogate = surrogate.mean()
107 | return surrogate
108 |
109 | # from openai baseline code
110 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py
111 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
112 | x = torch.zeros(b.size())
113 | r = b.clone()
114 | p = b.clone()
115 | rdotr = torch.dot(r, r)
116 | for i in range(nsteps):
117 | _Avp = hessian_vector_product(actor, states, p)
118 | alpha = rdotr / torch.dot(p, _Avp)
119 | x += alpha * p
120 | r -= alpha * _Avp
121 | new_rdotr = torch.dot(r, r)
122 | betta = new_rdotr / rdotr
123 | p = r + betta * p
124 | rdotr = new_rdotr
125 | if rdotr < residual_tol:
126 | break
127 | return x
--------------------------------------------------------------------------------
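
Note: step 4 above accepts the candidate update only if the KL constraint holds and the actual improvement is at least alpha times the linearized (expected) improvement; otherwise it halves the step and retries. A minimal standalone sketch of that acceptance rule on a toy 1-D objective (not from the repository; the KL check is omitted for brevity):

def f(x):
    return -(x - 2.0) ** 2            # objective to maximize, optimum at x = 2

x, grad = 0.0, 4.0                    # gradient of f at x = 0
maximal_step, alpha, beta, t = 3.0, 0.5, 0.5, 1.0

for _ in range(10):
    new_x = x + t * maximal_step
    improve = f(new_x) - f(x)                 # actual improvement
    expected = grad * (t * maximal_step)      # linearized improvement
    if improve / expected > alpha:            # accept once the ratio exceeds alpha
        break
    t *= beta                                 # otherwise halve the step
print(new_x, improve / expected)              # 1.5 0.625 (accepted on the second try)
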
/mujoco/trpo/utils/__pycache__/running_state.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/utils/__pycache__/running_state.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/trpo/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/trpo/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/trpo/utils/running_state.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 |
3 | import numpy as np
4 |
5 |
6 | # from https://github.com/joschu/modular_rl
7 | # http://www.johndcook.com/blog/standard_deviation/
8 | class RunningStat(object):
9 | def __init__(self, shape): # shape = (11,)
10 | self._n = 0
11 | self._M = np.zeros(shape)
12 | self._S = np.zeros(shape)
13 |
14 | def push(self, x):
15 | x = np.asarray(x)
16 | assert x.shape == self._M.shape
17 | self._n += 1
18 | if self._n == 1: # Only the first time
19 | self._M[...] = x
20 | else: # From the second time ~
21 | oldM = self._M.copy()
22 | self._M[...] = oldM + (x - oldM) / self._n
23 | self._S[...] = self._S + (x - oldM) * (x - self._M)
24 |
25 | @property
26 | def n(self):
27 | return self._n
28 |
29 | @n.setter
30 | def n(self, n):
31 | self._n = n
32 |
33 | @property
34 | def mean(self):
35 | return self._M
36 |
37 | @mean.setter
38 | def mean(self, M):
39 | self._M = M
40 |
41 | @property
42 | def sum_square(self):
43 | return self._S
44 |
45 | @sum_square.setter
46 | def sum_square(self, S):
47 | self._S = S
48 |
49 | @property
50 | def var(self):
51 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
52 |
53 | @property
54 | def std(self):
55 | return np.sqrt(self.var)
56 |
57 | @property
58 | def shape(self):
59 | return self._M.shape
60 |
61 |
62 | class ZFilter:
63 | """
64 | y = (x-mean)/std
65 | using running estimates of mean,std
66 | """
67 |
68 | def __init__(self, shape, demean=True, destd=True, clip=10.0): # shape = (11,), clip = 5
69 | self.demean = demean
70 | self.destd = destd
71 | self.clip = clip
72 |
73 | self.rs = RunningStat(shape)
74 |
75 | def __call__(self, x, update=True):
76 | if update: self.rs.push(x)
77 |
78 | if self.demean:
79 | x = x - self.rs.mean
80 |
81 | if self.destd:
82 | x = x / (self.rs.std + 1e-8)
83 |
84 | if self.clip:
85 | x = np.clip(x, -self.clip, self.clip)
86 |
87 | return x
88 |
--------------------------------------------------------------------------------
/mujoco/trpo/utils/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 |
4 | def get_action(mu, std):
5 | action = torch.normal(mu, std)
6 | action = action.data.numpy()
7 | return action
8 |
9 | # probability density of a normal distribution; the log form is used
10 | # when building the policy ratio via the properties of the logarithm
11 | def log_prob_density(x, mu, std):
12 | log_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
13 | - 0.5 * math.log(2 * math.pi)
14 | return log_density.sum(1, keepdim=True)
15 |
16 |
17 | def hessian_vector_product(actor, states, p):
18 | p.detach()
19 | kl = kl_divergence(old_actor=actor, new_actor=actor, states=states)
20 | kl = kl.mean()
21 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
22 | kl_grad = flat_grad(kl_grad) # check kl_grad == 0
23 |
24 | kl_grad_p = (kl_grad * p).sum()
25 | kl_hessian_p = torch.autograd.grad(kl_grad_p, actor.parameters())
26 | kl_hessian_p = flat_hessian(kl_hessian_p)
27 |
28 | return kl_hessian_p + 0.1 * p
29 |
30 | def kl_divergence(old_actor, new_actor, states):
31 | mu, std = new_actor(torch.Tensor(states))
32 | mu_old, std_old = old_actor(torch.Tensor(states))
33 | mu_old = mu_old.detach()
34 | std_old = std_old.detach()
35 |
36 | # kl divergence between old policy and new policy : D( pi_old || pi_new )
37 | # pi_old -> mu_old, std_old / pi_new -> mu, std
38 | # be careful of calculating KL-divergence. It is not symmetric metric
39 | kl = (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
40 | return kl.sum(1, keepdim=True)
41 |
42 |
43 | def flat_grad(grads):
44 | grad_flatten = []
45 | for grad in grads:
46 | grad_flatten.append(grad.view(-1))
47 | grad_flatten = torch.cat(grad_flatten)
48 | return grad_flatten
49 |
50 | def flat_hessian(hessians):
51 | hessians_flatten = []
52 | for hessian in hessians:
53 | hessians_flatten.append(hessian.contiguous().view(-1))
54 | hessians_flatten = torch.cat(hessians_flatten).data
55 | return hessians_flatten
56 |
57 |
58 | def flat_params(model):
59 | params = []
60 | for param in model.parameters():
61 | params.append(param.data.view(-1))
62 | params_flatten = torch.cat(params)
63 | return params_flatten
64 |
65 | def update_model(model, new_params):
66 | index = 0
67 | for params in model.parameters():
68 | params_length = len(params.view(-1))
69 | new_param = new_params[index: index + params_length]
70 | new_param = new_param.view(params.size())
71 | params.data.copy_(new_param)
72 | index += params_length
73 |
74 |
75 |
76 |
77 |
--------------------------------------------------------------------------------
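
Note: kl_divergence() above omits the log(std / std_old) term of the full Gaussian KL; with the fixed zero log-std policy in model.py both standard deviations are always 1, so that term vanishes and the expression agrees with the closed form. A quick standalone check (not from the repository):

import torch
from torch.distributions import Normal, kl_divergence as torch_kl

mu_old = torch.randn(5, 3)
mu = torch.randn(5, 3)
std = torch.ones(5, 3)

kl = ((std.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5).sum(1)
reference = torch_kl(Normal(mu_old, std), Normal(mu, std)).sum(1)   # D( pi_old || pi_new )
print(torch.allclose(kl, reference, atol=1e-5))                     # True
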
/mujoco/vail/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/.DS_Store
--------------------------------------------------------------------------------
/mujoco/vail/__pycache__/hparams.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/hparams.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/__pycache__/train_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/__pycache__/train_model.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/expert_demo/expert_demo.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/expert_demo/expert_demo.p
--------------------------------------------------------------------------------
/mujoco/vail/logs/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/logs/.DS_Store
--------------------------------------------------------------------------------
/mujoco/vail/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, num_inputs, num_outputs, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, num_outputs)
10 |
11 | self.fc3.weight.data.mul_(0.1)
12 | self.fc3.bias.data.mul_(0.0)
13 |
14 | def forward(self, x):
15 | x = torch.tanh(self.fc1(x))
16 | x = torch.tanh(self.fc2(x))
17 | mu = self.fc3(x)
18 | logstd = torch.zeros_like(mu)
19 | std = torch.exp(logstd)
20 | return mu, std
21 |
22 |
23 | class Critic(nn.Module):
24 | def __init__(self, num_inputs, args):
25 | super(Critic, self).__init__()
26 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
27 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
28 | self.fc3 = nn.Linear(args.hidden_size, 1)
29 |
30 | self.fc3.weight.data.mul_(0.1)
31 | self.fc3.bias.data.mul_(0.0)
32 |
33 | def forward(self, x):
34 | x = torch.tanh(self.fc1(x))
35 | x = torch.tanh(self.fc2(x))
36 | v = self.fc3(x)
37 | return v
38 |
39 |
40 | class VDB(nn.Module):
41 | def __init__(self, num_inputs, args):
42 | super(VDB, self).__init__()
43 | self.fc1 = nn.Linear(num_inputs, args.hidden_size)
44 | self.fc2 = nn.Linear(args.hidden_size, args.z_size)
45 | self.fc3 = nn.Linear(args.hidden_size, args.z_size)
46 | self.fc4 = nn.Linear(args.z_size, args.hidden_size)
47 | self.fc5 = nn.Linear(args.hidden_size, 1)
48 |
49 | self.fc5.weight.data.mul_(0.1)
50 | self.fc5.bias.data.mul_(0.0)
51 |
52 | def encoder(self, x):
53 | h = torch.tanh(self.fc1(x))
54 | return self.fc2(h), self.fc3(h)
55 |
56 | def reparameterize(self, mu, logvar):
57 | std = torch.exp(logvar/2)
58 | eps = torch.randn_like(std)
59 | return mu + std * eps
60 |
61 | def discriminator(self, z):
62 | h = torch.tanh(self.fc4(z))
63 | return torch.sigmoid(self.fc5(h))
64 |
65 | def forward(self, x):
66 | mu, logvar = self.encoder(x)
67 | z = self.reparameterize(mu, logvar)
68 | prob = self.discriminator(z)
69 | return prob, mu, logvar
--------------------------------------------------------------------------------
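
Note: VDB.reparameterize() above uses the reparameterization trick, z = mu + std * eps with eps ~ N(0, I), so gradients flow through the stochastic latent back into the encoder outputs. A minimal standalone sketch (not from the repository):

import torch

mu = torch.zeros(4, requires_grad=True)
logvar = torch.zeros(4, requires_grad=True)
eps = torch.randn(4)

z = mu + torch.exp(logvar / 2) * eps   # same sampling as VDB.reparameterize()
z.sum().backward()
print(mu.grad)        # ones: the gradient passes straight through the sample
print(logvar.grad)    # 0.5 * std * eps elementwise
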
/mujoco/vail/save_model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/save_model/.DS_Store
--------------------------------------------------------------------------------
/mujoco/vail/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import torch
4 | import argparse
5 |
6 | from model import Actor, Critic
7 | from utils.utils import get_action
8 | from utils.running_state import ZFilter
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', type=str, default="Hopper-v2",
12 |                     help='name of Mujoco environment')
13 | parser.add_argument('--iter', type=int, default=5,
14 | help='number of episodes to play')
15 | parser.add_argument("--load_model", type=str, default='ppo_max.tar',
16 | help="if you test pretrained file, write filename in save_model folder")
17 |
18 | args = parser.parse_args()
19 |
20 |
21 | if __name__ == "__main__":
22 | env = gym.make(args.env)
23 | env.seed(500)
24 | torch.manual_seed(500)
25 |
26 | num_inputs = env.observation_space.shape[0]
27 | num_actions = env.action_space.shape[0]
28 |
29 | print("state size: ", num_inputs)
30 | print("action size: ", num_actions)
31 |
32 | actor = Actor(num_inputs, num_actions)
33 | critic = Critic(num_inputs)
34 |
35 | running_state = ZFilter((num_inputs,), clip=5)
36 |
37 | if args.load_model is not None:
38 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
39 |
40 | pretrained_model = torch.load(pretrained_model_path)
41 |
42 | actor.load_state_dict(pretrained_model['actor'])
43 | critic.load_state_dict(pretrained_model['critic'])
44 |
45 | running_state.rs.n = pretrained_model['z_filter_n']
46 | running_state.rs.mean = pretrained_model['z_filter_m']
47 | running_state.rs.sum_square = pretrained_model['z_filter_s']
48 |
49 |         print("Loaded model. ZFilter N: {}".format(running_state.rs.n))
50 | 
51 |     else:
52 |         raise ValueError("Specify a pretrained model filename in the save_model folder, e.g. python3 test.py --load_model ppo_max.tar")
53 |
54 |
55 | actor.eval(), critic.eval()
56 | for episode in range(args.iter):
57 | state = env.reset()
58 | steps = 0
59 | score = 0
60 | for _ in range(10000):
61 | env.render()
62 |             mu, std = actor(torch.Tensor(state).unsqueeze(0))
63 | action = get_action(mu, std)[0]
64 |
65 | next_state, reward, done, _ = env.step(action)
66 | next_state = running_state(next_state)
67 |
68 | state = next_state
69 | score += reward
70 |
71 | if done:
72 | print("{} cumulative reward: {}".format(episode, score))
73 | break
74 |
--------------------------------------------------------------------------------
/mujoco/vail/train_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from utils.utils import *
4 |
5 | def train_vdb(vdb, memory, vdb_optim, demonstrations, beta, args):
6 | memory = np.array(memory)
7 | states = np.vstack(memory[:, 0])
8 | actions = list(memory[:, 1])
9 |
10 | states = torch.Tensor(states)
11 | actions = torch.Tensor(actions)
12 |
13 | criterion = torch.nn.BCELoss()
14 |
15 | for _ in range(args.vdb_update_num):
16 | learner, l_mu, l_logvar = vdb(torch.cat([states, actions], dim=1))
17 | expert, e_mu, e_logvar = vdb(torch.Tensor(demonstrations))
18 |
19 | l_kld = kl_divergence(l_mu, l_logvar)
20 | l_kld = l_kld.mean()
21 |
22 | e_kld = kl_divergence(e_mu, e_logvar)
23 | e_kld = e_kld.mean()
24 |
25 | kld = 0.5 * (l_kld + e_kld)
26 | bottleneck_loss = kld - args.i_c
27 |
28 | beta = max(0, beta + args.alpha_beta * bottleneck_loss)
29 |
30 | vdb_loss = criterion(learner, torch.ones((states.shape[0], 1))) + \
31 | criterion(expert, torch.zeros((demonstrations.shape[0], 1))) + \
32 | beta * bottleneck_loss
33 |
34 | vdb_optim.zero_grad()
35 | vdb_loss.backward(retain_graph=True)
36 | vdb_optim.step()
37 |
38 |
39 | def train_ppo(actor, critic, memory, actor_optim, critic_optim, args):
40 | memory = np.array(memory)
41 | states = np.vstack(memory[:, 0])
42 | actions = list(memory[:, 1])
43 | rewards = list(memory[:, 2])
44 | masks = list(memory[:, 3])
45 |
46 | old_values = critic(torch.Tensor(states))
47 | returns, advants = get_gae(rewards, masks, old_values, args)
48 |
49 | mu, std = actor(torch.Tensor(states))
50 | old_policy = log_prob_density(torch.Tensor(actions), mu, std)
51 |
52 | criterion = torch.nn.MSELoss()
53 | n = len(states)
54 | arr = np.arange(n)
55 |
56 | for _ in range(args.ppo_update_num):
57 | np.random.shuffle(arr)
58 |
59 | for i in range(n // args.batch_size):
60 | batch_index = arr[args.batch_size * i : args.batch_size * (i + 1)]
61 | batch_index = torch.LongTensor(batch_index)
62 |
63 | inputs = torch.Tensor(states)[batch_index]
64 | actions_samples = torch.Tensor(actions)[batch_index]
65 | returns_samples = returns.unsqueeze(1)[batch_index]
66 | advants_samples = advants.unsqueeze(1)[batch_index]
67 | oldvalue_samples = old_values[batch_index].detach()
68 |
69 | values = critic(inputs)
70 | clipped_values = oldvalue_samples + \
71 | torch.clamp(values - oldvalue_samples,
72 | -args.clip_param,
73 | args.clip_param)
74 | critic_loss1 = criterion(clipped_values, returns_samples)
75 | critic_loss2 = criterion(values, returns_samples)
76 | critic_loss = torch.max(critic_loss1, critic_loss2).mean()
77 |
78 | loss, ratio, entropy = surrogate_loss(actor, advants_samples, inputs,
79 | old_policy.detach(), actions_samples,
80 | batch_index)
81 | clipped_ratio = torch.clamp(ratio,
82 | 1.0 - args.clip_param,
83 | 1.0 + args.clip_param)
84 | clipped_loss = clipped_ratio * advants_samples
85 | actor_loss = -torch.min(loss, clipped_loss).mean()
86 |
87 | loss = actor_loss + 0.5 * critic_loss - 0.001 * entropy
88 |
89 | critic_optim.zero_grad()
90 | loss.backward(retain_graph=True)
91 | critic_optim.step()
92 |
93 | actor_optim.zero_grad()
94 | loss.backward()
95 | actor_optim.step()
96 |
97 | def get_gae(rewards, masks, values, args):
98 | rewards = torch.Tensor(rewards)
99 | masks = torch.Tensor(masks)
100 | returns = torch.zeros_like(rewards)
101 | advants = torch.zeros_like(rewards)
102 |
103 | running_returns = 0
104 | previous_value = 0
105 | running_advants = 0
106 |
107 | for t in reversed(range(0, len(rewards))):
108 | running_returns = rewards[t] + (args.gamma * running_returns * masks[t])
109 | returns[t] = running_returns
110 |
111 | running_delta = rewards[t] + (args.gamma * previous_value * masks[t]) - \
112 | values.data[t]
113 | previous_value = values.data[t]
114 |
115 | running_advants = running_delta + (args.gamma * args.lamda * \
116 | running_advants * masks[t])
117 | advants[t] = running_advants
118 |
119 | advants = (advants - advants.mean()) / advants.std()
120 | return returns, advants
121 |
122 | def surrogate_loss(actor, advants, states, old_policy, actions, batch_index):
123 | mu, std = actor(states)
124 | new_policy = log_prob_density(actions, mu, std)
125 | old_policy = old_policy[batch_index]
126 |
127 | ratio = torch.exp(new_policy - old_policy)
128 | surrogate_loss = ratio * advants
129 | entropy = get_entropy(mu, std)
130 |
131 | return surrogate_loss, ratio, entropy
--------------------------------------------------------------------------------
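
Note: get_gae() above computes generalized advantage estimation by running the TD-error recursion backwards through the rollout. A minimal standalone sketch (not from the repository) on a 3-step toy episode, with gamma = lamda = 1 so the advantage reduces to the return-to-go minus the value estimate:

import torch

rewards = torch.tensor([1.0, 1.0, 1.0])
masks = torch.tensor([1.0, 1.0, 0.0])        # the episode terminates at the last step
values = torch.tensor([0.5, 0.5, 0.5])
gamma = lamda = 1.0

advants = torch.zeros(3)
running_advants, previous_value = 0.0, 0.0
for t in reversed(range(3)):
    delta = rewards[t] + gamma * previous_value * masks[t] - values[t]
    previous_value = values[t]
    running_advants = delta + gamma * lamda * running_advants * masks[t]
    advants[t] = running_advants

print(advants)   # tensor([2.5000, 1.5000, 0.5000]) = returns-to-go minus the value 0.5
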
/mujoco/vail/utils/__pycache__/running_state.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/running_state.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/utils/__pycache__/zfilter.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/mujoco/vail/utils/__pycache__/zfilter.cpython-36.pyc
--------------------------------------------------------------------------------
/mujoco/vail/utils/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | action = torch.normal(mu, std)
7 | action = action.data.numpy()
8 | return action
9 |
10 | def get_entropy(mu, std):
11 | dist = Normal(mu, std)
12 | entropy = dist.entropy().mean()
13 | return entropy
14 |
15 | def log_prob_density(x, mu, std):
16 | log_prob_density = -(x - mu).pow(2) / (2 * std.pow(2)) \
17 | - 0.5 * math.log(2 * math.pi)
18 | return log_prob_density.sum(1, keepdim=True)
19 |
20 | def get_reward(vdb, state, action):
21 | state = torch.Tensor(state)
22 | action = torch.Tensor(action)
23 | state_action = torch.cat([state, action])
24 | with torch.no_grad():
25 | return -math.log(vdb(state_action)[0].item())
26 |
27 | def kl_divergence(mu, logvar):
28 | kl_div = 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1)
29 | return kl_div
30 |
31 | def save_checkpoint(state, filename):
32 | torch.save(state, filename)
--------------------------------------------------------------------------------
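
Note: kl_divergence() above is the closed-form KL between the encoder's diagonal Gaussian N(mu, diag(exp(logvar))) and the standard normal prior, used for the VDB bottleneck loss. A quick standalone check against torch.distributions (not from the repository):

import torch
from torch.distributions import Normal, kl_divergence as torch_kl

mu = torch.randn(5, 3)
logvar = torch.randn(5, 3)

closed_form = 0.5 * torch.sum(mu.pow(2) + logvar.exp() - logvar - 1, dim=1)
reference = torch_kl(Normal(mu, (logvar / 2).exp()), Normal(0.0, 1.0)).sum(dim=1)
print(torch.allclose(closed_form, reference, atol=1e-5))   # True
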
/mujoco/vail/utils/zfilter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # from https://github.com/joschu/modular_rl
4 | # http://www.johndcook.com/blog/standard_deviation/
5 |
6 | class RunningStat(object):
7 | def __init__(self, shape):
8 | self._n = 0
9 | self._M = np.zeros(shape)
10 | self._S = np.zeros(shape)
11 |
12 | def push(self, x):
13 | x = np.asarray(x)
14 | assert x.shape == self._M.shape
15 | self._n += 1
16 | if self._n == 1:
17 | self._M[...] = x
18 | else:
19 | oldM = self._M.copy()
20 | self._M[...] = oldM + (x - oldM) / self._n
21 | self._S[...] = self._S + (x - oldM) * (x - self._M)
22 |
23 | @property
24 | def n(self):
25 | return self._n
26 |
27 | @n.setter
28 | def n(self, n):
29 | self._n = n
30 |
31 | @property
32 | def mean(self):
33 | return self._M
34 |
35 | @mean.setter
36 | def mean(self, M):
37 | self._M = M
38 |
39 | @property
40 | def sum_square(self):
41 | return self._S
42 |
43 | @sum_square.setter
44 | def sum_square(self, S):
45 | self._S = S
46 |
47 | @property
48 | def var(self):
49 | return self._S / (self._n - 1) if self._n > 1 else np.square(self._M)
50 |
51 | @property
52 | def std(self):
53 | return np.sqrt(self.var)
54 |
55 | @property
56 | def shape(self):
57 | return self._M.shape
58 |
59 |
60 | class ZFilter:
61 | """
62 | y = (x-mean)/std
63 | using running estimates of mean,std
64 | """
65 |
66 | def __init__(self, shape, demean=True, destd=True, clip=10.0):
67 | self.demean = demean
68 | self.destd = destd
69 | self.clip = clip
70 |
71 | self.rs = RunningStat(shape)
72 |
73 | def __call__(self, x, update=True):
74 | if update: self.rs.push(x)
75 |
76 | if self.demean:
77 | x = x - self.rs.mean
78 |
79 | if self.destd:
80 | x = x / (self.rs.std + 1e-8)
81 |
82 | if self.clip:
83 | x = np.clip(x, -self.clip, self.clip)
84 |
85 | return x
86 |
--------------------------------------------------------------------------------
/pendulum/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/.DS_Store
--------------------------------------------------------------------------------
/pendulum/ddpg/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/.DS_Store
--------------------------------------------------------------------------------
/pendulum/ddpg/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ddpg/__pycache__/model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/model.cpython-37.pyc
--------------------------------------------------------------------------------
/pendulum/ddpg/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ddpg/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/pendulum/ddpg/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.relu(self.fc1(x))
13 | x = torch.relu(self.fc2(x))
14 | policy = self.fc3(x)
15 |
16 | return policy
17 |
18 | class Critic(nn.Module):
19 | def __init__(self, state_size, action_size, args):
20 | super(Critic, self).__init__()
21 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size)
22 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
23 | self.fc3 = nn.Linear(args.hidden_size, 1)
24 |
25 | def forward(self, states, actions):
26 | x = torch.cat([states, actions], dim=1)
27 | x = torch.relu(self.fc1(x))
28 | x = torch.relu(self.fc2(x))
29 | q_value = self.fc3(x)
30 |
31 | return q_value
--------------------------------------------------------------------------------
/pendulum/ddpg/save_model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/save_model/.DS_Store
--------------------------------------------------------------------------------
/pendulum/ddpg/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ddpg/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/ddpg/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from utils import *
9 | from model import Actor, Critic
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
13 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
14 | parser.add_argument('--render', action="store_true", default=True)
15 | parser.add_argument('--hidden_size', type=int, default=64)
16 | parser.add_argument('--theta', type=float, default=0.15)
17 | parser.add_argument('--mu', type=float, default=0.0)
18 | parser.add_argument('--sigma', type=float, default=0.2)
19 | parser.add_argument('--iter', type=int, default=10000)
20 | parser.add_argument('--log_interval', type=int, default=10)
21 | args = parser.parse_args()
22 |
23 | if __name__=="__main__":
24 | env = gym.make(args.env_name)
25 | env.seed(500)
26 | torch.manual_seed(500)
27 |
28 | state_size = env.observation_space.shape[0]
29 | action_size = env.action_space.shape[0]
30 | print('state size:', state_size)
31 | print('action size:', action_size)
32 |
33 | actor = Actor(state_size, action_size, args)
34 |
35 | if args.load_model is not None:
36 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
37 | pretrained_model = torch.load(pretrained_model_path)
38 | actor.load_state_dict(pretrained_model)
39 |
40 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma)
41 | steps = 0
42 |
43 | for episode in range(args.iter):
44 | done = False
45 | score = 0
46 |
47 | state = env.reset()
48 | state = np.reshape(state, [1, state_size])
49 |
50 | while not done:
51 | if args.render:
52 | env.render()
53 |
54 | steps += 1
55 |
56 | policy = actor(torch.Tensor(state))
57 | action = get_action(policy, ou_noise)
58 |
59 | next_state, reward, done, _ = env.step(action)
60 |
61 | next_state = np.reshape(next_state, [1, state_size])
62 | state = next_state
63 | score += reward
64 |
65 | if episode % args.log_interval == 0:
66 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/ddpg/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 | from collections import deque
7 |
8 | import torch
9 | import torch.optim as optim
10 |
11 | from utils import *
12 | from model import Actor, Critic
13 | from tensorboardX import SummaryWriter
14 |
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
17 | parser.add_argument('--load_model', type=str, default=None)
18 | parser.add_argument('--save_path', default='./save_model/', help='')
19 | parser.add_argument('--render', action="store_true", default=False)
20 | parser.add_argument('--gamma', type=float, default=0.99)
21 | parser.add_argument('--hidden_size', type=int, default=64)
22 | parser.add_argument('--batch_size', type=int, default=64)
23 | parser.add_argument('--actor_lr', type=float, default=1e-3)
24 | parser.add_argument('--critic_lr', type=float, default=1e-3)
25 | parser.add_argument('--theta', type=float, default=0.15)
26 | parser.add_argument('--mu', type=float, default=0.0)
27 | parser.add_argument('--sigma', type=float, default=0.2)
28 | parser.add_argument('--tau', type=float, default=0.001)
29 | parser.add_argument('--max_iter_num', type=int, default=1000)
30 | parser.add_argument('--log_interval', type=int, default=10)
31 | parser.add_argument('--goal_score', type=int, default=-300)
32 | parser.add_argument('--logdir', type=str, default='./logs',
33 | help='tensorboardx logs directory')
34 | args = parser.parse_args()
35 |
36 | def train_model(actor, critic, target_actor, target_critic,
37 | actor_optimizer, critic_optimizer, mini_batch):
38 | mini_batch = np.array(mini_batch)
39 | states = np.vstack(mini_batch[:, 0])
40 | actions = list(mini_batch[:, 1])
41 | rewards = list(mini_batch[:, 2])
42 | next_states = np.vstack(mini_batch[:, 3])
43 | masks = list(mini_batch[:, 4])
44 |
45 | actions = torch.Tensor(actions).squeeze(1)
46 | rewards = torch.Tensor(rewards).squeeze(1)
47 | masks = torch.Tensor(masks)
48 |
49 | # update critic
50 | criterion = torch.nn.MSELoss()
51 |
52 | # get Q-value
53 | q_value = critic(torch.Tensor(states), actions).squeeze(1)
54 |
55 | # get target
56 | target_next_policy = target_actor(torch.Tensor(next_states))
57 | target_next_q_value = target_critic(torch.Tensor(next_states), target_next_policy).squeeze(1)
58 | target = rewards + masks * args.gamma * target_next_q_value
59 |
60 | critic_loss = criterion(q_value, target.detach())
61 | critic_optimizer.zero_grad()
62 | critic_loss.backward()
63 | critic_optimizer.step()
64 |
65 | # update actor
66 | policy = actor(torch.Tensor(states))
67 |
68 | actor_loss = -critic(torch.Tensor(states), policy).mean()
69 | actor_optimizer.zero_grad()
70 | actor_loss.backward()
71 | actor_optimizer.step()
72 |
73 |
74 | def main():
75 | env = gym.make(args.env_name)
76 | env.seed(500)
77 | torch.manual_seed(500)
78 |
79 | state_size = env.observation_space.shape[0]
80 | action_size = env.action_space.shape[0]
81 | print('state size:', state_size)
82 | print('action size:', action_size)
83 |
84 | actor = Actor(state_size, action_size, args)
85 | target_actor = Actor(state_size, action_size, args)
86 | critic = Critic(state_size, action_size, args)
87 | target_critic = Critic(state_size, action_size, args)
88 |
89 | actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
90 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)
91 |
92 | hard_target_update(actor, critic, target_actor, target_critic)
93 | ou_noise = OUNoise(action_size, args.theta, args.mu, args.sigma)
94 |
95 | writer = SummaryWriter(args.logdir)
96 |
97 | replay_buffer = deque(maxlen=10000)
98 | recent_rewards = deque(maxlen=100)
99 | steps = 0
100 |
101 | for episode in range(args.max_iter_num):
102 | done = False
103 | score = 0
104 |
105 | state = env.reset()
106 | state = np.reshape(state, [1, state_size])
107 |
108 | while not done:
109 | if args.render:
110 | env.render()
111 |
112 | steps += 1
113 |
114 | policy = actor(torch.Tensor(state))
115 | action = get_action(policy, ou_noise)
116 |
117 | next_state, reward, done, _ = env.step(action)
118 |
119 | next_state = np.reshape(next_state, [1, state_size])
120 | mask = 0 if done else 1
121 |
122 | replay_buffer.append((state, action, reward, next_state, mask))
123 |
124 | state = next_state
125 | score += reward
126 |
127 | if steps > args.batch_size:
128 | mini_batch = random.sample(replay_buffer, args.batch_size)
129 |
130 | actor.train(), critic.train()
131 | target_actor.train(), target_critic.train()
132 | train_model(actor, critic, target_actor, target_critic,
133 | actor_optimizer, critic_optimizer, mini_batch)
134 |
135 | soft_target_update(actor, critic, target_actor, target_critic, args.tau)
136 |
137 | if done:
138 | recent_rewards.append(score)
139 |
140 | if episode % args.log_interval == 0:
141 | print('{} episode | score_avg: {:.2f}'.format(episode, np.mean(recent_rewards)))
142 | writer.add_scalar('log/score', float(score), episode)
143 |
144 | if np.mean(recent_rewards) > args.goal_score:
145 | if not os.path.isdir(args.save_path):
146 | os.makedirs(args.save_path)
147 |
148 | ckpt_path = args.save_path + 'model.pth.tar'
149 | torch.save(actor.state_dict(), ckpt_path)
150 | print('Recent rewards exceed -300. So end')
151 | break
152 |
153 | if __name__ == '__main__':
154 | main()
--------------------------------------------------------------------------------
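
Note: train_model() above bootstraps the critic target as r + mask * gamma * Q_target(s', pi_target(s')), so terminal transitions (mask = 0) contribute only the immediate reward. A minimal standalone sketch of that arithmetic (not from the repository):

import torch

rewards = torch.tensor([-1.0, -2.0])
masks = torch.tensor([1.0, 0.0])              # the second transition is terminal
gamma = 0.99
target_next_q = torch.tensor([-10.0, -10.0])  # stands in for Q_target(s', pi_target(s'))

target = rewards + masks * gamma * target_next_q
print(target)   # [-10.9, -2.0]: no bootstrap term for the terminal transition
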
/pendulum/ddpg/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 |
4 | class OUNoise:
5 | def __init__(self, action_size, theta, mu, sigma):
6 | self.action_size = action_size
7 | self.theta = theta
8 | self.mu = mu
9 | self.sigma = sigma
10 | self.X = np.zeros(self.action_size)
11 |
12 | def sample(self):
13 | dx = self.theta * (self.mu - self.X)
14 | dx = dx + self.sigma * np.random.randn(len(self.X))
15 | self.X = self.X + dx
16 |
17 | return self.X
18 |
19 | def get_action(policy, ou_noise):
20 | action = policy.detach().numpy() + ou_noise.sample()
21 |
22 | return action
23 |
24 | def hard_target_update(actor, critic, target_actor, target_critic):
25 | target_critic.load_state_dict(critic.state_dict())
26 | target_actor.load_state_dict(actor.state_dict())
27 |
28 | def soft_target_update(actor, critic, target_actor, target_critic, tau):
29 | soft_update(critic, target_critic, tau)
30 | soft_update(actor, target_actor, tau)
31 |
32 | def soft_update(net, target_net, tau):
33 | for param, target_param in zip(net.parameters(), target_net.parameters()):
34 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
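
A minimal usage sketch of the helpers above (the theta/mu/sigma values are hypothetical, chosen only for illustration): OUNoise produces temporally correlated exploration noise that drifts toward mu, and soft_update with tau=1.0 degenerates to a full parameter copy.

    import torch.nn as nn
    from utils import OUNoise, soft_update

    noise = OUNoise(action_size=1, theta=0.15, mu=0.0, sigma=0.2)
    print([float(noise.sample()) for _ in range(5)])  # correlated samples, mean-reverting toward mu

    net, target_net = nn.Linear(3, 1), nn.Linear(3, 1)
    soft_update(net, target_net, tau=1.0)  # tau=1.0 copies net's parameters into target_net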
--------------------------------------------------------------------------------
/pendulum/ppo/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ppo/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ppo/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 |
15 | mu = self.fc3(x)
16 | log_std = torch.zeros_like(mu)
17 | std = torch.exp(log_std)
18 |
19 | return mu, std
20 |
21 | class Critic(nn.Module):
22 | def __init__(self, state_size, args):
23 | super(Critic, self).__init__()
24 | self.fc1 = nn.Linear(state_size, args.hidden_size)
25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
26 | self.fc3 = nn.Linear(args.hidden_size, 1)
27 |
28 | def forward(self, x):
29 | x = torch.tanh(self.fc1(x))
30 | x = torch.tanh(self.fc2(x))
31 | value = self.fc3(x)
32 |
33 | return value
34 |
--------------------------------------------------------------------------------
/pendulum/ppo/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/ppo/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | from utils import *
8 | from model import Actor
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.shape[0]
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | actor = Actor(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | actor.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | mu, std = actor(torch.Tensor(state))
52 | action = get_action(mu, std)
53 |
54 | next_state, reward, done, _ = env.step(action)
55 |
56 | next_state = np.reshape(next_state, [1, state_size])
57 | state = next_state
58 | score += reward
59 |
60 | if episode % args.log_interval == 0:
61 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/ppo/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | normal = Normal(mu, std)
7 | action = normal.sample()
8 |
9 | return action.data.numpy()
10 |
11 | def get_returns(rewards, masks, gamma):
12 | returns = torch.zeros_like(rewards)
13 | running_returns = 0
14 |
15 | for t in reversed(range(0, len(rewards))):
16 | running_returns = rewards[t] + masks[t] * gamma * running_returns
17 | returns[t] = running_returns
18 |
19 | return returns
20 |
21 | def get_log_prob(actions, mu, std):
22 | normal = Normal(mu, std)
23 | log_prob = normal.log_prob(actions)
24 |
25 | return log_prob
26 |
27 | def surrogate_loss(actor, values, targets, states, old_policy, actions, batch_index):
28 | mu, std = actor(torch.Tensor(states))
29 | new_policy = get_log_prob(actions, mu, std)
30 |
31 | old_policy = old_policy[batch_index]
32 | ratio = torch.exp(new_policy - old_policy)
33 |
34 | advantages = targets - values
35 |
36 | surrogate_loss = ratio * advantages
37 |
38 | return surrogate_loss, ratio, advantages
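
A hedged sketch of how the helpers above are typically combined into the clipped PPO objective; clip_param and the function name clipped_ppo_loss are hypothetical, and the actual training script in this repo may organize the update differently.

    import torch
    from utils import surrogate_loss

    def clipped_ppo_loss(actor, values, targets, states, old_policy, actions,
                         batch_index, clip_param=0.2):
        loss, ratio, advantages = surrogate_loss(actor, values, targets, states,
                                                 old_policy, actions, batch_index)
        clipped = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
        # PPO maximizes the element-wise minimum of the two surrogates,
        # so the quantity to minimize is its negation
        return -torch.min(loss, clipped).mean()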
--------------------------------------------------------------------------------
/pendulum/ppo_gae/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ppo_gae/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/ppo_gae/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 |
15 | mu = self.fc3(x)
16 | log_std = torch.zeros_like(mu)
17 | std = torch.exp(log_std)
18 |
19 | return mu, std
20 |
21 | class Critic(nn.Module):
22 | def __init__(self, state_size, args):
23 | super(Critic, self).__init__()
24 | self.fc1 = nn.Linear(state_size, args.hidden_size)
25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
26 | self.fc3 = nn.Linear(args.hidden_size, 1)
27 |
28 | def forward(self, x):
29 | x = torch.tanh(self.fc1(x))
30 | x = torch.tanh(self.fc2(x))
31 | value = self.fc3(x)
32 |
33 | return value
34 |
--------------------------------------------------------------------------------
/pendulum/ppo_gae/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/ppo_gae/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/ppo_gae/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | from utils import *
8 | from model import Actor
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.shape[0]
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | actor = Actor(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | actor.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | mu, std = actor(torch.Tensor(state))
52 | action = get_action(mu, std)
53 |
54 | next_state, reward, done, _ = env.step(action)
55 |
56 | next_state = np.reshape(next_state, [1, state_size])
57 | state = next_state
58 | score += reward
59 |
60 | if episode % args.log_interval == 0:
61 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/ppo_gae/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | normal = Normal(mu, std)
7 | action = normal.sample()
8 |
9 | return action.data.numpy()
10 |
11 | def get_gae(rewards, masks, values, args):
12 | returns = torch.zeros_like(rewards)
13 | advantages = torch.zeros_like(rewards)
14 |
15 | running_returns = 0
16 | previous_value = 0
17 | running_advants = 0
18 |
19 | for t in reversed(range(0, len(rewards))):
20 | # return
21 | running_returns = rewards[t] + masks[t] * args.gamma * running_returns
22 | returns[t] = running_returns
23 |
24 | # advantage
25 | running_deltas = rewards[t] + masks[t] * args.gamma * previous_value - values.data[t]
26 | running_advants = running_deltas + masks[t] * args.gamma * args.lamda * running_advants
27 |
28 | previous_value = values.data[t]
29 | advantages[t] = running_advants
30 |
31 | advantages = (advantages - advantages.mean()) / advantages.std()
32 |
33 | return returns, advantages
34 |
35 | def get_log_prob(actions, mu, std):
36 | normal = Normal(mu, std)
37 | log_prob = normal.log_prob(actions)
38 |
39 | return log_prob
40 |
41 | def surrogate_loss(actor, advantages, states, old_policy, actions, batch_index):
42 | mu, std = actor(torch.Tensor(states))
43 | new_policy = get_log_prob(actions, mu, std)
44 |
45 | old_policy = old_policy[batch_index]
46 |
47 | ratio = torch.exp(new_policy - old_policy)
48 | surrogate_loss = ratio * advantages
49 |
50 | return surrogate_loss, ratio
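
For reference, get_gae above computes the discounted returns and generalized advantage estimates backwards in time; with masks m_t = 1 - done_t it evaluates

\[
R_t = r_t + \gamma\, m_t\, R_{t+1}, \qquad
\delta_t = r_t + \gamma\, m_t\, V(s_{t+1}) - V(s_t), \qquad
\hat{A}_t = \delta_t + \gamma \lambda\, m_t\, \hat{A}_{t+1},
\]

and then standardizes the advantages to zero mean and unit variance.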
--------------------------------------------------------------------------------
/pendulum/sac/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/sac/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/sac/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args, log_std_min=-20, log_std_max=2):
6 | super(Actor, self).__init__()
7 | self.log_std_min = log_std_min
8 | self.log_std_max = log_std_max
9 |
10 | self.fc1 = nn.Linear(state_size, args.hidden_size)
11 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
12 |
13 | self.fc3 = nn.Linear(args.hidden_size, action_size)
14 | self.fc4 = nn.Linear(args.hidden_size, action_size)
15 |
16 | def forward(self, x):
17 | x = torch.relu(self.fc1(x))
18 | x = torch.relu(self.fc2(x))
19 |
20 | mu = self.fc3(x)
21 | log_std = self.fc4(x)
22 |
23 | log_std = torch.clamp(log_std, min=self.log_std_min, max=self.log_std_max)
24 | std = torch.exp(log_std)
25 |
26 | return mu, std
27 |
28 | class Critic(nn.Module):
29 | def __init__(self, state_size, action_size, args):
30 | super(Critic, self).__init__()
31 |
32 | # Q1 architecture
33 | self.fc1 = nn.Linear(state_size + action_size, args.hidden_size)
34 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
35 | self.fc3 = nn.Linear(args.hidden_size, 1)
36 |
37 | # Q2 architecture
38 | self.fc4 = nn.Linear(state_size + action_size, args.hidden_size)
39 | self.fc5 = nn.Linear(args.hidden_size, args.hidden_size)
40 | self.fc6 = nn.Linear(args.hidden_size, 1)
41 |
42 | def forward(self, states, actions):
43 | x = torch.cat([states, actions], dim=1)
44 |
45 | x1 = torch.relu(self.fc1(x))
46 | x1 = torch.relu(self.fc2(x1))
47 | q_value1 = self.fc3(x1)
48 |
49 | x2 = torch.relu(self.fc4(x))
50 | x2 = torch.relu(self.fc5(x2))
51 | q_value2 = self.fc6(x2)
52 |
53 | return q_value1, q_value2
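
The Critic above returns two independent Q-estimates. As a hedged sketch (not this repo's training script; soft_q_target, gamma and the entropy temperature alpha are hypothetical), the twin outputs are usually combined into the clipped double-Q soft target of SAC:

    import torch
    from utils import eval_action

    def soft_q_target(actor, target_critic, rewards, masks, next_states,
                      gamma=0.99, alpha=0.2):
        next_mu, next_std = actor(next_states)
        next_actions, next_log_policy = eval_action(next_mu, next_std)
        q1, q2 = target_critic(next_states, next_actions)
        # y = r + gamma * m * ( min(Q1', Q2') - alpha * log pi(a'|s') )
        return rewards + gamma * masks * (torch.min(q1, q2) - alpha * next_log_policy)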
--------------------------------------------------------------------------------
/pendulum/sac/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/sac/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/sac/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import random
4 | import argparse
5 | import numpy as np
6 |
7 | import torch
8 | from utils import *
9 | from model import Actor, Critic
10 |
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
13 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
14 | parser.add_argument('--render', action="store_true", default=True)
15 | parser.add_argument('--hidden_size', type=int, default=64)
16 | parser.add_argument('--iter', type=int, default=10000)
17 | parser.add_argument('--log_interval', type=int, default=10)
18 | args = parser.parse_args()
19 |
20 | if __name__=="__main__":
21 | env = gym.make(args.env_name)
22 | env.seed(500)
23 | torch.manual_seed(500)
24 |
25 | state_size = env.observation_space.shape[0]
26 | action_size = env.action_space.shape[0]
27 | print('state size:', state_size)
28 | print('action size:', action_size)
29 |
30 | actor = Actor(state_size, action_size, args)
31 |
32 | if args.load_model is not None:
33 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
34 | pretrained_model = torch.load(pretrained_model_path)
35 | actor.load_state_dict(pretrained_model)
36 |
37 | steps = 0
38 |
39 | for episode in range(args.iter):
40 | done = False
41 | score = 0
42 |
43 | state = env.reset()
44 | state = np.reshape(state, [1, state_size])
45 |
46 | while not done:
47 | if args.render:
48 | env.render()
49 |
50 | steps += 1
51 |
52 | mu, std = actor(torch.Tensor(state))
53 | action = get_action(mu, std)
54 |
55 | next_state, reward, done, _ = env.step(action)
56 |
57 | next_state = np.reshape(next_state, [1, state_size])
58 | state = next_state
59 | score += reward
60 |
61 | if episode % args.log_interval == 0:
62 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/sac/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.distributions import Normal
3 |
4 | def get_action(mu, std):
5 | normal = Normal(mu, std)
6 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1))
7 | action = torch.tanh(z)
8 |
9 | return action.data.numpy()
10 |
11 | def eval_action(mu, std, epsilon=1e-6):
12 | normal = Normal(mu, std)
13 | z = normal.rsample() # reparameterization trick (mean + std * N(0,1))
14 | action = torch.tanh(z)
15 | log_prob = normal.log_prob(z)
16 |
17 | # Enforcing Action Bounds
18 | log_prob -= torch.log(1 - action.pow(2) + epsilon)
19 | log_policy = log_prob.sum(1, keepdim=True)
20 |
21 | return action, log_policy
22 |
23 | def hard_target_update(net, target_net):
24 | target_net.load_state_dict(net.state_dict())
25 |
26 | def soft_target_update(net, target_net, tau):
27 | for param, target_param in zip(net.parameters(), target_net.parameters()):
28 | target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)
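
For reference, eval_action above implements a tanh-squashed Gaussian policy with the standard change-of-variables correction,

\[
a = \tanh(z), \quad z \sim \mathcal{N}(\mu, \sigma^{2}), \qquad
\log \pi(a \mid s) = \sum_{i} \Big[ \log \mathcal{N}\big(z_i \mid \mu_i, \sigma_i^{2}\big)
- \log\!\big(1 - \tanh^{2}(z_i) + \epsilon\big) \Big],
\]

where the small epsilon keeps the logarithm finite at the action bounds.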
--------------------------------------------------------------------------------
/pendulum/tnpg/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/tnpg/__pycache__/tnpg.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/tnpg.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/tnpg/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/tnpg/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 |
15 | mu = self.fc3(x)
16 | log_std = torch.zeros_like(mu)
17 | std = torch.exp(log_std)
18 |
19 | return mu, std
20 |
21 | class Critic(nn.Module):
22 | def __init__(self, state_size, args):
23 | super(Critic, self).__init__()
24 | self.fc1 = nn.Linear(state_size, args.hidden_size)
25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
26 | self.fc3 = nn.Linear(args.hidden_size, 1)
27 |
28 | def forward(self, x):
29 | x = torch.tanh(self.fc1(x))
30 | x = torch.tanh(self.fc2(x))
31 | value = self.fc3(x)
32 |
33 | return value
--------------------------------------------------------------------------------
/pendulum/tnpg/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/tnpg/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/tnpg/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | from utils import *
8 | from model import Actor
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.shape[0]
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | actor = Actor(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | actor.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | mu, std = actor(torch.Tensor(state))
52 | action = get_action(mu, std)
53 |
54 | next_state, reward, done, _ = env.step(action)
55 |
56 | next_state = np.reshape(next_state, [1, state_size])
57 | state = next_state
58 | score += reward
59 |
60 | if episode % args.log_interval == 0:
61 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/tnpg/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 | from collections import deque
6 |
7 | import torch
8 | import torch.optim as optim
9 |
10 | from utils import *
11 | from model import Actor, Critic
12 | from tensorboardX import SummaryWriter
13 |
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
16 | parser.add_argument('--load_model', type=str, default=None)
17 | parser.add_argument('--save_path', default='./save_model/', help='')
18 | parser.add_argument('--render', action="store_true", default=False)
19 | parser.add_argument('--gamma', type=float, default=0.99)
20 | parser.add_argument('--hidden_size', type=int, default=64)
21 | parser.add_argument('--critic_lr', type=float, default=1e-3)
22 | parser.add_argument('--max_kl', type=float, default=1e-2)
23 | parser.add_argument('--max_iter_num', type=int, default=1000)
24 | parser.add_argument('--total_sample_size', type=int, default=2048)
25 | parser.add_argument('--log_interval', type=int, default=5)
26 | parser.add_argument('--goal_score', type=int, default=-300)
27 | parser.add_argument('--logdir', type=str, default='./logs',
28 | help='tensorboardx logs directory')
29 | args = parser.parse_args()
30 |
31 | def train_model(actor, critic, critic_optimizer, trajectories):
32 | trajectories = np.array(trajectories)
33 | states = np.vstack(trajectories[:, 0])
34 | actions = list(trajectories[:, 1])
35 | rewards = list(trajectories[:, 2])
36 | masks = list(trajectories[:, 3])
37 |
38 | actions = torch.Tensor(actions).squeeze(1)
39 | rewards = torch.Tensor(rewards).squeeze(1)
40 | masks = torch.Tensor(masks)
41 |
42 | # ----------------------------
43 | # step 1: get returns
44 | returns = get_returns(rewards, masks, args.gamma)
45 |
46 | # ----------------------------
47 | # step 2: update critic
48 | criterion = torch.nn.MSELoss()
49 |
50 | values = critic(torch.Tensor(states))
51 | targets = returns.unsqueeze(1)
52 |
53 | critic_loss = criterion(values, targets)
54 | critic_optimizer.zero_grad()
55 | critic_loss.backward()
56 | critic_optimizer.step()
57 |
58 | # ----------------------------
59 | # step 3: get gradient of actor loss
60 | mu, std = actor(torch.Tensor(states))
61 | log_policy = get_log_prob(actions, mu, std)
62 | actor_loss = get_loss(actor, values, targets, log_policy)
63 |
64 | actor_loss_grad = torch.autograd.grad(actor_loss, actor.parameters())
65 | actor_loss_grad = flat_grad(actor_loss_grad)
66 |
67 | # ----------------------------
68 | # step 4: get search direction through conjugate gradient method
69 | search_dir = conjugate_gradient(actor, states, actor_loss_grad.data, nsteps=10)
70 |
71 | # ----------------------------
72 | # step 5: get step size and maximal step
73 | gHg = (hessian_vector_product(actor, states, search_dir) * search_dir).sum(0, keepdim=True)
74 | step_size = torch.sqrt(2 * args.max_kl / gHg)[0]
75 | maximal_step = step_size * search_dir
76 |
77 | # ----------------------------
78 | # step 6: update actor
79 | params = flat_params(actor)
80 |
81 | new_params = params + maximal_step
82 | update_model(actor, new_params)
83 |
84 |
85 | def main():
86 | env = gym.make(args.env_name)
87 | env.seed(500)
88 | torch.manual_seed(500)
89 |
90 | state_size = env.observation_space.shape[0]
91 | action_size = env.action_space.shape[0]
92 | print('state size:', state_size)
93 | print('action size:', action_size)
94 |
95 | actor = Actor(state_size, action_size, args)
96 | critic = Critic(state_size, args)
97 | critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)
98 |
99 | writer = SummaryWriter(args.logdir)
100 |
101 | recent_rewards = deque(maxlen=100)
102 | episodes = 0
103 |
104 | for iter in range(args.max_iter_num):
105 | trajectories = deque()
106 | steps = 0
107 |
108 | while steps < args.total_sample_size:
109 | done = False
110 | score = 0
111 | episodes += 1
112 |
113 | state = env.reset()
114 | state = np.reshape(state, [1, state_size])
115 |
116 | while not done:
117 | if args.render:
118 | env.render()
119 |
120 | steps += 1
121 |
122 | mu, std = actor(torch.Tensor(state))
123 | action = get_action(mu, std)
124 |
125 | next_state, reward, done, _ = env.step(action)
126 |
127 | mask = 0 if done else 1
128 |
129 | trajectories.append((state, action, reward, mask))
130 |
131 | next_state = np.reshape(next_state, [1, state_size])
132 | state = next_state
133 | score += reward
134 |
135 | if done:
136 | recent_rewards.append(score)
137 |
138 | actor.train()
139 | train_model(actor, critic, critic_optimizer, trajectories)
140 |
141 | writer.add_scalar('log/score', float(score), episodes)
142 |
143 | if iter % args.log_interval == 0:
144 | print('{} iter | {} episode | score_avg: {:.2f}'.format(iter, episodes, np.mean(recent_rewards)))
145 |
146 | if np.mean(recent_rewards) > args.goal_score:
147 | if not os.path.isdir(args.save_path):
148 | os.makedirs(args.save_path)
149 |
150 | ckpt_path = args.save_path + 'model.pth.tar'
151 | torch.save(actor.state_dict(), ckpt_path)
152 | print('Recent average reward exceeds the goal score, so training stops')
153 | break
154 |
155 | if __name__ == '__main__':
156 | main()
--------------------------------------------------------------------------------
/pendulum/tnpg/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | normal = Normal(mu, std)
7 | action = normal.sample()
8 |
9 | return action.data.numpy()
10 |
11 | def get_returns(rewards, masks, gamma):
12 | returns = torch.zeros_like(rewards)
13 | running_returns = 0
14 |
15 | for t in reversed(range(0, len(rewards))):
16 | running_returns = rewards[t] + masks[t] * gamma * running_returns
17 | returns[t] = running_returns
18 |
19 | returns = (returns - returns.mean()) / returns.std()
20 |
21 | return returns
22 |
23 | def get_loss(actor, values, targets, log_policy):
24 | advantages = targets - values
25 |
26 | loss = log_policy * advantages
27 | loss = loss.mean()
28 |
29 | return loss
30 |
31 | def get_log_prob(actions, mu, std):
32 | normal = Normal(mu, std)
33 | log_prob = normal.log_prob(actions)
34 |
35 | return log_prob
36 |
37 |
38 | # from openai baseline code
39 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py
40 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
41 | x = torch.zeros(b.size())
42 | r = b.clone()
43 | p = b.clone()
44 | rdotr = torch.dot(r, r)
45 |
46 | for i in range(nsteps): # nsteps = 10
47 | f_Ax = hessian_vector_product(actor, states, p, cg_damping=1e-1)
48 | alpha = rdotr / torch.dot(p, f_Ax)
49 |
50 | x += alpha * p
51 | r -= alpha * f_Ax
52 |
53 | new_rdotr = torch.dot(r, r)
54 | beta = new_rdotr / rdotr
55 |
56 | p = r + beta * p
57 | rdotr = new_rdotr
58 |
59 | if rdotr < residual_tol: # residual_tol = 0.0000000001
60 | break
61 |
62 | return x
63 |
64 | def hessian_vector_product(actor, states, p, cg_damping=1e-1):
65 | p = p.detach()  # detach() returns a new tensor, so rebind p
66 | kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
67 | kl = kl.mean()
68 |
69 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
70 | kl_grad = flat_grad(kl_grad)
71 |
72 | kl_grad_p = (kl_grad * p).sum()
73 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters())
74 | kl_hessian = flat_hessian(kl_hessian)
75 |
76 | return kl_hessian + p * cg_damping # cg_damping = 0.1
77 |
78 | def kl_divergence(new_actor, old_actor, states):
79 | mu, std = new_actor(torch.Tensor(states))
80 |
81 | mu_old, std_old = old_actor(torch.Tensor(states))
82 | mu_old = mu_old.detach()
83 | std_old = std_old.detach()
84 |
85 | # KL divergence between the old policy and the new policy: D( pi_old || pi_new )
86 | # pi_old -> mu_old, std_old / pi_new -> mu, std
87 | # note: the KL divergence is not symmetric, so the argument order matters
88 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
89 |
90 | return kl.sum(1, keepdim=True)
91 |
92 |
93 | def flat_grad(grads):
94 | grad_flatten = []
95 | for grad in grads:
96 | grad_flatten.append(grad.view(-1))
97 | grad_flatten = torch.cat(grad_flatten)
98 |
99 | return grad_flatten
100 |
101 | def flat_hessian(hessians):
102 | hessians_flatten = []
103 | for hessian in hessians:
104 | hessians_flatten.append(hessian.contiguous().view(-1))
105 | hessians_flatten = torch.cat(hessians_flatten).data
106 |
107 | return hessians_flatten
108 |
109 |
110 | def flat_params(model):
111 | params = []
112 | for param in model.parameters():
113 | params.append(param.data.view(-1))
114 | params_flatten = torch.cat(params)
115 |
116 | return params_flatten
117 |
118 | def update_model(model, new_params):
119 | index = 0
120 | for params in model.parameters():
121 | params_length = len(params.view(-1))
122 | new_param = new_params[index: index + params_length]
123 | new_param = new_param.view(params.size())
124 | params.data.copy_(new_param)
125 | index += params_length
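
For reference, kl_divergence above evaluates the KL divergence between two diagonal Gaussian policies, and steps 4-6 of tnpg/train.py use it to take a natural-gradient step:

\[
D_{\mathrm{KL}}\big(\pi_{\text{old}} \,\|\, \pi\big)
= \sum_{i} \left[ \log\frac{\sigma_i}{\sigma_{\text{old},i}}
+ \frac{\sigma_{\text{old},i}^{2} + (\mu_{\text{old},i} - \mu_i)^{2}}{2\,\sigma_i^{2}}
- \frac{1}{2} \right],
\qquad
s = H^{-1} g, \quad
\theta \leftarrow \theta + \sqrt{\frac{2\,\delta_{\mathrm{KL}}}{s^{\top} H s}}\; s,
\]

where g is the policy-gradient vector, H the KL Hessian applied through hessian_vector_product, s the conjugate-gradient solution, and delta_KL corresponds to args.max_kl.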
--------------------------------------------------------------------------------
/pendulum/trpo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/.DS_Store
--------------------------------------------------------------------------------
/pendulum/trpo/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/trpo/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/trpo/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 |
15 | mu = self.fc3(x)
16 | log_std = torch.zeros_like(mu)
17 | std = torch.exp(log_std)
18 |
19 | return mu, std
20 |
21 | class Critic(nn.Module):
22 | def __init__(self, state_size, args):
23 | super(Critic, self).__init__()
24 | self.fc1 = nn.Linear(state_size, args.hidden_size)
25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
26 | self.fc3 = nn.Linear(args.hidden_size, 1)
27 |
28 | def forward(self, x):
29 | x = torch.tanh(self.fc1(x))
30 | x = torch.tanh(self.fc2(x))
31 | value = self.fc3(x)
32 |
33 | return value
--------------------------------------------------------------------------------
/pendulum/trpo/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/trpo/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | from utils import *
8 | from model import Actor
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.shape[0]
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | actor = Actor(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | actor.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | mu, std = actor(torch.Tensor(state))
52 | action = get_action(mu, std)
53 |
54 | next_state, reward, done, _ = env.step(action)
55 |
56 | next_state = np.reshape(next_state, [1, state_size])
57 | state = next_state
58 | score += reward
59 |
60 | if episode % args.log_interval == 0:
61 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/trpo/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | normal = Normal(mu, std)
7 | action = normal.sample()
8 |
9 | return action.data.numpy()
10 |
11 | def get_returns(rewards, masks, gamma):
12 | returns = torch.zeros_like(rewards)
13 | running_returns = 0
14 |
15 | for t in reversed(range(0, len(rewards))):
16 | running_returns = rewards[t] + masks[t] * gamma * running_returns
17 | returns[t] = running_returns
18 |
19 | returns = (returns - returns.mean()) / returns.std()
20 |
21 | return returns
22 |
23 | def get_log_prob(actions, mu, std):
24 | normal = Normal(mu, std)
25 | log_prob = normal.log_prob(actions)
26 |
27 | return log_prob
28 |
29 | def surrogate_loss(actor, values, targets, states, old_policy, actions):
30 | mu, std = actor(torch.Tensor(states))
31 | new_policy = get_log_prob(actions, mu, std)
32 |
33 | advantages = targets - values
34 |
35 | surrogate_loss = torch.exp(new_policy - old_policy) * advantages
36 | surrogate_loss = surrogate_loss.mean()
37 |
38 | return surrogate_loss
39 |
40 |
41 | # from openai baseline code
42 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py
43 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
44 | x = torch.zeros(b.size())
45 | r = b.clone()
46 | p = b.clone()
47 | rdotr = torch.dot(r, r)
48 |
49 | for i in range(nsteps): # nsteps = 10
50 | Ap = hessian_vector_product(actor, states, p, cg_damping=1e-1)
51 | alpha = rdotr / torch.dot(p, Ap)
52 |
53 | x += alpha * p
54 | r -= alpha * Ap
55 |
56 | new_rdotr = torch.dot(r, r)
57 | beta = new_rdotr / rdotr
58 |
59 | p = r + beta * p
60 | rdotr = new_rdotr
61 |
62 | if rdotr < residual_tol: # residual_tol = 0.0000000001
63 | break
64 | return x
65 |
66 | def hessian_vector_product(actor, states, p, cg_damping=1e-1):
67 | p = p.detach()  # detach() returns a new tensor, so rebind p
68 | kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
69 | kl = kl.mean()
70 |
71 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
72 | kl_grad = flat_grad(kl_grad)
73 |
74 | kl_grad_p = (kl_grad * p).sum()
75 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters())
76 | kl_hessian = flat_hessian(kl_hessian)
77 |
78 | return kl_hessian + p * cg_damping
79 |
80 | def kl_divergence(new_actor, old_actor, states):
81 | mu, std = new_actor(torch.Tensor(states))
82 |
83 | mu_old, std_old = old_actor(torch.Tensor(states))
84 | mu_old = mu_old.detach()
85 | std_old = std_old.detach()
86 |
87 | # KL divergence between the old policy and the new policy: D( pi_old || pi_new )
88 | # pi_old -> mu_old, std_old / pi_new -> mu, std
89 | # note: the KL divergence is not symmetric, so the argument order matters
90 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
91 | return kl.sum(1, keepdim=True)
92 |
93 |
94 | def flat_grad(grads):
95 | grad_flatten = []
96 | for grad in grads:
97 | grad_flatten.append(grad.view(-1))
98 | grad_flatten = torch.cat(grad_flatten)
99 | return grad_flatten
100 |
101 | def flat_hessian(hessians):
102 | hessians_flatten = []
103 | for hessian in hessians:
104 | hessians_flatten.append(hessian.contiguous().view(-1))
105 | hessians_flatten = torch.cat(hessians_flatten).data
106 | return hessians_flatten
107 |
108 |
109 | def flat_params(model):
110 | params = []
111 | for param in model.parameters():
112 | params.append(param.data.view(-1))
113 | params_flatten = torch.cat(params)
114 | return params_flatten
115 |
116 | def update_model(model, new_params):
117 | index = 0
118 | for params in model.parameters():
119 | params_length = len(params.view(-1))
120 | new_param = new_params[index : index + params_length]
121 | new_param = new_param.view(params.size())
122 | params.data.copy_(new_param)
123 | index += params_length
124 |
125 |
126 | def backtracking_line_search(old_actor, actor, actor_loss, actor_loss_grad,
127 | old_policy, params, maximal_step, max_kl,
128 | values, targets, states, actions):
129 | backtrac_coef = 1.0
130 | alpha = 0.5
131 | beta = 0.5
132 | flag = False
133 |
134 | expected_improve = (actor_loss_grad * maximal_step).sum(0, keepdim=True)
135 |
136 | for i in range(10):
137 | new_params = params + backtrac_coef * maximal_step
138 | update_model(actor, new_params)
139 |
140 | new_actor_loss = surrogate_loss(actor, values, targets, states, old_policy.detach(), actions)
141 |
142 | loss_improve = new_actor_loss - actor_loss
143 | expected_improve *= backtrac_coef
144 | improve_condition = loss_improve / expected_improve
145 |
146 | kl = kl_divergence(new_actor=actor, old_actor=old_actor, states=states)
147 | kl = kl.mean()
148 |
149 | if kl < max_kl and improve_condition > alpha:
150 | flag = True
151 | break
152 |
153 | backtrac_coef *= beta
154 |
155 | if not flag:
156 | params = flat_params(old_actor)
157 | update_model(actor, params)
158 | print('policy update does not improve the surrogate')
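
As a self-contained illustration (independent of the actor above), the same conjugate-gradient recurrence used in conjugate_gradient() can be checked on a small symmetric positive-definite system A x = b; the function name cg and the example matrix are chosen only for this sketch.

    import torch

    def cg(A, b, nsteps=10, residual_tol=1e-10):
        # identical recurrence, with an explicit matrix in place of the Hessian-vector product
        x = torch.zeros_like(b)
        r, p = b.clone(), b.clone()
        rdotr = torch.dot(r, r)
        for _ in range(nsteps):
            Ap = A @ p
            alpha = rdotr / torch.dot(p, Ap)
            x += alpha * p
            r -= alpha * Ap
            new_rdotr = torch.dot(r, r)
            p = r + (new_rdotr / rdotr) * p
            rdotr = new_rdotr
            if rdotr < residual_tol:
                break
        return x

    A = torch.tensor([[4.0, 1.0], [1.0, 3.0]])
    b = torch.tensor([1.0, 2.0])
    print(torch.allclose(A @ cg(A, b), b, atol=1e-5))  # True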
--------------------------------------------------------------------------------
/pendulum/trpo_gae/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/trpo_gae/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/pendulum/trpo_gae/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class Actor(nn.Module):
5 | def __init__(self, state_size, action_size, args):
6 | super(Actor, self).__init__()
7 | self.fc1 = nn.Linear(state_size, args.hidden_size)
8 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
9 | self.fc3 = nn.Linear(args.hidden_size, action_size)
10 |
11 | def forward(self, x):
12 | x = torch.tanh(self.fc1(x))
13 | x = torch.tanh(self.fc2(x))
14 |
15 | mu = self.fc3(x)
16 | log_std = torch.zeros_like(mu)
17 | std = torch.exp(log_std)
18 |
19 | return mu, std
20 |
21 | class Critic(nn.Module):
22 | def __init__(self, state_size, args):
23 | super(Critic, self).__init__()
24 | self.fc1 = nn.Linear(state_size, args.hidden_size)
25 | self.fc2 = nn.Linear(args.hidden_size, args.hidden_size)
26 | self.fc3 = nn.Linear(args.hidden_size, 1)
27 |
28 | def forward(self, x):
29 | x = torch.tanh(self.fc1(x))
30 | x = torch.tanh(self.fc2(x))
31 | value = self.fc3(x)
32 |
33 | return value
34 |
--------------------------------------------------------------------------------
/pendulum/trpo_gae/save_model/model.pth.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dongminlee94/Reinforcement-Learning-Code/c72408ca29c25230687d81b7ca3a85bab40956a9/pendulum/trpo_gae/save_model/model.pth.tar
--------------------------------------------------------------------------------
/pendulum/trpo_gae/test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import gym
3 | import argparse
4 | import numpy as np
5 |
6 | import torch
7 | from utils import *
8 | from model import Actor
9 |
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env_name', type=str, default="Pendulum-v0")
12 | parser.add_argument("--load_model", type=str, default='model.pth.tar')
13 | parser.add_argument('--render', action="store_true", default=True)
14 | parser.add_argument('--hidden_size', type=int, default=64)
15 | parser.add_argument('--iter', type=int, default=10000)
16 | parser.add_argument('--log_interval', type=int, default=10)
17 | args = parser.parse_args()
18 |
19 | if __name__=="__main__":
20 | env = gym.make(args.env_name)
21 | env.seed(500)
22 | torch.manual_seed(500)
23 |
24 | state_size = env.observation_space.shape[0]
25 | action_size = env.action_space.shape[0]
26 | print('state size:', state_size)
27 | print('action size:', action_size)
28 |
29 | actor = Actor(state_size, action_size, args)
30 |
31 | if args.load_model is not None:
32 | pretrained_model_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
33 | pretrained_model = torch.load(pretrained_model_path)
34 | actor.load_state_dict(pretrained_model)
35 |
36 | steps = 0
37 |
38 | for episode in range(args.iter):
39 | done = False
40 | score = 0
41 |
42 | state = env.reset()
43 | state = np.reshape(state, [1, state_size])
44 |
45 | while not done:
46 | if args.render:
47 | env.render()
48 |
49 | steps += 1
50 |
51 | mu, std = actor(torch.Tensor(state))
52 | action = get_action(mu, std)
53 |
54 | next_state, reward, done, _ = env.step(action)
55 |
56 | next_state = np.reshape(next_state, [1, state_size])
57 | state = next_state
58 | score += reward
59 |
60 | if episode % args.log_interval == 0:
61 | print('{} episode | score: {:.2f}'.format(episode, score[0]))
--------------------------------------------------------------------------------
/pendulum/trpo_gae/utils.py:
--------------------------------------------------------------------------------
1 | import math
2 | import torch
3 | from torch.distributions import Normal
4 |
5 | def get_action(mu, std):
6 | normal = Normal(mu, std)
7 | action = normal.sample()
8 |
9 | return action.data.numpy()
10 |
11 | def get_gae(rewards, masks, values, args):
12 | returns = torch.zeros_like(rewards)
13 | advantages = torch.zeros_like(rewards)
14 |
15 | running_returns = 0
16 | previous_value = 0
17 | running_advants = 0
18 |
19 | for t in reversed(range(0, len(rewards))):
20 | # return
21 | running_returns = rewards[t] + masks[t] * args.gamma * running_returns
22 | returns[t] = running_returns
23 |
24 | # advantage
25 | running_deltas = rewards[t] + masks[t] * args.gamma * previous_value - values.data[t]
26 | running_advants = running_deltas + masks[t] * args.gamma * args.lamda * running_advants
27 |
28 | previous_value = values.data[t]
29 | advantages[t] = running_advants
30 |
31 | advantages = (advantages - advantages.mean()) / advantages.std()
32 |
33 | return returns, advantages
34 |
35 | def get_log_prob(actions, mu, std):
36 | normal = Normal(mu, std)
37 | log_prob = normal.log_prob(actions)
38 |
39 | return log_prob
40 |
41 | def surrogate_loss(actor, advantages, states, old_policy, actions):
42 | mu, std = actor(torch.Tensor(states))
43 | new_policy = get_log_prob(actions, mu, std)
44 |
45 | advantages = advantages.unsqueeze(1)
46 |
47 | surrogate_loss = torch.exp(new_policy - old_policy) * advantages
48 | surrogate_loss = surrogate_loss.mean()
49 |
50 | return surrogate_loss
51 |
52 |
53 | # from openai baseline code
54 | # https://github.com/openai/baselines/blob/master/baselines/common/cg.py
55 | def conjugate_gradient(actor, states, b, nsteps, residual_tol=1e-10):
56 | x = torch.zeros(b.size())
57 | r = b.clone()
58 | p = b.clone()
59 | rdotr = torch.dot(r, r)
60 |
61 | for i in range(nsteps): # nsteps = 10
62 | Ap = hessian_vector_product(actor, states, p, cg_damping=1e-1)
63 | alpha = rdotr / torch.dot(p, Ap)
64 |
65 | x += alpha * p
66 | r -= alpha * Ap
67 |
68 | new_rdotr = torch.dot(r, r)
69 | beta = new_rdotr / rdotr
70 |
71 | p = r + beta * p
72 | rdotr = new_rdotr
73 |
74 | if rdotr < residual_tol: # residual_tol = 0.0000000001
75 | break
76 | return x
77 |
78 | def hessian_vector_product(actor, states, p, cg_damping=1e-1):
79 | p = p.detach()  # detach() returns a new tensor, so rebind p
80 | kl = kl_divergence(new_actor=actor, old_actor=actor, states=states)
81 | kl = kl.mean()
82 |
83 | kl_grad = torch.autograd.grad(kl, actor.parameters(), create_graph=True)
84 | kl_grad = flat_grad(kl_grad)
85 |
86 | kl_grad_p = (kl_grad * p).sum()
87 | kl_hessian = torch.autograd.grad(kl_grad_p, actor.parameters())
88 | kl_hessian = flat_hessian(kl_hessian)
89 |
90 | return kl_hessian + p * cg_damping # cg_damping = 0.1
91 |
92 | def kl_divergence(new_actor, old_actor, states):
93 | mu, std = new_actor(torch.Tensor(states))
94 |
95 | mu_old, std_old = old_actor(torch.Tensor(states))
96 | mu_old = mu_old.detach()
97 | std_old = std_old.detach()
98 |
99 | # KL divergence between the old policy and the new policy: D( pi_old || pi_new )
100 | # pi_old -> mu_old, std_old / pi_new -> mu, std
101 | # note: the KL divergence is not symmetric, so the argument order matters
102 | kl = torch.log(std / std_old) + (std_old.pow(2) + (mu_old - mu).pow(2)) / (2.0 * std.pow(2)) - 0.5
103 | return kl.sum(1, keepdim=True)
104 |
105 |
106 | def flat_grad(grads):
107 | grad_flatten = []
108 | for grad in grads:
109 | grad_flatten.append(grad.view(-1))
110 | grad_flatten = torch.cat(grad_flatten)
111 | return grad_flatten
112 |
113 | def flat_hessian(hessians):
114 | hessians_flatten = []
115 | for hessian in hessians:
116 | hessians_flatten.append(hessian.contiguous().view(-1))
117 | hessians_flatten = torch.cat(hessians_flatten).data
118 | return hessians_flatten
119 |
120 |
121 | def flat_params(model):
122 | params = []
123 | for param in model.parameters():
124 | params.append(param.data.view(-1))
125 | params_flatten = torch.cat(params)
126 | return params_flatten
127 |
128 | def update_model(model, new_params):
129 | index = 0
130 | for params in model.parameters():
131 | params_length = len(params.view(-1))
132 | new_param = new_params[index: index + params_length]
133 | new_param = new_param.view(params.size())
134 | params.data.copy_(new_param)
135 | index += params_length
136 |
137 |
138 | def backtracking_line_search(old_actor, actor, actor_loss, actor_loss_grad,
139 | old_policy, params, maximal_step, max_kl,
140 | advantages, states, actions):
141 | backtrac_coef = 1.0
142 | alpha = 0.5
143 | beta = 0.5
144 | flag = False
145 |
146 | expected_improve = (actor_loss_grad * maximal_step).sum(0, keepdim=True)
147 |
148 | for i in range(10):
149 | new_params = params + backtrac_coef * maximal_step
150 | update_model(actor, new_params)
151 |
152 | new_actor_loss = surrogate_loss(actor, advantages, states, old_policy.detach(), actions)
153 |
154 | loss_improve = new_actor_loss - actor_loss
155 | expected_improve *= backtrac_coef
156 | improve_condition = loss_improve / expected_improve
157 |
158 | kl = kl_divergence(new_actor=actor, old_actor=old_actor, states=states)
159 | kl = kl.mean()
160 |
161 | if kl < max_kl and improve_condition > alpha:
162 | flag = True
163 | break
164 |
165 | backtrac_coef *= beta
166 |
167 | if not flag:
168 | params = flat_params(old_actor)
169 | update_model(actor, params)
170 | print('policy update does not improve the surrogate')
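
For reference, backtracking_line_search above shrinks the maximal step by beta until the candidate parameters both respect the trust region and recover a sufficient fraction of the first-order predicted improvement,

\[
\theta_{\text{new}} = \theta + c\,\Delta\theta_{\max}, \quad c \in \{1, \beta, \beta^{2}, \dots\}, \qquad
\text{accept if } \;
D_{\mathrm{KL}}\big(\pi_{\text{old}} \,\|\, \pi_{\theta_{\text{new}}}\big) < \delta_{\mathrm{KL}}
\;\text{ and }\;
\frac{L(\theta_{\text{new}}) - L(\theta)}{\text{expected improvement}} > \alpha,
\]

with beta = alpha = 0.5 in the code; if no candidate is accepted, the old parameters are restored.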
--------------------------------------------------------------------------------