├── PPO
│   ├── __init__.py
│   ├── render.py
│   ├── replay.py
│   ├── ppo.py
│   └── model.py
├── requirements.txt
├── setup.py
├── LICENSE
├── .gitignore
├── README.md
└── tests
    └── PPO
        ├── test_model.py
        └── test_replay.py
/PPO/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm==4.50.2
2 | torch==1.6.0
3 | gym==0.17.3
4 | numpy==1.19.2
5 | Pillow==8.0.1
6 | imageio
7 | opencv-python
8 | tensorboard
9 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from setuptools import setup
4 |
5 |
6 | # Utility function to read the README file.
7 | # Used for the long_description. It's nice, because now 1) we have a top level
8 | # README file and 2) it's easier to type in the README file than to put a raw
9 | # string in below ...
10 | def read(fname):
11 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
12 |
13 |
14 | setup(
15 | name="rl-from-scratch",
16 | version="0.0.2",
17 | author="Youness MANSAR",
18 | author_email="mansaryounessecp@gmail.com",
19 |     description="Proximal Policy Optimization (PPO) implemented from scratch with PyTorch",
20 | license="MIT",
21 |     keywords="reinforcement-learning ppo pytorch",
22 | url="https://github.com/CVxTz/RL",
23 | packages=["PPO"],
24 | classifiers=[
25 | "Development Status :: 3 - Alpha",
26 | "Topic :: Utilities",
27 |         "License :: OSI Approved :: MIT License",
28 | ],
29 | )
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Mansar Youness
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/PPO/render.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import torch
3 | from PIL import Image
4 | from tqdm import tqdm
5 | import random
6 | import imageio
7 | import cv2
8 | import numpy as np
9 |
10 |
11 | from PPO.model import (
12 | PolicyNetwork,
13 | ValueNetwork,
14 | device,
15 | )
16 |
17 |
18 | def write_on_image(img, reward):
19 |
20 | cv2.putText(
21 | img,
22 | f"Sum Reward: {int(reward)}",
23 | (0, img.shape[0] - 20),
24 | cv2.FONT_HERSHEY_SIMPLEX,
25 | 1,
26 | (0, 0, 0),
27 | 2,
28 | )
29 |
30 |
31 | if __name__ == "__main__":
32 | import argparse
33 |
34 | parser = argparse.ArgumentParser()
35 |
36 | parser.add_argument("--policy_path")
37 | parser.add_argument("--env_name")
38 | parser.add_argument("--n_episodes", type=int, default=5)
39 | parser.add_argument("--max_timesteps", type=int, default=400)
40 |
41 | parser.add_argument("--out_gif")
42 |
43 | state_scale = 1.0
44 |
45 | args = parser.parse_args()
46 |
47 | policy_path = args.policy_path
48 | env_name = args.env_name
49 |
50 | n_episodes = args.n_episodes
51 | max_timesteps = args.max_timesteps
52 |
53 | out_gif = args.out_gif
54 |
55 | env = gym.make(env_name)
56 | observation = env.reset()
57 | n_actions = env.action_space.n
58 | feature_dim = observation.size
59 |
60 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
61 |
62 |     policy_model.load_state_dict(torch.load(policy_path, map_location=device))
63 |
64 | frames = []
65 | sum_reward = 0
66 | for _ in tqdm(range(n_episodes)):
67 | observation = env.reset()
68 |
69 | for timestep in range(max_timesteps):
70 |
71 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array")))
72 |
73 | write_on_image(frames[-1], sum_reward)
74 |
75 | action = policy_model.best_action(observation / state_scale)
76 |
77 | new_observation, reward, done, info = env.step(action)
78 | sum_reward += reward
79 |
80 | if done:
81 | for a in range(10):
82 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array")))
83 | write_on_image(frames[-1], sum_reward)
84 | break
85 |
86 | observation = new_observation
87 |
88 | imageio.mimsave(out_gif, frames, fps=60)
89 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/PPO/replay.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from torch.utils.data import Dataset
4 |
5 |
6 | def cumulative_sum(array, gamma=1.0):
7 | curr = 0
8 | cumulative_array = []
9 |
10 | for a in array[::-1]:
11 | curr = a + gamma * curr
12 | cumulative_array.append(curr)
13 |
14 | return cumulative_array[::-1]
15 |
16 |
17 | class Episode:
18 | def __init__(self, gamma=0.99, lambd=0.95):
19 | self.observations = []
20 | self.actions = []
21 | self.advantages = []
22 | self.rewards = []
23 | self.rewards_to_go = []
24 | self.values = []
25 | self.log_probabilities = []
26 | self.gamma = gamma
27 | self.lambd = lambd
28 |
29 | def append(
30 | self, observation, action, reward, value, log_probability, reward_scale=20
31 | ):
32 | self.observations.append(observation)
33 | self.actions.append(action)
34 | self.rewards.append(reward / reward_scale)
35 | self.values.append(value)
36 | self.log_probabilities.append(log_probability)
37 |
38 | def end_episode(self, last_value):
39 | rewards = np.array(self.rewards + [last_value])
40 | values = np.array(self.values + [last_value])
41 |
42 |         deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]  # TD residuals: r + gamma * V(s') - V(s)
43 |
44 |         self.advantages = cumulative_sum(deltas.tolist(), gamma=self.gamma * self.lambd)  # GAE advantages
45 |
46 | self.rewards_to_go = cumulative_sum(rewards.tolist(), gamma=self.gamma)[:-1]
47 |
48 |
49 | def normalize_list(array):
50 | array = np.array(array)
51 | array = (array - np.mean(array)) / (np.std(array) + 1e-5)
52 | return array.tolist()
53 |
54 |
55 | class History(Dataset):
56 | def __init__(self):
57 | self.episodes = []
58 | self.observations = []
59 | self.actions = []
60 | self.advantages = []
61 | self.rewards = []
62 | self.rewards_to_go = []
63 | self.log_probabilities = []
64 |
65 | def free_memory(self):
66 | del self.episodes[:]
67 | del self.observations[:]
68 | del self.actions[:]
69 | del self.advantages[:]
70 | del self.rewards[:]
71 | del self.rewards_to_go[:]
72 | del self.log_probabilities[:]
73 |
74 | def add_episode(self, episode):
75 | self.episodes.append(episode)
76 |
77 | def build_dataset(self):
78 | for episode in self.episodes:
79 | self.observations += episode.observations
80 | self.actions += episode.actions
81 | self.advantages += episode.advantages
82 | self.rewards += episode.rewards
83 | self.rewards_to_go += episode.rewards_to_go
84 | self.log_probabilities += episode.log_probabilities
85 |
86 | assert (
87 | len(
88 | {
89 | len(self.observations),
90 | len(self.actions),
91 | len(self.advantages),
92 | len(self.rewards),
93 | len(self.rewards_to_go),
94 | len(self.log_probabilities),
95 | }
96 | )
97 | == 1
98 | )
99 |
100 | self.advantages = normalize_list(self.advantages)
101 |
102 | def __len__(self):
103 | return len(self.observations)
104 |
105 | def __getitem__(self, idx):
106 | return (
107 | self.observations[idx],
108 | self.actions[idx],
109 | self.advantages[idx],
110 | self.log_probabilities[idx],
111 | self.rewards_to_go[idx],
112 | )
113 |
--------------------------------------------------------------------------------
/PPO/ppo.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import gym
4 | import numpy as np
5 | import torch
6 | import torch.optim as optim
7 | from torch.utils.data import DataLoader
8 | from torch.utils.tensorboard import SummaryWriter
9 | from tqdm import tqdm
10 |
11 | from PPO.model import (
12 | PolicyNetwork,
13 | ValueNetwork,
14 | device,
15 | train_value_network,
16 | train_policy_network,
17 | )
18 | from PPO.replay import Episode, History
19 |
20 |
21 | def main(
22 | env_name="LunarLander-v2",
23 | reward_scale=20.0,
24 | clip=0.2,
25 | log_dir="../logs",
26 | learning_rate=0.001,
27 | state_scale=1.0,
28 | ):
29 | writer = SummaryWriter(log_dir=log_dir, filename_suffix=env_name, comment=env_name)
30 |
31 | env = gym.make(env_name)
32 | observation = env.reset()
33 |
34 | n_actions = env.action_space.n
35 | feature_dim = observation.size
36 |
37 | value_model = ValueNetwork(in_dim=feature_dim).to(device)
38 | value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)
39 |
40 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
41 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
42 |
43 | n_epoch = 4
44 |
45 | max_episodes = 20
46 | max_timesteps = 400
47 |
48 | batch_size = 32
49 |
50 | max_iterations = 200
51 |
52 | history = History()
53 |
54 | epoch_ite = 0
55 | episode_ite = 0
56 |
57 | for ite in tqdm(range(max_iterations)):
58 |
59 | if ite % 50 == 0:
60 | torch.save(
61 | policy_model.state_dict(),
62 | Path(log_dir) / (env_name + f"_{str(ite)}_policy.pth"),
63 | )
64 | torch.save(
65 | value_model.state_dict(),
66 | Path(log_dir) / (env_name + f"_{str(ite)}_value.pth"),
67 | )
68 |
69 | for episode_i in range(max_episodes):
70 |
71 | observation = env.reset()
72 | episode = Episode()
73 |
74 | for timestep in range(max_timesteps):
75 |
76 | action, log_probability = policy_model.sample_action(
77 | observation / state_scale
78 | )
79 | value = value_model.state_value(observation / state_scale)
80 |
81 | new_observation, reward, done, info = env.step(action)
82 |
83 | episode.append(
84 | observation=observation / state_scale,
85 | action=action,
86 | reward=reward,
87 | value=value,
88 | log_probability=log_probability,
89 | reward_scale=reward_scale,
90 | )
91 |
92 | observation = new_observation
93 |
94 | if done:
95 | episode.end_episode(last_value=0)
96 | break
97 |
98 | if timestep == max_timesteps - 1:
99 | value = value_model.state_value(observation / state_scale)
100 | episode.end_episode(last_value=value)
101 |
102 | episode_ite += 1
103 | writer.add_scalar(
104 |             "Episode Reward",
105 | reward_scale * np.sum(episode.rewards),
106 | episode_ite,
107 | )
108 | writer.add_scalar(
109 | "Average Probabilities",
110 | np.exp(np.mean(episode.log_probabilities)),
111 | episode_ite,
112 | )
113 |
114 | history.add_episode(episode)
115 |
116 | history.build_dataset()
117 | data_loader = DataLoader(history, batch_size=batch_size, shuffle=True)
118 |
119 | policy_loss = train_policy_network(
120 | policy_model, policy_optimizer, data_loader, epochs=n_epoch, clip=clip
121 | )
122 |
123 | value_loss = train_value_network(
124 | value_model, value_optimizer, data_loader, epochs=n_epoch
125 | )
126 |
127 | for p_l, v_l in zip(policy_loss, value_loss):
128 | epoch_ite += 1
129 | writer.add_scalar("Policy Loss", p_l, epoch_ite)
130 | writer.add_scalar("Value Loss", v_l, epoch_ite)
131 |
132 | history.free_memory()
133 |
134 |
135 | if __name__ == "__main__":
136 |
137 | main(
138 | reward_scale=20.0,
139 | clip=0.2,
140 | env_name="LunarLander-v2",
141 | learning_rate=0.001,
142 | state_scale=1.0,
143 | log_dir="logs/Lunar"
144 | )
145 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL
2 |
3 | ### PPO
4 | How to run:
5 | ```
6 | python -m PPO.ppo
7 | ```
8 | Test Policy:
9 | ```
10 | python -m PPO.render --policy_path logs/Lunar/LunarLander-v2_100_policy.pth \
11 |                      --env_name LunarLander-v2 --out_gif logs/lunar_late.gif
12 |
13 | ```
14 |
15 | ### Learning to Play CartPole and LunarLander with Proximal Policy Optimization
16 |
17 | #### Implementing PPO from scratch with PyTorch
18 |
19 | 
20 |
21 | In this post, we will train an RL agent to play two control-based games:
22 |
23 | * [https://gym.openai.com/envs/LunarLander-v2/](https://gym.openai.com/envs/LunarLander-v2/)
24 | * [https://gym.openai.com/envs/CartPole-v1/](https://gym.openai.com/envs/CartPole-v1/)
25 |
26 | Our agent will be trained using an algorithm called Proximal Policy
27 | Optimization. We will implement this approach from scratch using PyTorch and
28 | OpenAI Gym.
29 |
30 | This project is based on the following paper:
31 |
32 | * [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347)
33 |
34 | #### Gym:
35 |
36 | The basic idea behind OpenAI Gym is that we define an environment env by
37 | calling:
38 |
39 | env = gym.make(env_name)
40 |
41 | Then at each time step **t**, we pick an action **a_t** and get a new state
42 | **s_(t+1)** and a reward **r_t**. The objective is to train an agent that
43 | learns a policy PI that predicts, for each state, the action that maximizes
44 | the sum of the future rewards. For example, in the LunarLander environment, we
45 | get the maximum reward if we land the rocket smoothly on top of the landing
46 | area. In the CartPole environment, the objective is to keep the pole vertical
47 | for as long as possible.
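
As an illustration, the basic interaction loop with a Gym environment looks like the following minimal sketch (a random agent, using the classic `gym` step API that this project targets):

```
import gym

env = gym.make("LunarLander-v2")
observation = env.reset()

for _ in range(1000):
    action = env.action_space.sample()  # random action; the agent will learn a better policy
    observation, reward, done, info = env.step(action)
    if done:
        observation = env.reset()

env.close()
```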
48 |
49 | 
50 |
51 | #### PPO:
52 |
53 | Our final objective is to learn a policy network that takes the state as
54 | input and outputs a probability distribution over the actions, so as to
55 | maximize the expected reward.
56 |
57 | Implementing PPO goes as follows:
58 |
59 | * First, we start with a policy PI_old
60 | * We sample some trajectories from PI_old
61 | * For each action **a** in each trajectory we compute the Advantage, a kind of
62 | measure of how much better the action **a** is compared to other possible
63 |   actions at state **s_t**.
64 | * For a few epochs, we maximize the following objective with gradient ascent:
65 |
66 | 
67 |
68 | From [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347)
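
The advantage term in this objective is computed with Generalized Advantage Estimation (GAE). Here is a minimal sketch of that computation, mirroring what `Episode.end_episode` does in `PPO/replay.py` (the helper name is only for illustration):

```
import numpy as np


def gae_advantages(rewards, values, last_value, gamma=0.99, lambd=0.95):
    # Bootstrap the trajectory with the value estimate of the final state.
    rewards = np.array(list(rewards) + [last_value], dtype=float)
    values = np.array(list(values) + [last_value], dtype=float)

    # TD residuals: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards[:-1] + gamma * values[1:] - values[:-1]

    # Advantages: discounted sum of the residuals with factor gamma * lambda.
    advantages = np.zeros(len(deltas))
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lambd * running
        advantages[t] = running
    return advantages
```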
69 |
70 | This loss **increases the probability** of action a_t at state s_t if it has
71 | a **positive advantage** and **decreases the probability** in the case of a
72 | **negative advantage**. However, in practice this ratio of probabilities can
73 | grow without bound, making the training unstable. The authors propose a
74 | clipped version of the loss to solve this issue:
75 |
76 | 
77 |
78 | [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347)
79 |
80 | We also add two additional terms to the loss: a mean squared error on the
81 | value function, and an entropy bonus that encourages exploration during the
82 | sampling of the trajectories.
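
Concretely, the policy loss used in `PPO/model.py` boils down to the sketch below; the value network is trained separately with a mean squared error against the discounted rewards-to-go. The function here is illustrative, not the repository's exact API:

```
import torch


def ppo_policy_loss(new_log_probs, old_log_probs, advantages, entropy,
                    epsilon_clip=0.2, entropy_coef=0.01):
    # Ratio between the updated policy and the policy that sampled the data.
    ratios = torch.exp(new_log_probs - old_log_probs)
    clipped_ratios = torch.clamp(ratios, 1 - epsilon_clip, 1 + epsilon_clip)

    # Clipped surrogate objective: keep the more pessimistic of the two estimates.
    surrogate_1 = ratios * advantages
    surrogate_2 = clipped_ratios * advantages
    clipped_objective = torch.min(surrogate_1, surrogate_2)

    # Negate to turn the objective into a loss and subtract an entropy bonus.
    return -clipped_objective.mean() - entropy_coef * entropy.mean()
```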
83 |
84 | We can plot the sum of the rewards for each episode as the training
85 | progresses:
86 |
87 | 
88 |
89 | Training reward for LunarLander
90 |
91 | #### Trained Agents:
92 |
93 | Now we get to see the trained policy network in action.
94 |
95 | * CartPole
96 |
97 | 
98 |
99 | * LunarLander
100 |
101 | 
102 |
103 | Perfect Landing!
104 |
105 | #### Other resources:
106 |
107 | If you are interested in learning more about PPO or policy gradient
108 | reinforcement learning methods, I recommend following this course:
109 | [https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc](https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc)
110 | by Sergey Levine at Berkeley. The course is very long and math-heavy, but the
111 | instructor is really good.
112 |
113 | #### Code:
114 |
115 | [https://github.com/CVxTz/RL](https://github.com/CVxTz/RL)
116 |
--------------------------------------------------------------------------------
/PPO/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch.distributions import Categorical
5 |
6 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7 |
8 |
9 | class PolicyNetwork(torch.nn.Module):
10 | def __init__(self, n=4, in_dim=128):
11 | super(PolicyNetwork, self).__init__()
12 |
13 | self.fc1 = torch.nn.Linear(in_dim, 128)
14 | self.fc2 = torch.nn.Linear(128, 128)
15 | self.fc3 = torch.nn.Linear(128, 128)
16 |
17 | self.fc4 = torch.nn.Linear(128, n)
18 |
19 | self.l_relu = torch.nn.LeakyReLU(0.1)
20 |
21 | def forward(self, x):
22 |
23 | x = self.l_relu(self.fc1(x))
24 | x = self.l_relu(self.fc2(x))
25 | x = self.l_relu(self.fc3(x))
26 |
27 | y = self.fc4(x)
28 |
29 | y = F.softmax(y, dim=-1)
30 |
31 | return y
32 |
33 | def sample_action(self, state):
34 |
35 |         if not isinstance(state, torch.Tensor):
36 | state = torch.from_numpy(state).float().to(device)
37 |
38 | if len(state.size()) == 1:
39 | state = state.unsqueeze(0)
40 |
41 | y = self(state)
42 |
43 | dist = Categorical(y)
44 |
45 | action = dist.sample()
46 |
47 | log_probability = dist.log_prob(action)
48 |
49 | return action.item(), log_probability.item()
50 |
51 | def best_action(self, state):
52 |
53 |         if not isinstance(state, torch.Tensor):
54 | state = torch.from_numpy(state).float().to(device)
55 |
56 | if len(state.size()) == 1:
57 | state = state.unsqueeze(0)
58 |
59 | y = self(state).squeeze()
60 |
61 | action = torch.argmax(y)
62 |
63 | return action.item()
64 |
65 | def evaluate_actions(self, states, actions):
66 | y = self(states)
67 |
68 | dist = Categorical(y)
69 |
70 | entropy = dist.entropy()
71 |
72 | log_probabilities = dist.log_prob(actions)
73 |
74 | return log_probabilities, entropy
75 |
76 |
77 | class ValueNetwork(torch.nn.Module):
78 | def __init__(self, in_dim=128):
79 | super(ValueNetwork, self).__init__()
80 |
81 | self.fc1 = torch.nn.Linear(in_dim, 128)
82 | self.fc2 = torch.nn.Linear(128, 128)
83 | self.fc3 = torch.nn.Linear(128, 128)
84 |
85 | self.fc4 = torch.nn.Linear(128, 1)
86 |
87 | self.l_relu = torch.nn.LeakyReLU(0.1)
88 |
89 | def forward(self, x):
90 |
91 | x = self.l_relu(self.fc1(x))
92 | x = self.l_relu(self.fc2(x))
93 | x = self.l_relu(self.fc3(x))
94 |
95 | y = self.fc4(x)
96 |
97 | return y.squeeze(1)
98 |
99 | def state_value(self, state):
100 |
101 |         if not isinstance(state, torch.Tensor):
102 | state = torch.from_numpy(state).float().to(device)
103 |
104 | if len(state.size()) == 1:
105 | state = state.unsqueeze(0)
106 |
107 | y = self(state)
108 |
109 | return y.item()
110 |
111 |
112 | def train_value_network(value_model, value_optimizer, data_loader, epochs=4):
113 | epochs_losses = []
114 |
115 | for i in range(epochs):
116 |
117 | losses = []
118 |
119 | for observations, _, _, _, rewards_to_go in data_loader:
120 | observations = observations.float().to(device)
121 | rewards_to_go = rewards_to_go.float().to(device)
122 |
123 | value_optimizer.zero_grad()
124 |
125 | values = value_model(observations)
126 |
127 | loss = F.mse_loss(values, rewards_to_go)
128 |
129 | loss.backward()
130 |
131 | value_optimizer.step()
132 |
133 | losses.append(loss.item())
134 |
135 | mean_loss = np.mean(losses)
136 |
137 | epochs_losses.append(mean_loss)
138 |
139 | return epochs_losses
140 |
141 |
142 | def ac_loss(new_log_probabilities, old_log_probabilities, advantages, epsilon_clip=0.2):
143 | probability_ratios = torch.exp(new_log_probabilities - old_log_probabilities)
144 |     clipped_probability_ratios = torch.clamp(
145 | probability_ratios, 1 - epsilon_clip, 1 + epsilon_clip
146 | )
147 |
148 | surrogate_1 = probability_ratios * advantages
149 |     surrogate_2 = clipped_probability_ratios * advantages
150 |
151 | return -torch.min(surrogate_1, surrogate_2)
152 |
153 |
154 | def train_policy_network(
155 | policy_model, policy_optimizer, data_loader, epochs=4, clip=0.2
156 | ):
157 | epochs_losses = []
158 |
159 |     c1 = 0.01  # entropy bonus coefficient
160 |
161 | for i in range(epochs):
162 |
163 | losses = []
164 |
165 | for observations, actions, advantages, log_probabilities, _ in data_loader:
166 | observations = observations.float().to(device)
167 | actions = actions.long().to(device)
168 | advantages = advantages.float().to(device)
169 | old_log_probabilities = log_probabilities.float().to(device)
170 |
171 | policy_optimizer.zero_grad()
172 |
173 | new_log_probabilities, entropy = policy_model.evaluate_actions(
174 | observations, actions
175 | )
176 |
177 | loss = (
178 | ac_loss(
179 | new_log_probabilities,
180 | old_log_probabilities,
181 | advantages,
182 | epsilon_clip=clip,
183 | ).mean()
184 | - c1 * entropy.mean()
185 | )
186 |
187 | loss.backward()
188 |
189 | policy_optimizer.step()
190 |
191 | losses.append(loss.item())
192 |
193 | mean_loss = np.mean(losses)
194 |
195 | epochs_losses.append(mean_loss)
196 |
197 | return epochs_losses
198 |
--------------------------------------------------------------------------------
/tests/PPO/test_model.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 | import torch
4 | import torch.optim as optim
5 | from torch.utils.data import DataLoader
6 |
7 | from PPO.model import (
8 | PolicyNetwork,
9 | ValueNetwork,
10 | device,
11 | train_value_network,
12 | train_policy_network,
13 | )
14 | from PPO.replay import Episode, History
15 |
16 |
17 | def test_model_1():
18 | env = gym.make("LunarLander-v2")
19 |
20 | observation = env.reset()
21 |
22 | n_actions = env.action_space.n
23 | feature_dim = observation.size
24 |
25 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim)
26 |
27 | policy_model.to(device)
28 |
29 | action, log_probability = policy_model.sample_action(observation)
30 |
31 | assert action in list(range(n_actions))
32 |
33 |
34 | def test_model_2():
35 | env = gym.make("LunarLander-v2")
36 |
37 | observation = env.reset()
38 |
39 | n_actions = env.action_space.n
40 | feature_dim = observation.size
41 |
42 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim)
43 | policy_model.to(device)
44 |
45 | observations = [observation / i for i in range(1, 11)]
46 |
47 | observations = torch.from_numpy(np.array(observations)).to(device)
48 |
49 | probs = policy_model(observations)
50 |
51 | assert list(probs.size()) == [10, n_actions]
52 |
53 | assert abs(probs[0, :].sum().item() - 1) < 1e-3
54 |
55 |
56 | def test_model_3():
57 | env = gym.make("LunarLander-v2")
58 |
59 | observation = env.reset()
60 |
61 | n_actions = env.action_space.n
62 | feature_dim = observation.size
63 |
64 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim)
65 | policy_model.to(device)
66 |
67 | observations = [observation / i for i in range(1, 11)]
68 |
69 | actions = [i % 4 for i in range(1, 11)]
70 |
71 | observations = torch.from_numpy(np.array(observations)).to(device)
72 | actions = torch.IntTensor(actions).to(device)
73 |
74 | log_probabilities, entropy = policy_model.evaluate_actions(observations, actions)
75 |
76 | assert list(log_probabilities.size()) == [10]
77 | assert list(entropy.size()) == [10]
78 |
79 |
80 | def test_history_episode_model():
81 | reward_scale = 20
82 |
83 | env = gym.make("LunarLander-v2")
84 | observation = env.reset()
85 |
86 | n_actions = env.action_space.n
87 | feature_dim = observation.size
88 |
89 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim).to(device)
90 | value_model = ValueNetwork(in_dim=feature_dim).to(device)
91 |
92 | max_episodes = 10
93 | max_timesteps = 100
94 |
95 | reward_sum = 0
96 | ite = 0
97 |
98 | history = History()
99 |
100 | for episode_i in range(max_episodes):
101 |
102 | observation = env.reset()
103 | episode = Episode()
104 |
105 | for timestep in range(max_timesteps):
106 |
107 | action, log_probability = policy_model.sample_action(observation)
108 | value = value_model.state_value(observation)
109 |
110 | new_observation, reward, done, info = env.step(action)
111 |
112 | episode.append(
113 | observation=observation,
114 | action=action,
115 | reward=reward,
116 | value=value,
117 | log_probability=log_probability,
118 | reward_scale=reward_scale,
119 | )
120 |
121 | observation = new_observation
122 |
123 | reward_sum += reward
124 | ite += 1
125 |
126 | if done:
127 | episode.end_episode(last_value=np.random.uniform())
128 | break
129 |
130 | if timestep == max_timesteps - 1:
131 | episode.end_episode(last_value=0)
132 |
133 | history.add_episode(episode)
134 |
135 | history.build_dataset()
136 |
137 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5
138 |
139 | assert len(history.rewards) == ite
140 |
141 | assert abs(np.mean(history.advantages)) <= 1e-10
142 |
143 | assert abs(np.std(history.advantages) - 1) <= 1e-3
144 |
145 |
146 | def test_value_network():
147 | env = gym.make("LunarLander-v2")
148 | observation = env.reset()
149 |
150 | n_actions = env.action_space.n
151 | feature_dim = observation.size
152 |
153 | n_epoch = 4
154 |
155 | max_episodes = 10
156 | max_timesteps = 100
157 |
158 | reward_sum = 0
159 | ite = 0
160 |
161 | history = History()
162 |
163 | for episode_i in range(max_episodes):
164 |
165 | observation = env.reset()
166 | episode = Episode()
167 |
168 | for timestep in range(max_timesteps):
169 |
170 | action = env.action_space.sample()
171 |
172 | new_observation, reward, done, info = env.step(action)
173 |
174 | episode.append(
175 | observation=observation,
176 | action=action,
177 | reward=reward,
178 | value=ite,
179 | log_probability=np.log(1 / n_actions),
180 | )
181 |
182 | observation = new_observation
183 |
184 | reward_sum += reward
185 | ite += 1
186 |
187 | if done:
188 | episode.end_episode(last_value=np.random.uniform())
189 | break
190 |
191 | if timestep == max_timesteps - 1:
192 | episode.end_episode(last_value=0)
193 |
194 | history.add_episode(episode)
195 |
196 | history.build_dataset()
197 |
198 | value_model = ValueNetwork(in_dim=feature_dim).to(device)
199 | value_optimizer = optim.Adam(value_model.parameters(), lr=0.001)
200 |
201 | data_loader = DataLoader(history, batch_size=64, shuffle=True)
202 |
203 | epochs_losses = train_value_network(
204 | value_model, value_optimizer, data_loader, epochs=n_epoch
205 | )
206 |
207 | assert epochs_losses[0] > epochs_losses[-1]
208 |
209 |
210 | def test_policy_network():
211 | env = gym.make("LunarLander-v2")
212 | observation = env.reset()
213 |
214 | n_actions = env.action_space.n
215 | feature_dim = observation.size
216 |
217 | n_epoch = 4
218 |
219 | max_episodes = 10
220 | max_timesteps = 100
221 |
222 | reward_sum = 0
223 | ite = 0
224 |
225 | history = History()
226 |
227 | for episode_i in range(max_episodes):
228 |
229 | observation = env.reset()
230 | episode = Episode()
231 |
232 | for timestep in range(max_timesteps):
233 |
234 | action = env.action_space.sample()
235 |
236 | new_observation, reward, done, info = env.step(action)
237 |
238 | episode.append(
239 | observation=observation,
240 | action=action,
241 | reward=reward,
242 | value=ite,
243 | log_probability=np.log(1 / n_actions),
244 | )
245 |
246 | observation = new_observation
247 |
248 | reward_sum += reward
249 | ite += 1
250 |
251 | if done:
252 | episode.end_episode(last_value=np.random.uniform())
253 | break
254 |
255 | if timestep == max_timesteps - 1:
256 | episode.end_episode(last_value=0)
257 |
258 | history.add_episode(episode)
259 |
260 | history.build_dataset()
261 |
262 | policy_model = PolicyNetwork(in_dim=feature_dim).to(device)
263 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=0.01)
264 |
265 | data_loader = DataLoader(history, batch_size=64, shuffle=True)
266 |
267 | epochs_losses = train_policy_network(
268 | policy_model, policy_optimizer, data_loader, epochs=n_epoch
269 | )
270 |
271 | assert epochs_losses[0] > epochs_losses[-1]
272 |
--------------------------------------------------------------------------------
/tests/PPO/test_replay.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import numpy as np
3 |
4 | from PPO.replay import cumulative_sum, Episode, History
5 |
6 |
7 | def test_cumulative_sum_1():
8 | array = [0, 1, 2, 3, 4, 5]
9 |
10 | cumulative_array = cumulative_sum(array)
11 |
12 | expected_cumulative_array = [15, 15, 14, 12, 9, 5]
13 |
14 | assert cumulative_array == expected_cumulative_array
15 |
16 |
17 | def test_cumulative_sum_2():
18 | array = [0, 1, 2, 3, 4, 5]
19 |
20 | cumulative_array = cumulative_sum(array, gamma=0.99)
21 |
22 | expected_cumulative_array = [
23 | 14.458431289499998,
24 | 14.604476049999999,
25 | 13.741895,
26 | 11.8605,
27 | 8.95,
28 | 5.0,
29 | ]
30 |
31 | assert cumulative_array == expected_cumulative_array
32 |
33 |
34 | def test_episode_1():
35 | episode = Episode(gamma=0.99, lambd=0.95)
36 |
37 | reward_scale = 20
38 |
39 | episode.append(
40 | observation=0,
41 | action=1,
42 | reward=0,
43 | value=0,
44 | log_probability=-1,
45 | reward_scale=reward_scale,
46 | )
47 | episode.append(
48 | observation=0,
49 | action=1,
50 | reward=1,
51 | value=0,
52 | log_probability=-1,
53 | reward_scale=reward_scale,
54 | )
55 | episode.append(
56 | observation=0,
57 | action=1,
58 | reward=2,
59 | value=0,
60 | log_probability=-1,
61 | reward_scale=reward_scale,
62 | )
63 | episode.append(
64 | observation=0,
65 | action=1,
66 | reward=3,
67 | value=0,
68 | log_probability=-1,
69 | reward_scale=reward_scale,
70 | )
71 | episode.append(
72 | observation=0,
73 | action=1,
74 | reward=4,
75 | value=0,
76 | log_probability=-1,
77 | reward_scale=reward_scale,
78 | )
79 | episode.append(
80 | observation=0,
81 | action=1,
82 | reward=5,
83 | value=0,
84 | log_probability=-1,
85 | reward_scale=reward_scale,
86 | )
87 | episode.end_episode(last_value=0)
88 |
89 | expected_rewards_to_go = [
90 | 0.722921564475,
91 | 0.7302238025,
92 | 0.68709475,
93 | 0.593025,
94 | 0.4475,
95 | 0.25,
96 | ]
97 |
98 | assert episode.rewards_to_go == expected_rewards_to_go
99 |
100 |
101 | def test_episode_2():
102 | episode = Episode(gamma=0.99, lambd=0.95)
103 |
104 | reward_scale = 20
105 |
106 | episode.append(
107 | observation=0,
108 | action=1,
109 | reward=0,
110 | value=0,
111 | log_probability=-1,
112 | reward_scale=reward_scale,
113 | )
114 | episode.append(
115 | observation=0,
116 | action=1,
117 | reward=1,
118 | value=0,
119 | log_probability=-1,
120 | reward_scale=reward_scale,
121 | )
122 | episode.append(
123 | observation=0,
124 | action=1,
125 | reward=2,
126 | value=1,
127 | log_probability=-1,
128 | reward_scale=reward_scale,
129 | )
130 | episode.append(
131 | observation=0,
132 | action=1,
133 | reward=3,
134 | value=2,
135 | log_probability=-1,
136 | reward_scale=reward_scale,
137 | )
138 | episode.append(
139 | observation=0,
140 | action=1,
141 | reward=4,
142 | value=3,
143 | log_probability=-1,
144 | reward_scale=reward_scale,
145 | )
146 | episode.append(
147 | observation=0,
148 | action=1,
149 | reward=5,
150 | value=5,
151 | log_probability=-1,
152 | reward_scale=reward_scale,
153 | )
154 | episode.end_episode(last_value=5)
155 |
156 | expected_advantages = [
157 | 4.694519008033593,
158 | 4.991514096792763,
159 | 4.201503558525,
160 | 3.3189830500000004,
161 | 2.3381000000000007,
162 | 0.20000000000000018,
163 | ]
164 | assert episode.advantages == expected_advantages
165 |
166 |
167 | def test_history_1():
168 | episode1 = Episode(gamma=0.99, lambd=0.95)
169 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1)
170 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1)
171 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1)
172 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1)
173 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1)
174 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1)
175 | episode1.end_episode(last_value=5)
176 |
177 | episode2 = Episode(gamma=0.99, lambd=0.95)
178 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1)
179 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1)
180 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1)
181 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1)
182 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1)
183 | episode2.end_episode(last_value=0)
184 |
185 | history = History()
186 |
187 | history.add_episode(episode1)
188 | history.add_episode(episode2)
189 |
190 | history.build_dataset()
191 |
192 | assert len(history) == 11
193 | assert abs(np.mean(history.advantages)) <= 1e-10
194 | assert abs(np.std(history.advantages) - 1) <= 1e-3
195 |
196 |
197 | def test_history_2():
198 | episode1 = Episode(gamma=0.99, lambd=0.95)
199 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1)
200 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1)
201 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1)
202 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1)
203 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1)
204 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1)
205 | episode1.end_episode(last_value=5)
206 |
207 | episode2 = Episode(gamma=0.99, lambd=0.95)
208 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1)
209 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1)
210 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1)
211 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1)
212 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1)
213 | episode2.end_episode(last_value=0)
214 |
215 | history = History()
216 |
217 | history.add_episode(episode1)
218 | history.add_episode(episode2)
219 |
220 | history.build_dataset()
221 |
222 | history.free_memory()
223 |
224 | assert len(history) == 0
225 | assert len(history.rewards) == 0
226 | assert len(history.advantages) == 0
227 | assert len(history.log_probabilities) == 0
228 | assert len(history.rewards_to_go) == 0
229 | assert len(history.episodes) == 0
230 |
231 |
232 | def test_history_episode():
233 | reward_scale = 20
234 |
235 | env = gym.make("LunarLander-v2")
236 | observation = env.reset()
237 |
238 | n_actions = env.action_space.n
239 | feature_dim = observation.size
240 |
241 | max_episodes = 10
242 | max_timesteps = 100
243 |
244 | reward_sum = 0
245 | ite = 0
246 |
247 | history = History()
248 |
249 | for episode_i in range(max_episodes):
250 |
251 | observation = env.reset()
252 | episode = Episode()
253 |
254 | for timestep in range(max_timesteps):
255 |
256 | action = env.action_space.sample()
257 |
258 | new_observation, reward, done, info = env.step(action)
259 |
260 | episode.append(
261 | observation=observation,
262 | action=action,
263 | reward=reward,
264 | value=ite,
265 | log_probability=np.log(1 / n_actions),
266 | reward_scale=reward_scale,
267 | )
268 |
269 | observation = new_observation
270 |
271 | reward_sum += reward
272 | ite += 1
273 |
274 | if done:
275 | episode.end_episode(last_value=np.random.uniform())
276 | break
277 |
278 | if timestep == max_timesteps - 1:
279 | episode.end_episode(last_value=0)
280 |
281 | history.add_episode(episode)
282 |
283 | history.build_dataset()
284 |
285 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5
286 |
287 | assert len(history.rewards) == ite
288 |
289 | assert abs(np.mean(history.advantages)) <= 1e-10
290 |
291 | assert abs(np.std(history.advantages) - 1) <= 1e-3
292 |
293 | assert np.std(history.log_probabilities) <= 1e-3
294 |
295 | assert (
296 | abs(
297 | sum([v for episode in history.episodes for v in episode.values])
298 | - ite * (ite - 1) / 2
299 | )
300 | <= 1e-3
301 | )
302 |
303 | assert history.observations[-1].shape[0] == feature_dim
304 |
305 | assert (
306 | abs(
307 | len([a for a in history.actions if a == 0])
308 | - len(history.actions) / n_actions
309 | )
310 | < 30
311 | )
312 |
--------------------------------------------------------------------------------