├── PPO ├── __init__.py ├── render.py ├── replay.py ├── ppo.py └── model.py ├── requirements.txt ├── setup.py ├── LICENSE ├── .gitignore ├── README.md └── tests └── PPO ├── test_model.py └── test_replay.py /PPO/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.50.2 2 | torch==1.6.0 3 | gym==0.17.3 4 | numpy==1.19.2 5 | Pillow==8.0.1 6 | imageio 7 | opencv-python 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | 6 | # Utility function to read the README file. 7 | # Used for the long_description. It's nice, because now 1) we have a top level 8 | # README file and 2) it's easier to type in the README file than to put a raw 9 | # string in below ... 10 | def read(fname): 11 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 12 | 13 | 14 | setup( 15 | name="rl-from-scratch", 16 | version="0.0.2", 17 | author="Youness MANSAR", 18 | author_email="mansaryounessecp@gmail.com", 19 | description="Proximal Policy Optimization (PPO) implemented from scratch with PyTorch", 20 | license="MIT", 21 | keywords="reinforcement-learning ppo pytorch", 22 | url="https://github.com/CVxTz/RL", 23 | packages=["PPO"], 24 | classifiers=[ 25 | "Development Status :: 3 - Alpha", 26 | "Topic :: Utilities", 27 | "License :: OSI Approved :: MIT License", 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Mansar Youness 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /PPO/render.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | from PIL import Image 4 | from tqdm import tqdm 5 | import random 6 | import imageio 7 | import cv2 8 | import numpy as np 9 | 10 | 11 | from PPO.model import ( 12 | PolicyNetwork, 13 | ValueNetwork, 14 | device, 15 | ) 16 | 17 | 18 | def write_on_image(img, reward): 19 | 20 | cv2.putText( 21 | img, 22 | f"Sum Reward: {int(reward)}", 23 | (0, img.shape[0] - 20), 24 | cv2.FONT_HERSHEY_SIMPLEX, 25 | 1, 26 | (0, 0, 0), 27 | 2, 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser() 35 | 36 | parser.add_argument("--policy_path") 37 | parser.add_argument("--env_name") 38 | parser.add_argument("--n_episodes", type=int, default=5) 39 | parser.add_argument("--max_timesteps", type=int, default=400) 40 | 41 | parser.add_argument("--out_gif") 42 | 43 | state_scale = 1.0 44 | 45 | args = parser.parse_args() 46 | 47 | policy_path = args.policy_path 48 | env_name = args.env_name 49 | 50 | n_episodes = args.n_episodes 51 | max_timesteps = args.max_timesteps 52 | 53 | out_gif = args.out_gif 54 | 55 | env = gym.make(env_name) 56 | observation = env.reset() 57 | n_actions = env.action_space.n 58 | feature_dim = observation.size 59 | 60 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) 61 | 62 | policy_model.load_state_dict(torch.load(policy_path)) 63 | 64 | frames = [] 65 | sum_reward = 0 66 | for _ in tqdm(range(n_episodes)): 67 | observation = env.reset() 68 | 69 | for timestep in range(max_timesteps): 70 | 71 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array"))) 72 | 73 | write_on_image(frames[-1], sum_reward) 74 | 75 | action = policy_model.best_action(observation / state_scale) 76 | 77 | new_observation, reward, done, info = env.step(action) 78 | sum_reward += reward 79 | 80 | if done: 81 | for a in range(10): 82 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array"))) 83 | write_on_image(frames[-1], sum_reward) 84 | break 85 | 86 | observation = new_observation 87 | 88 | imageio.mimsave(out_gif, frames, fps=60) 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /PPO/replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from torch.utils.data import Dataset 4 | 5 | 6 | def cumulative_sum(array, gamma=1.0): 7 | curr = 0 8 | cumulative_array = [] 9 | 10 | for a in array[::-1]: 11 | curr = a + gamma * curr 12 | cumulative_array.append(curr) 13 | 14 | return cumulative_array[::-1] 15 | 16 | 17 | class Episode: 18 | def __init__(self, gamma=0.99, lambd=0.95): 19 | self.observations = [] 20 | self.actions = [] 21 | self.advantages = [] 22 | self.rewards = [] 23 | self.rewards_to_go = [] 24 | self.values = [] 25 | self.log_probabilities = [] 26 | self.gamma = gamma 27 | self.lambd = lambd 28 | 29 | def append( 30 | self, observation, action, reward, value, log_probability, reward_scale=20 31 | ): 32 | self.observations.append(observation) 33 | self.actions.append(action) 34 | self.rewards.append(reward / reward_scale) 35 | self.values.append(value) 36 | self.log_probabilities.append(log_probability) 37 | 38 | def end_episode(self, last_value): 39 | rewards = np.array(self.rewards + [last_value]) 40 | values = np.array(self.values + [last_value]) 41 | 42 | deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1] 43 | 44 | self.advantages = cumulative_sum(deltas.tolist(), gamma=self.gamma * self.lambd) 45 | 46 | self.rewards_to_go = cumulative_sum(rewards.tolist(), gamma=self.gamma)[:-1] 47 | 48 | 49 | def normalize_list(array): 50 | array = np.array(array) 51 | array = (array - np.mean(array)) / (np.std(array) + 1e-5) 52 | return array.tolist() 53 | 54 | 55 | class History(Dataset): 56 | def __init__(self): 57 | 
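        # Flat per-step buffers; build_dataset() fills them by concatenating the
        # lists stored in each Episode so that History can be indexed like a
        # regular torch Dataset by the DataLoader.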
self.episodes = [] 58 | self.observations = [] 59 | self.actions = [] 60 | self.advantages = [] 61 | self.rewards = [] 62 | self.rewards_to_go = [] 63 | self.log_probabilities = [] 64 | 65 | def free_memory(self): 66 | del self.episodes[:] 67 | del self.observations[:] 68 | del self.actions[:] 69 | del self.advantages[:] 70 | del self.rewards[:] 71 | del self.rewards_to_go[:] 72 | del self.log_probabilities[:] 73 | 74 | def add_episode(self, episode): 75 | self.episodes.append(episode) 76 | 77 | def build_dataset(self): 78 | for episode in self.episodes: 79 | self.observations += episode.observations 80 | self.actions += episode.actions 81 | self.advantages += episode.advantages 82 | self.rewards += episode.rewards 83 | self.rewards_to_go += episode.rewards_to_go 84 | self.log_probabilities += episode.log_probabilities 85 | 86 | assert ( 87 | len( 88 | { 89 | len(self.observations), 90 | len(self.actions), 91 | len(self.advantages), 92 | len(self.rewards), 93 | len(self.rewards_to_go), 94 | len(self.log_probabilities), 95 | } 96 | ) 97 | == 1 98 | ) 99 | 100 | self.advantages = normalize_list(self.advantages) 101 | 102 | def __len__(self): 103 | return len(self.observations) 104 | 105 | def __getitem__(self, idx): 106 | return ( 107 | self.observations[idx], 108 | self.actions[idx], 109 | self.advantages[idx], 110 | self.log_probabilities[idx], 111 | self.rewards_to_go[idx], 112 | ) 113 | -------------------------------------------------------------------------------- /PPO/ppo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.optim as optim 7 | from torch.utils.data import DataLoader 8 | from torch.utils.tensorboard import SummaryWriter 9 | from tqdm import tqdm 10 | 11 | from PPO.model import ( 12 | PolicyNetwork, 13 | ValueNetwork, 14 | device, 15 | train_value_network, 16 | train_policy_network, 17 | ) 18 | from PPO.replay import Episode, History 19 | 20 | 21 | def main( 22 | env_name="LunarLander-v2", 23 | reward_scale=20.0, 24 | clip=0.2, 25 | log_dir="../logs", 26 | learning_rate=0.001, 27 | state_scale=1.0, 28 | ): 29 | writer = SummaryWriter(log_dir=log_dir, filename_suffix=env_name, comment=env_name) 30 | 31 | env = gym.make(env_name) 32 | observation = env.reset() 33 | 34 | n_actions = env.action_space.n 35 | feature_dim = observation.size 36 | 37 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 38 | value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate) 39 | 40 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) 41 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate) 42 | 43 | n_epoch = 4 44 | 45 | max_episodes = 20 46 | max_timesteps = 400 47 | 48 | batch_size = 32 49 | 50 | max_iterations = 200 51 | 52 | history = History() 53 | 54 | epoch_ite = 0 55 | episode_ite = 0 56 | 57 | for ite in tqdm(range(max_iterations)): 58 | 59 | if ite % 50 == 0: 60 | torch.save( 61 | policy_model.state_dict(), 62 | Path(log_dir) / (env_name + f"_{str(ite)}_policy.pth"), 63 | ) 64 | torch.save( 65 | value_model.state_dict(), 66 | Path(log_dir) / (env_name + f"_{str(ite)}_value.pth"), 67 | ) 68 | 69 | for episode_i in range(max_episodes): 70 | 71 | observation = env.reset() 72 | episode = Episode() 73 | 74 | for timestep in range(max_timesteps): 75 | 76 | action, log_probability = policy_model.sample_action( 77 | observation / state_scale 78 | ) 79 | value = 
value_model.state_value(observation / state_scale) 80 | 81 | new_observation, reward, done, info = env.step(action) 82 | 83 | episode.append( 84 | observation=observation / state_scale, 85 | action=action, 86 | reward=reward, 87 | value=value, 88 | log_probability=log_probability, 89 | reward_scale=reward_scale, 90 | ) 91 | 92 | observation = new_observation 93 | 94 | if done: 95 | episode.end_episode(last_value=0) 96 | break 97 | 98 | if timestep == max_timesteps - 1: 99 | value = value_model.state_value(observation / state_scale) 100 | episode.end_episode(last_value=value) 101 | 102 | episode_ite += 1 103 | writer.add_scalar( 104 | "Average Episode Reward", 105 | reward_scale * np.sum(episode.rewards), 106 | episode_ite, 107 | ) 108 | writer.add_scalar( 109 | "Average Probabilities", 110 | np.exp(np.mean(episode.log_probabilities)), 111 | episode_ite, 112 | ) 113 | 114 | history.add_episode(episode) 115 | 116 | history.build_dataset() 117 | data_loader = DataLoader(history, batch_size=batch_size, shuffle=True) 118 | 119 | policy_loss = train_policy_network( 120 | policy_model, policy_optimizer, data_loader, epochs=n_epoch, clip=clip 121 | ) 122 | 123 | value_loss = train_value_network( 124 | value_model, value_optimizer, data_loader, epochs=n_epoch 125 | ) 126 | 127 | for p_l, v_l in zip(policy_loss, value_loss): 128 | epoch_ite += 1 129 | writer.add_scalar("Policy Loss", p_l, epoch_ite) 130 | writer.add_scalar("Value Loss", v_l, epoch_ite) 131 | 132 | history.free_memory() 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | main( 138 | reward_scale=20.0, 139 | clip=0.2, 140 | env_name="LunarLander-v2", 141 | learning_rate=0.001, 142 | state_scale=1.0, 143 | log_dir="logs/Lunar" 144 | ) 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL 2 | 3 | ### PPO 4 | How to run: 5 | ``` 6 | python PPO/ppo.py 7 | ``` 8 | Test Policy: 9 | ``` 10 | python PPO/render.py --policy_path logs/Lunar/LunarLander-v2_100_policy.pth \ 11 | --env_name LunarLander-v2 --out_gif logs/lunar_late.gif 12 | 13 | ``` 14 | 15 | ### Learning to Play CartPole and LunarLander with Proximal Policy Optimization 16 | 17 | #### Implementing PPO from scratch with Pytorch 18 | 19 | ![](https://cdn-images-1.medium.com/max/800/1*WiKzN5tiKqettn8yeLj-MQ.gif) 20 | 21 | In this post, we will train an RL agent to play two control based games: 22 | 23 | * [https://gym.openai.com/envs/LunarLander-v2/](https://gym.openai.com/envs/LunarLander-v2/) 24 | * [https://gym.openai.com/envs/CartPole-v1/](https://gym.openai.com/envs/CartPole-v1/) 25 | 26 | Our agent will be trained using an algorithm called Proximal Policy 27 | Optimization. We will implement this approach from scratch using PyTorch and 28 | OpenAi gym. 29 | 30 | This project is based on the following paper: 31 | 32 | * [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 33 | 34 | #### Gym: 35 | 36 | The basic idea behind OpenAI Gym is that we define an environment env by 37 | calling: 38 | 39 | env = gym.make(env_name) 40 | 41 | Then at each time step **t**, we pick an action **a** and we get a new 42 | state_(t+1) and a reward **reward_t**. The objective is to train an Agent that 43 | learns a policy PI that can predict for each state the best action that will 44 | maximize the sum of the future rewards. 
For example, in the environment 45 | LunarLander, we get the maximum reward if we land the rocket smoothly on top of 46 | the landing area. In the environment CartPole, the objective is to keep the pole 47 | vertical for as long as possible. 48 | 49 | ![](https://cdn-images-1.medium.com/max/800/1*RxpfmLGwZR8kEVOlxbSjzQ.gif) 50 | 51 | #### PPO: 52 | 53 | Our final objective is to learn a policy network that will take the state as 54 | input and then output a probability distribution over the actions that will 55 | maximize the expected reward. 56 | 57 | Implementing PPO goes as follows: 58 | 59 | * First, we start with a policy PI_old 60 | * We sample some trajectories from PI_old 61 | * For each action **a** in each trajectory we compute the Advantage, a 62 | measure of how much better the action **a** is compared to the other possible 63 | actions at state_t. 64 | * For a few epochs, we maximize the following objective with gradient ascent: 65 | 66 | ![](https://cdn-images-1.medium.com/max/800/1*cnjyaLHg0QynODhIXkYYSg.png) 67 | 68 | From [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 69 | 70 | This loss **increases the probability** of action a_t at 71 | state s_t if it has a **positive advantage** and **decreases the probability** 72 | in the case of a **negative advantage**. However, in practice this ratio of 73 | probabilities tends to diverge to infinity, making the training unstable. The 74 | authors propose a clipped version of the loss to solve this issue: 75 | 76 | ![](https://cdn-images-1.medium.com/max/800/1*ocve-gRQDzkXVov-yTtZuA.png) 77 | 78 | [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 79 | 80 | We also add two additional terms to the loss: a mean squared error over the 81 | value function and an entropy bonus that encourages exploration during the 82 | sampling of the trajectories. 83 | 84 | We can plot the sum of the rewards for each episode as the training 85 | progresses: 86 | 87 | ![](https://cdn-images-1.medium.com/max/1200/1*MIF6DYmlJT9yLQFDafmOJA.png) 88 | 89 | Training reward for LunarLander 90 | 91 | #### Trained Agents: 92 | 93 | Now we get to see the trained policy network in action. 94 | 95 | * CartPole 96 | 97 | ![](https://cdn-images-1.medium.com/max/800/1*_ddmwllJuY-9Zvh8x6PCbQ.gif) 98 | 99 | * LunarLander 100 | 101 | ![](https://cdn-images-1.medium.com/max/800/1*tKbe-gnp6VujnrQ2YwEZ-g.gif) 102 | 103 | Perfect Landing! 104 | 105 | #### Other resources: 106 | 107 | If you are interested in learning more about PPO or policy gradient 108 | reinforcement learning methods, I recommend following this course: 109 | [https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc](https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc) 110 | by Sergey Levine at Berkeley. The course is very long and math-heavy but the 111 | instructor is really good.
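#### The clipped objective in code:

The clipped objective above maps almost line for line onto the `ac_loss` helper in `PPO/model.py`. The snippet below is a simplified sketch of that function (renamed here for readability); the batch mean and the entropy bonus with coefficient `c1 = 0.01` are applied in `train_policy_network`:

```
import torch


def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon_clip=0.2):
    # Probability ratio pi_new(a_t | s_t) / pi_old(a_t | s_t), computed from log-probabilities.
    ratios = torch.exp(new_log_probs - old_log_probs)
    clipped_ratios = torch.clamp(ratios, 1 - epsilon_clip, 1 + epsilon_clip)
    # Element-wise minimum of the two surrogates, negated so it can be minimized.
    return -torch.min(ratios * advantages, clipped_ratios * advantages)
```

Minimizing this loss for a few epochs on each sampled batch is what keeps every policy update close to the policy that generated the data.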
112 | 113 | #### Code : 114 | 115 | [https://github.com/CVxTz/RL](https://github.com/CVxTz/RL) 116 | -------------------------------------------------------------------------------- /PPO/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | 6 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 7 | 8 | 9 | class PolicyNetwork(torch.nn.Module): 10 | def __init__(self, n=4, in_dim=128): 11 | super(PolicyNetwork, self).__init__() 12 | 13 | self.fc1 = torch.nn.Linear(in_dim, 128) 14 | self.fc2 = torch.nn.Linear(128, 128) 15 | self.fc3 = torch.nn.Linear(128, 128) 16 | 17 | self.fc4 = torch.nn.Linear(128, n) 18 | 19 | self.l_relu = torch.nn.LeakyReLU(0.1) 20 | 21 | def forward(self, x): 22 | 23 | x = self.l_relu(self.fc1(x)) 24 | x = self.l_relu(self.fc2(x)) 25 | x = self.l_relu(self.fc3(x)) 26 | 27 | y = self.fc4(x) 28 | 29 | y = F.softmax(y, dim=-1) 30 | 31 | return y 32 | 33 | def sample_action(self, state): 34 | 35 | if not state is torch.Tensor: 36 | state = torch.from_numpy(state).float().to(device) 37 | 38 | if len(state.size()) == 1: 39 | state = state.unsqueeze(0) 40 | 41 | y = self(state) 42 | 43 | dist = Categorical(y) 44 | 45 | action = dist.sample() 46 | 47 | log_probability = dist.log_prob(action) 48 | 49 | return action.item(), log_probability.item() 50 | 51 | def best_action(self, state): 52 | 53 | if not state is torch.Tensor: 54 | state = torch.from_numpy(state).float().to(device) 55 | 56 | if len(state.size()) == 1: 57 | state = state.unsqueeze(0) 58 | 59 | y = self(state).squeeze() 60 | 61 | action = torch.argmax(y) 62 | 63 | return action.item() 64 | 65 | def evaluate_actions(self, states, actions): 66 | y = self(states) 67 | 68 | dist = Categorical(y) 69 | 70 | entropy = dist.entropy() 71 | 72 | log_probabilities = dist.log_prob(actions) 73 | 74 | return log_probabilities, entropy 75 | 76 | 77 | class ValueNetwork(torch.nn.Module): 78 | def __init__(self, in_dim=128): 79 | super(ValueNetwork, self).__init__() 80 | 81 | self.fc1 = torch.nn.Linear(in_dim, 128) 82 | self.fc2 = torch.nn.Linear(128, 128) 83 | self.fc3 = torch.nn.Linear(128, 128) 84 | 85 | self.fc4 = torch.nn.Linear(128, 1) 86 | 87 | self.l_relu = torch.nn.LeakyReLU(0.1) 88 | 89 | def forward(self, x): 90 | 91 | x = self.l_relu(self.fc1(x)) 92 | x = self.l_relu(self.fc2(x)) 93 | x = self.l_relu(self.fc3(x)) 94 | 95 | y = self.fc4(x) 96 | 97 | return y.squeeze(1) 98 | 99 | def state_value(self, state): 100 | 101 | if not state is torch.Tensor: 102 | state = torch.from_numpy(state).float().to(device) 103 | 104 | if len(state.size()) == 1: 105 | state = state.unsqueeze(0) 106 | 107 | y = self(state) 108 | 109 | return y.item() 110 | 111 | 112 | def train_value_network(value_model, value_optimizer, data_loader, epochs=4): 113 | epochs_losses = [] 114 | 115 | for i in range(epochs): 116 | 117 | losses = [] 118 | 119 | for observations, _, _, _, rewards_to_go in data_loader: 120 | observations = observations.float().to(device) 121 | rewards_to_go = rewards_to_go.float().to(device) 122 | 123 | value_optimizer.zero_grad() 124 | 125 | values = value_model(observations) 126 | 127 | loss = F.mse_loss(values, rewards_to_go) 128 | 129 | loss.backward() 130 | 131 | value_optimizer.step() 132 | 133 | losses.append(loss.item()) 134 | 135 | mean_loss = np.mean(losses) 136 | 137 | epochs_losses.append(mean_loss) 138 | 139 | return epochs_losses 140 | 
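# Clipped PPO surrogate objective (Schulman et al., 2017): the probability ratio
# exp(new_log_p - old_log_p) is clamped to [1 - epsilon_clip, 1 + epsilon_clip], the
# element-wise minimum of the two surrogates is kept, and the sign is flipped so the
# objective can be minimized with gradient descent. The entropy bonus is added by the
# caller (train_policy_network).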
141 | 142 | def ac_loss(new_log_probabilities, old_log_probabilities, advantages, epsilon_clip=0.2): 143 | probability_ratios = torch.exp(new_log_probabilities - old_log_probabilities) 144 | clipped_probabiliy_ratios = torch.clamp( 145 | probability_ratios, 1 - epsilon_clip, 1 + epsilon_clip 146 | ) 147 | 148 | surrogate_1 = probability_ratios * advantages 149 | surrogate_2 = clipped_probabiliy_ratios * advantages 150 | 151 | return -torch.min(surrogate_1, surrogate_2) 152 | 153 | 154 | def train_policy_network( 155 | policy_model, policy_optimizer, data_loader, epochs=4, clip=0.2 156 | ): 157 | epochs_losses = [] 158 | 159 | c1 = 0.01 160 | 161 | for i in range(epochs): 162 | 163 | losses = [] 164 | 165 | for observations, actions, advantages, log_probabilities, _ in data_loader: 166 | observations = observations.float().to(device) 167 | actions = actions.long().to(device) 168 | advantages = advantages.float().to(device) 169 | old_log_probabilities = log_probabilities.float().to(device) 170 | 171 | policy_optimizer.zero_grad() 172 | 173 | new_log_probabilities, entropy = policy_model.evaluate_actions( 174 | observations, actions 175 | ) 176 | 177 | loss = ( 178 | ac_loss( 179 | new_log_probabilities, 180 | old_log_probabilities, 181 | advantages, 182 | epsilon_clip=clip, 183 | ).mean() 184 | - c1 * entropy.mean() 185 | ) 186 | 187 | loss.backward() 188 | 189 | policy_optimizer.step() 190 | 191 | losses.append(loss.item()) 192 | 193 | mean_loss = np.mean(losses) 194 | 195 | epochs_losses.append(mean_loss) 196 | 197 | return epochs_losses 198 | -------------------------------------------------------------------------------- /tests/PPO/test_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch 4 | import torch.optim as optim 5 | from torch.utils.data import DataLoader 6 | 7 | from PPO.model import ( 8 | PolicyNetwork, 9 | ValueNetwork, 10 | device, 11 | train_value_network, 12 | train_policy_network, 13 | ) 14 | from PPO.replay import Episode, History 15 | 16 | 17 | def test_model_1(): 18 | env = gym.make("LunarLander-v2") 19 | 20 | observation = env.reset() 21 | 22 | n_actions = env.action_space.n 23 | feature_dim = observation.size 24 | 25 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 26 | 27 | policy_model.to(device) 28 | 29 | action, log_probability = policy_model.sample_action(observation) 30 | 31 | assert action in list(range(n_actions)) 32 | 33 | 34 | def test_model_2(): 35 | env = gym.make("LunarLander-v2") 36 | 37 | observation = env.reset() 38 | 39 | n_actions = env.action_space.n 40 | feature_dim = observation.size 41 | 42 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 43 | policy_model.to(device) 44 | 45 | observations = [observation / i for i in range(1, 11)] 46 | 47 | observations = torch.from_numpy(np.array(observations)).to(device) 48 | 49 | probs = policy_model(observations) 50 | 51 | assert list(probs.size()) == [10, n_actions] 52 | 53 | assert abs(probs[0, :].sum().item() - 1) < 1e-3 54 | 55 | 56 | def test_model_3(): 57 | env = gym.make("LunarLander-v2") 58 | 59 | observation = env.reset() 60 | 61 | n_actions = env.action_space.n 62 | feature_dim = observation.size 63 | 64 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 65 | policy_model.to(device) 66 | 67 | observations = [observation / i for i in range(1, 11)] 68 | 69 | actions = [i % 4 for i in range(1, 11)] 70 | 71 | observations = 
torch.from_numpy(np.array(observations)).to(device) 72 | actions = torch.IntTensor(actions).to(device) 73 | 74 | log_probabilities, entropy = policy_model.evaluate_actions(observations, actions) 75 | 76 | assert list(log_probabilities.size()) == [10] 77 | assert list(entropy.size()) == [10] 78 | 79 | 80 | def test_history_episode_model(): 81 | reward_scale = 20 82 | 83 | env = gym.make("LunarLander-v2") 84 | observation = env.reset() 85 | 86 | n_actions = env.action_space.n 87 | feature_dim = observation.size 88 | 89 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim).to(device) 90 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 91 | 92 | max_episodes = 10 93 | max_timesteps = 100 94 | 95 | reward_sum = 0 96 | ite = 0 97 | 98 | history = History() 99 | 100 | for episode_i in range(max_episodes): 101 | 102 | observation = env.reset() 103 | episode = Episode() 104 | 105 | for timestep in range(max_timesteps): 106 | 107 | action, log_probability = policy_model.sample_action(observation) 108 | value = value_model.state_value(observation) 109 | 110 | new_observation, reward, done, info = env.step(action) 111 | 112 | episode.append( 113 | observation=observation, 114 | action=action, 115 | reward=reward, 116 | value=value, 117 | log_probability=log_probability, 118 | reward_scale=reward_scale, 119 | ) 120 | 121 | observation = new_observation 122 | 123 | reward_sum += reward 124 | ite += 1 125 | 126 | if done: 127 | episode.end_episode(last_value=np.random.uniform()) 128 | break 129 | 130 | if timestep == max_timesteps - 1: 131 | episode.end_episode(last_value=0) 132 | 133 | history.add_episode(episode) 134 | 135 | history.build_dataset() 136 | 137 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5 138 | 139 | assert len(history.rewards) == ite 140 | 141 | assert abs(np.mean(history.advantages)) <= 1e-10 142 | 143 | assert abs(np.std(history.advantages) - 1) <= 1e-3 144 | 145 | 146 | def test_value_network(): 147 | env = gym.make("LunarLander-v2") 148 | observation = env.reset() 149 | 150 | n_actions = env.action_space.n 151 | feature_dim = observation.size 152 | 153 | n_epoch = 4 154 | 155 | max_episodes = 10 156 | max_timesteps = 100 157 | 158 | reward_sum = 0 159 | ite = 0 160 | 161 | history = History() 162 | 163 | for episode_i in range(max_episodes): 164 | 165 | observation = env.reset() 166 | episode = Episode() 167 | 168 | for timestep in range(max_timesteps): 169 | 170 | action = env.action_space.sample() 171 | 172 | new_observation, reward, done, info = env.step(action) 173 | 174 | episode.append( 175 | observation=observation, 176 | action=action, 177 | reward=reward, 178 | value=ite, 179 | log_probability=np.log(1 / n_actions), 180 | ) 181 | 182 | observation = new_observation 183 | 184 | reward_sum += reward 185 | ite += 1 186 | 187 | if done: 188 | episode.end_episode(last_value=np.random.uniform()) 189 | break 190 | 191 | if timestep == max_timesteps - 1: 192 | episode.end_episode(last_value=0) 193 | 194 | history.add_episode(episode) 195 | 196 | history.build_dataset() 197 | 198 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 199 | value_optimizer = optim.Adam(value_model.parameters(), lr=0.001) 200 | 201 | data_loader = DataLoader(history, batch_size=64, shuffle=True) 202 | 203 | epochs_losses = train_value_network( 204 | value_model, value_optimizer, data_loader, epochs=n_epoch 205 | ) 206 | 207 | assert epochs_losses[0] > epochs_losses[-1] 208 | 209 | 210 | def test_policy_network(): 211 | env = 
gym.make("LunarLander-v2") 212 | observation = env.reset() 213 | 214 | n_actions = env.action_space.n 215 | feature_dim = observation.size 216 | 217 | n_epoch = 4 218 | 219 | max_episodes = 10 220 | max_timesteps = 100 221 | 222 | reward_sum = 0 223 | ite = 0 224 | 225 | history = History() 226 | 227 | for episode_i in range(max_episodes): 228 | 229 | observation = env.reset() 230 | episode = Episode() 231 | 232 | for timestep in range(max_timesteps): 233 | 234 | action = env.action_space.sample() 235 | 236 | new_observation, reward, done, info = env.step(action) 237 | 238 | episode.append( 239 | observation=observation, 240 | action=action, 241 | reward=reward, 242 | value=ite, 243 | log_probability=np.log(1 / n_actions), 244 | ) 245 | 246 | observation = new_observation 247 | 248 | reward_sum += reward 249 | ite += 1 250 | 251 | if done: 252 | episode.end_episode(last_value=np.random.uniform()) 253 | break 254 | 255 | if timestep == max_timesteps - 1: 256 | episode.end_episode(last_value=0) 257 | 258 | history.add_episode(episode) 259 | 260 | history.build_dataset() 261 | 262 | policy_model = PolicyNetwork(in_dim=feature_dim).to(device) 263 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=0.01) 264 | 265 | data_loader = DataLoader(history, batch_size=64, shuffle=True) 266 | 267 | epochs_losses = train_policy_network( 268 | policy_model, policy_optimizer, data_loader, epochs=n_epoch 269 | ) 270 | 271 | assert epochs_losses[0] > epochs_losses[-1] 272 | -------------------------------------------------------------------------------- /tests/PPO/test_replay.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from PPO.replay import cumulative_sum, Episode, History 5 | 6 | 7 | def test_cumulative_sum_1(): 8 | array = [0, 1, 2, 3, 4, 5] 9 | 10 | cumulative_array = cumulative_sum(array) 11 | 12 | expected_cumulative_array = [15, 15, 14, 12, 9, 5] 13 | 14 | assert cumulative_array == expected_cumulative_array 15 | 16 | 17 | def test_cumulative_sum_2(): 18 | array = [0, 1, 2, 3, 4, 5] 19 | 20 | cumulative_array = cumulative_sum(array, gamma=0.99) 21 | 22 | expected_cumulative_array = [ 23 | 14.458431289499998, 24 | 14.604476049999999, 25 | 13.741895, 26 | 11.8605, 27 | 8.95, 28 | 5.0, 29 | ] 30 | 31 | assert cumulative_array == expected_cumulative_array 32 | 33 | 34 | def test_episode_1(): 35 | episode = Episode(gamma=0.99, lambd=0.95) 36 | 37 | reward_scale = 20 38 | 39 | episode.append( 40 | observation=0, 41 | action=1, 42 | reward=0, 43 | value=0, 44 | log_probability=-1, 45 | reward_scale=reward_scale, 46 | ) 47 | episode.append( 48 | observation=0, 49 | action=1, 50 | reward=1, 51 | value=0, 52 | log_probability=-1, 53 | reward_scale=reward_scale, 54 | ) 55 | episode.append( 56 | observation=0, 57 | action=1, 58 | reward=2, 59 | value=0, 60 | log_probability=-1, 61 | reward_scale=reward_scale, 62 | ) 63 | episode.append( 64 | observation=0, 65 | action=1, 66 | reward=3, 67 | value=0, 68 | log_probability=-1, 69 | reward_scale=reward_scale, 70 | ) 71 | episode.append( 72 | observation=0, 73 | action=1, 74 | reward=4, 75 | value=0, 76 | log_probability=-1, 77 | reward_scale=reward_scale, 78 | ) 79 | episode.append( 80 | observation=0, 81 | action=1, 82 | reward=5, 83 | value=0, 84 | log_probability=-1, 85 | reward_scale=reward_scale, 86 | ) 87 | episode.end_episode(last_value=0) 88 | 89 | expected_rewards_to_go = [ 90 | 0.722921564475, 91 | 0.7302238025, 92 | 0.68709475, 93 | 0.593025, 94 | 0.4475, 95 | 
0.25, 96 | ] 97 | 98 | assert episode.rewards_to_go == expected_rewards_to_go 99 | 100 | 101 | def test_episode_2(): 102 | episode = Episode(gamma=0.99, lambd=0.95) 103 | 104 | reward_scale = 20 105 | 106 | episode.append( 107 | observation=0, 108 | action=1, 109 | reward=0, 110 | value=0, 111 | log_probability=-1, 112 | reward_scale=reward_scale, 113 | ) 114 | episode.append( 115 | observation=0, 116 | action=1, 117 | reward=1, 118 | value=0, 119 | log_probability=-1, 120 | reward_scale=reward_scale, 121 | ) 122 | episode.append( 123 | observation=0, 124 | action=1, 125 | reward=2, 126 | value=1, 127 | log_probability=-1, 128 | reward_scale=reward_scale, 129 | ) 130 | episode.append( 131 | observation=0, 132 | action=1, 133 | reward=3, 134 | value=2, 135 | log_probability=-1, 136 | reward_scale=reward_scale, 137 | ) 138 | episode.append( 139 | observation=0, 140 | action=1, 141 | reward=4, 142 | value=3, 143 | log_probability=-1, 144 | reward_scale=reward_scale, 145 | ) 146 | episode.append( 147 | observation=0, 148 | action=1, 149 | reward=5, 150 | value=5, 151 | log_probability=-1, 152 | reward_scale=reward_scale, 153 | ) 154 | episode.end_episode(last_value=5) 155 | 156 | expected_advantages = [ 157 | 4.694519008033593, 158 | 4.991514096792763, 159 | 4.201503558525, 160 | 3.3189830500000004, 161 | 2.3381000000000007, 162 | 0.20000000000000018, 163 | ] 164 | assert episode.advantages == expected_advantages 165 | 166 | 167 | def test_history_1(): 168 | episode1 = Episode(gamma=0.99, lambd=0.95) 169 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 170 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1) 171 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1) 172 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 173 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1) 174 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1) 175 | episode1.end_episode(last_value=5) 176 | 177 | episode2 = Episode(gamma=0.99, lambd=0.95) 178 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 179 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1) 180 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1) 181 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 182 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1) 183 | episode2.end_episode(last_value=0) 184 | 185 | history = History() 186 | 187 | history.add_episode(episode1) 188 | history.add_episode(episode2) 189 | 190 | history.build_dataset() 191 | 192 | assert len(history) == 11 193 | assert abs(np.mean(history.advantages)) <= 1e-10 194 | assert abs(np.std(history.advantages) - 1) <= 1e-3 195 | 196 | 197 | def test_history_2(): 198 | episode1 = Episode(gamma=0.99, lambd=0.95) 199 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 200 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1) 201 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1) 202 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 203 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1) 204 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1) 205 | episode1.end_episode(last_value=5) 
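    # A second, shorter episode with mixed negative rewards; after build_dataset()
    # the test calls free_memory() and expects every buffer to be empty again.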
206 | 207 | episode2 = Episode(gamma=0.99, lambd=0.95) 208 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 209 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1) 210 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1) 211 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 212 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1) 213 | episode2.end_episode(last_value=0) 214 | 215 | history = History() 216 | 217 | history.add_episode(episode1) 218 | history.add_episode(episode2) 219 | 220 | history.build_dataset() 221 | 222 | history.free_memory() 223 | 224 | assert len(history) == 0 225 | assert len(history.rewards) == 0 226 | assert len(history.advantages) == 0 227 | assert len(history.log_probabilities) == 0 228 | assert len(history.rewards_to_go) == 0 229 | assert len(history.episodes) == 0 230 | 231 | 232 | def test_history_episode(): 233 | reward_scale = 20 234 | 235 | env = gym.make("LunarLander-v2") 236 | observation = env.reset() 237 | 238 | n_actions = env.action_space.n 239 | feature_dim = observation.size 240 | 241 | max_episodes = 10 242 | max_timesteps = 100 243 | 244 | reward_sum = 0 245 | ite = 0 246 | 247 | history = History() 248 | 249 | for episode_i in range(max_episodes): 250 | 251 | observation = env.reset() 252 | episode = Episode() 253 | 254 | for timestep in range(max_timesteps): 255 | 256 | action = env.action_space.sample() 257 | 258 | new_observation, reward, done, info = env.step(action) 259 | 260 | episode.append( 261 | observation=observation, 262 | action=action, 263 | reward=reward, 264 | value=ite, 265 | log_probability=np.log(1 / n_actions), 266 | reward_scale=reward_scale, 267 | ) 268 | 269 | observation = new_observation 270 | 271 | reward_sum += reward 272 | ite += 1 273 | 274 | if done: 275 | episode.end_episode(last_value=np.random.uniform()) 276 | break 277 | 278 | if timestep == max_timesteps - 1: 279 | episode.end_episode(last_value=0) 280 | 281 | history.add_episode(episode) 282 | 283 | history.build_dataset() 284 | 285 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5 286 | 287 | assert len(history.rewards) == ite 288 | 289 | assert abs(np.mean(history.advantages)) <= 1e-10 290 | 291 | assert abs(np.std(history.advantages) - 1) <= 1e-3 292 | 293 | assert np.std(history.log_probabilities) <= 1e-3 294 | 295 | assert ( 296 | abs( 297 | sum([v for episode in history.episodes for v in episode.values]) 298 | - ite * (ite - 1) / 2 299 | ) 300 | <= 1e-3 301 | ) 302 | 303 | assert history.observations[-1].shape[0] == feature_dim 304 | 305 | assert ( 306 | abs( 307 | len([a for a in history.actions if a == 0]) 308 | - len(history.actions) / n_actions 309 | ) 310 | < 30 311 | ) 312 | --------------------------------------------------------------------------------