├── PPO ├── __init__.py ├── render.py ├── replay.py ├── ppo.py └── model.py ├── requirements.txt ├── setup.py ├── LICENSE ├── .gitignore ├── README.md └── tests └── PPO ├── test_model.py └── test_replay.py /PPO/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.50.2 2 | torch==1.6.0 3 | gym==0.17.3 4 | numpy==1.19.2 5 | Pillow==8.0.1 6 | imageio 7 | opencv-python 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import setup 4 | 5 | 6 | # Utility function to read the README file. 7 | # Used for the long_description. It's nice, because now 1) we have a top level 8 | # README file and 2) it's easier to type in the README file than to put a raw 9 | # string in below ... 10 | def read(fname): 11 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 12 | 13 | 14 | setup( 15 | name="rl-from-scratch", 16 | version="0.0.2", 17 | author="Youness MANSAR", 18 | author_email="mansaryounessecp@gmail.com", 19 | description="Proximal Policy Optimization (PPO) implemented from scratch with PyTorch", 20 | license="MIT", 21 | keywords="reinforcement-learning ppo pytorch", 22 | url="https://github.com/CVxTz/RL", 23 | packages=["PPO"], 24 | classifiers=[ 25 | "Development Status :: 3 - Alpha", 26 | "Topic :: Utilities", 27 | "License :: OSI Approved :: MIT License", 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Mansar Youness 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /PPO/render.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | from PIL import Image 4 | from tqdm import tqdm 5 | import random 6 | import imageio 7 | import cv2 8 | import numpy as np 9 | 10 | 11 | from PPO.model import ( 12 | PolicyNetwork, 13 | ValueNetwork, 14 | device, 15 | ) 16 | 17 | 18 | def write_on_image(img, reward): 19 | 20 | cv2.putText( 21 | img, 22 | f"Sum Reward: {int(reward)}", 23 | (0, img.shape[0] - 20), 24 | cv2.FONT_HERSHEY_SIMPLEX, 25 | 1, 26 | (0, 0, 0), 27 | 2, 28 | ) 29 | 30 | 31 | if __name__ == "__main__": 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser() 35 | 36 | parser.add_argument("--policy_path") 37 | parser.add_argument("--env_name") 38 | parser.add_argument("--n_episodes", type=int, default=5) 39 | parser.add_argument("--max_timesteps", type=int, default=400) 40 | 41 | parser.add_argument("--out_gif") 42 | 43 | state_scale = 1.0 44 | 45 | args = parser.parse_args() 46 | 47 | policy_path = args.policy_path 48 | env_name = args.env_name 49 | 50 | n_episodes = args.n_episodes 51 | max_timesteps = args.max_timesteps 52 | 53 | out_gif = args.out_gif 54 | 55 | env = gym.make(env_name) 56 | observation = env.reset() 57 | n_actions = env.action_space.n 58 | feature_dim = observation.size 59 | 60 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) 61 | 62 | policy_model.load_state_dict(torch.load(policy_path)) 63 | 64 | frames = [] 65 | sum_reward = 0 66 | for _ in tqdm(range(n_episodes)): 67 | observation = env.reset() 68 | 69 | for timestep in range(max_timesteps): 70 | 71 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array"))) 72 | 73 | write_on_image(frames[-1], sum_reward) 74 | 75 | action = policy_model.best_action(observation / state_scale) 76 | 77 | new_observation, reward, done, info = env.step(action) 78 | sum_reward += reward 79 | 80 | if done: 81 | for a in range(10): 82 | frames.append(np.ascontiguousarray(env.render(mode="rgb_array"))) 83 | write_on_image(frames[-1], sum_reward) 84 | break 85 | 86 | observation = new_observation 87 | 88 | imageio.mimsave(out_gif, frames, fps=60) 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /PPO/replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from torch.utils.data import Dataset 4 | 5 | 6 | def cumulative_sum(array, gamma=1.0): 7 | curr = 0 8 | cumulative_array = [] 9 | 10 | for a in array[::-1]: 11 | curr = a + gamma * curr 12 | cumulative_array.append(curr) 13 | 14 | return cumulative_array[::-1] 15 | 16 | 17 | class Episode: 18 | def __init__(self, gamma=0.99, lambd=0.95): 19 | self.observations = [] 20 | self.actions = [] 21 | self.advantages = [] 22 | self.rewards = [] 23 | self.rewards_to_go = [] 24 | self.values = [] 25 | self.log_probabilities = [] 26 | self.gamma = gamma 27 | self.lambd = lambd 28 | 29 | def append( 30 | self, observation, action, reward, value, log_probability, reward_scale=20 31 | ): 32 | self.observations.append(observation) 33 | self.actions.append(action) 34 | self.rewards.append(reward / reward_scale) 35 | self.values.append(value) 36 | self.log_probabilities.append(log_probability) 37 | 38 | def end_episode(self, last_value): 39 | rewards = np.array(self.rewards + [last_value]) 40 | values = np.array(self.values + [last_value]) 41 | 42 | deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1] 43 | 44 | self.advantages = cumulative_sum(deltas.tolist(), gamma=self.gamma * self.lambd) 45 | 46 | self.rewards_to_go = cumulative_sum(rewards.tolist(), gamma=self.gamma)[:-1] 47 | 48 | 49 | def normalize_list(array): 50 | array = np.array(array) 51 | array = (array - np.mean(array)) / (np.std(array) + 1e-5) 52 | return array.tolist() 53 | 54 | 55 | class History(Dataset): 56 | def __init__(self): 57 | 
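        # Flat per-step buffers; build_dataset() fills them by concatenating the
        # lists stored in each Episode so that History can be indexed like a
        # regular torch Dataset by the DataLoader.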
self.episodes = [] 58 | self.observations = [] 59 | self.actions = [] 60 | self.advantages = [] 61 | self.rewards = [] 62 | self.rewards_to_go = [] 63 | self.log_probabilities = [] 64 | 65 | def free_memory(self): 66 | del self.episodes[:] 67 | del self.observations[:] 68 | del self.actions[:] 69 | del self.advantages[:] 70 | del self.rewards[:] 71 | del self.rewards_to_go[:] 72 | del self.log_probabilities[:] 73 | 74 | def add_episode(self, episode): 75 | self.episodes.append(episode) 76 | 77 | def build_dataset(self): 78 | for episode in self.episodes: 79 | self.observations += episode.observations 80 | self.actions += episode.actions 81 | self.advantages += episode.advantages 82 | self.rewards += episode.rewards 83 | self.rewards_to_go += episode.rewards_to_go 84 | self.log_probabilities += episode.log_probabilities 85 | 86 | assert ( 87 | len( 88 | { 89 | len(self.observations), 90 | len(self.actions), 91 | len(self.advantages), 92 | len(self.rewards), 93 | len(self.rewards_to_go), 94 | len(self.log_probabilities), 95 | } 96 | ) 97 | == 1 98 | ) 99 | 100 | self.advantages = normalize_list(self.advantages) 101 | 102 | def __len__(self): 103 | return len(self.observations) 104 | 105 | def __getitem__(self, idx): 106 | return ( 107 | self.observations[idx], 108 | self.actions[idx], 109 | self.advantages[idx], 110 | self.log_probabilities[idx], 111 | self.rewards_to_go[idx], 112 | ) 113 | -------------------------------------------------------------------------------- /PPO/ppo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.optim as optim 7 | from torch.utils.data import DataLoader 8 | from torch.utils.tensorboard import SummaryWriter 9 | from tqdm import tqdm 10 | 11 | from PPO.model import ( 12 | PolicyNetwork, 13 | ValueNetwork, 14 | device, 15 | train_value_network, 16 | train_policy_network, 17 | ) 18 | from PPO.replay import Episode, History 19 | 20 | 21 | def main( 22 | env_name="LunarLander-v2", 23 | reward_scale=20.0, 24 | clip=0.2, 25 | log_dir="../logs", 26 | learning_rate=0.001, 27 | state_scale=1.0, 28 | ): 29 | writer = SummaryWriter(log_dir=log_dir, filename_suffix=env_name, comment=env_name) 30 | 31 | env = gym.make(env_name) 32 | observation = env.reset() 33 | 34 | n_actions = env.action_space.n 35 | feature_dim = observation.size 36 | 37 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 38 | value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate) 39 | 40 | policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device) 41 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate) 42 | 43 | n_epoch = 4 44 | 45 | max_episodes = 20 46 | max_timesteps = 400 47 | 48 | batch_size = 32 49 | 50 | max_iterations = 200 51 | 52 | history = History() 53 | 54 | epoch_ite = 0 55 | episode_ite = 0 56 | 57 | for ite in tqdm(range(max_iterations)): 58 | 59 | if ite % 50 == 0: 60 | torch.save( 61 | policy_model.state_dict(), 62 | Path(log_dir) / (env_name + f"_{str(ite)}_policy.pth"), 63 | ) 64 | torch.save( 65 | value_model.state_dict(), 66 | Path(log_dir) / (env_name + f"_{str(ite)}_value.pth"), 67 | ) 68 | 69 | for episode_i in range(max_episodes): 70 | 71 | observation = env.reset() 72 | episode = Episode() 73 | 74 | for timestep in range(max_timesteps): 75 | 76 | action, log_probability = policy_model.sample_action( 77 | observation / state_scale 78 | ) 79 | value = 
value_model.state_value(observation / state_scale) 80 | 81 | new_observation, reward, done, info = env.step(action) 82 | 83 | episode.append( 84 | observation=observation / state_scale, 85 | action=action, 86 | reward=reward, 87 | value=value, 88 | log_probability=log_probability, 89 | reward_scale=reward_scale, 90 | ) 91 | 92 | observation = new_observation 93 | 94 | if done: 95 | episode.end_episode(last_value=0) 96 | break 97 | 98 | if timestep == max_timesteps - 1: 99 | value = value_model.state_value(observation / state_scale) 100 | episode.end_episode(last_value=value) 101 | 102 | episode_ite += 1 103 | writer.add_scalar( 104 | "Average Episode Reward", 105 | reward_scale * np.sum(episode.rewards), 106 | episode_ite, 107 | ) 108 | writer.add_scalar( 109 | "Average Probabilities", 110 | np.exp(np.mean(episode.log_probabilities)), 111 | episode_ite, 112 | ) 113 | 114 | history.add_episode(episode) 115 | 116 | history.build_dataset() 117 | data_loader = DataLoader(history, batch_size=batch_size, shuffle=True) 118 | 119 | policy_loss = train_policy_network( 120 | policy_model, policy_optimizer, data_loader, epochs=n_epoch, clip=clip 121 | ) 122 | 123 | value_loss = train_value_network( 124 | value_model, value_optimizer, data_loader, epochs=n_epoch 125 | ) 126 | 127 | for p_l, v_l in zip(policy_loss, value_loss): 128 | epoch_ite += 1 129 | writer.add_scalar("Policy Loss", p_l, epoch_ite) 130 | writer.add_scalar("Value Loss", v_l, epoch_ite) 131 | 132 | history.free_memory() 133 | 134 | 135 | if __name__ == "__main__": 136 | 137 | main( 138 | reward_scale=20.0, 139 | clip=0.2, 140 | env_name="LunarLander-v2", 141 | learning_rate=0.001, 142 | state_scale=1.0, 143 | log_dir="logs/Lunar" 144 | ) 145 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RL 2 | 3 | ### PPO 4 | How to run: 5 | ``` 6 | python PPO/ppo.py 7 | ``` 8 | Test Policy: 9 | ``` 10 | python PPO/render.py --policy_path logs/Lunar/LunarLander-v2_100_policy.pth \ 11 | --env_name LunarLander-v2 --out_gif logs/lunar_late.gif 12 | 13 | ``` 14 | 15 | ### Learning to Play CartPole and LunarLander with Proximal Policy Optimization 16 | 17 | #### Implementing PPO from scratch with Pytorch 18 | 19 | ![](https://cdn-images-1.medium.com/max/800/1*WiKzN5tiKqettn8yeLj-MQ.gif) 20 | 21 | In this post, we will train an RL agent to play two control based games: 22 | 23 | * [https://gym.openai.com/envs/LunarLander-v2/](https://gym.openai.com/envs/LunarLander-v2/) 24 | * [https://gym.openai.com/envs/CartPole-v1/](https://gym.openai.com/envs/CartPole-v1/) 25 | 26 | Our agent will be trained using an algorithm called Proximal Policy 27 | Optimization. We will implement this approach from scratch using PyTorch and 28 | OpenAi gym. 29 | 30 | This project is based on the following paper: 31 | 32 | * [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 33 | 34 | #### Gym: 35 | 36 | The basic idea behind OpenAI Gym is that we define an environment env by 37 | calling: 38 | 39 | env = gym.make(env_name) 40 | 41 | Then at each time step **t**, we pick an action **a** and we get a new 42 | state_(t+1) and a reward **reward_t**. The objective is to train an Agent that 43 | learns a policy PI that can predict for each state the best action that will 44 | maximize the sum of the future rewards. 
For example, in the environment 45 | LunarLander, we get the maximum reward if we land the rocket smoothly on top of 46 | the landing area. In the environment CartPole, the objective is to keep the pole 47 | vertical for as long as possible. 48 | 49 | ![](https://cdn-images-1.medium.com/max/800/1*RxpfmLGwZR8kEVOlxbSjzQ.gif) 50 | 51 | #### PPO: 52 | 53 | Our final objective is to learn a policy network that will take the state as 54 | input and then output a probability distribution over the actions that will 55 | maximize the expected reward. 56 | 57 | Implementing PPO goes as follows: 58 | 59 | * First, we start with a policy PI_old 60 | * We sample some trajectories from PI_old 61 | * For each action **a** in each trajectory we compute the Advantage, a 62 | measure of how much better the action **a** is compared to the other possible 63 | actions at state_t. 64 | * For a few epochs, we maximize the following objective with gradient ascent: 65 | 66 | ![](https://cdn-images-1.medium.com/max/800/1*cnjyaLHg0QynODhIXkYYSg.png) 67 | 68 | From [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 69 | 70 | This loss **increases the probability** of action a_t at 71 | state s_t if it has a **positive advantage** and **decreases the probability** 72 | in the case of a **negative advantage**. However, in practice this ratio of 73 | probabilities tends to diverge to infinity, making the training unstable. The 74 | authors propose a clipped version of the loss to solve this issue: 75 | 76 | ![](https://cdn-images-1.medium.com/max/800/1*ocve-gRQDzkXVov-yTtZuA.png) 77 | 78 | [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) 79 | 80 | We also add two additional terms to the loss: a mean squared error over the 81 | value function and an entropy bonus that encourages exploration during the 82 | sampling of the trajectories. 83 | 84 | We can plot the sum of the rewards for each episode as the training 85 | progresses: 86 | 87 | ![](https://cdn-images-1.medium.com/max/1200/1*MIF6DYmlJT9yLQFDafmOJA.png) 88 | 89 | Training reward for LunarLander 90 | 91 | #### Trained Agents: 92 | 93 | Now we get to see the trained policy network in action. 94 | 95 | * CartPole 96 | 97 | ![](https://cdn-images-1.medium.com/max/800/1*_ddmwllJuY-9Zvh8x6PCbQ.gif) 98 | 99 | * LunarLander 100 | 101 | ![](https://cdn-images-1.medium.com/max/800/1*tKbe-gnp6VujnrQ2YwEZ-g.gif) 102 | 103 | Perfect Landing! 104 | 105 | #### Other resources: 106 | 107 | If you are interested in learning more about PPO or policy gradient 108 | reinforcement learning methods, I recommend following this course: 109 | [https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc](https://www.youtube.com/playlist?list=PL_iWQOsE6TfURIIhCrlt-wj9ByIVpbfGc) 110 | by Sergey Levine at Berkeley. The course is very long and math-heavy but the 111 | instructor is really good.
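#### The clipped objective in code:

The clipped objective above maps almost line for line onto the `ac_loss` helper in `PPO/model.py`. The snippet below is a simplified sketch of that function (renamed here for readability); the batch mean and the entropy bonus with coefficient `c1 = 0.01` are applied in `train_policy_network`:

```
import torch


def clipped_surrogate_loss(new_log_probs, old_log_probs, advantages, epsilon_clip=0.2):
    # Probability ratio pi_new(a_t | s_t) / pi_old(a_t | s_t), computed from log-probabilities.
    ratios = torch.exp(new_log_probs - old_log_probs)
    clipped_ratios = torch.clamp(ratios, 1 - epsilon_clip, 1 + epsilon_clip)
    # Element-wise minimum of the two surrogates, negated so it can be minimized.
    return -torch.min(ratios * advantages, clipped_ratios * advantages)
```

Minimizing this loss for a few epochs on each sampled batch is what keeps every policy update close to the policy that generated the data.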
112 | 113 | #### Code : 114 | 115 | [https://github.com/CVxTz/RL](https://github.com/CVxTz/RL) 116 | -------------------------------------------------------------------------------- /PPO/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.distributions import Categorical 5 | 6 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 7 | 8 | 9 | class PolicyNetwork(torch.nn.Module): 10 | def __init__(self, n=4, in_dim=128): 11 | super(PolicyNetwork, self).__init__() 12 | 13 | self.fc1 = torch.nn.Linear(in_dim, 128) 14 | self.fc2 = torch.nn.Linear(128, 128) 15 | self.fc3 = torch.nn.Linear(128, 128) 16 | 17 | self.fc4 = torch.nn.Linear(128, n) 18 | 19 | self.l_relu = torch.nn.LeakyReLU(0.1) 20 | 21 | def forward(self, x): 22 | 23 | x = self.l_relu(self.fc1(x)) 24 | x = self.l_relu(self.fc2(x)) 25 | x = self.l_relu(self.fc3(x)) 26 | 27 | y = self.fc4(x) 28 | 29 | y = F.softmax(y, dim=-1) 30 | 31 | return y 32 | 33 | def sample_action(self, state): 34 | 35 | if not state is torch.Tensor: 36 | state = torch.from_numpy(state).float().to(device) 37 | 38 | if len(state.size()) == 1: 39 | state = state.unsqueeze(0) 40 | 41 | y = self(state) 42 | 43 | dist = Categorical(y) 44 | 45 | action = dist.sample() 46 | 47 | log_probability = dist.log_prob(action) 48 | 49 | return action.item(), log_probability.item() 50 | 51 | def best_action(self, state): 52 | 53 | if not state is torch.Tensor: 54 | state = torch.from_numpy(state).float().to(device) 55 | 56 | if len(state.size()) == 1: 57 | state = state.unsqueeze(0) 58 | 59 | y = self(state).squeeze() 60 | 61 | action = torch.argmax(y) 62 | 63 | return action.item() 64 | 65 | def evaluate_actions(self, states, actions): 66 | y = self(states) 67 | 68 | dist = Categorical(y) 69 | 70 | entropy = dist.entropy() 71 | 72 | log_probabilities = dist.log_prob(actions) 73 | 74 | return log_probabilities, entropy 75 | 76 | 77 | class ValueNetwork(torch.nn.Module): 78 | def __init__(self, in_dim=128): 79 | super(ValueNetwork, self).__init__() 80 | 81 | self.fc1 = torch.nn.Linear(in_dim, 128) 82 | self.fc2 = torch.nn.Linear(128, 128) 83 | self.fc3 = torch.nn.Linear(128, 128) 84 | 85 | self.fc4 = torch.nn.Linear(128, 1) 86 | 87 | self.l_relu = torch.nn.LeakyReLU(0.1) 88 | 89 | def forward(self, x): 90 | 91 | x = self.l_relu(self.fc1(x)) 92 | x = self.l_relu(self.fc2(x)) 93 | x = self.l_relu(self.fc3(x)) 94 | 95 | y = self.fc4(x) 96 | 97 | return y.squeeze(1) 98 | 99 | def state_value(self, state): 100 | 101 | if not state is torch.Tensor: 102 | state = torch.from_numpy(state).float().to(device) 103 | 104 | if len(state.size()) == 1: 105 | state = state.unsqueeze(0) 106 | 107 | y = self(state) 108 | 109 | return y.item() 110 | 111 | 112 | def train_value_network(value_model, value_optimizer, data_loader, epochs=4): 113 | epochs_losses = [] 114 | 115 | for i in range(epochs): 116 | 117 | losses = [] 118 | 119 | for observations, _, _, _, rewards_to_go in data_loader: 120 | observations = observations.float().to(device) 121 | rewards_to_go = rewards_to_go.float().to(device) 122 | 123 | value_optimizer.zero_grad() 124 | 125 | values = value_model(observations) 126 | 127 | loss = F.mse_loss(values, rewards_to_go) 128 | 129 | loss.backward() 130 | 131 | value_optimizer.step() 132 | 133 | losses.append(loss.item()) 134 | 135 | mean_loss = np.mean(losses) 136 | 137 | epochs_losses.append(mean_loss) 138 | 139 | return epochs_losses 140 | 
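# Clipped PPO surrogate objective (Schulman et al., 2017): the probability ratio
# exp(new_log_p - old_log_p) is clamped to [1 - epsilon_clip, 1 + epsilon_clip], the
# element-wise minimum of the two surrogates is kept, and the sign is flipped so the
# objective can be minimized with gradient descent. The entropy bonus is added by the
# caller (train_policy_network).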
141 | 142 | def ac_loss(new_log_probabilities, old_log_probabilities, advantages, epsilon_clip=0.2): 143 | probability_ratios = torch.exp(new_log_probabilities - old_log_probabilities) 144 | clipped_probabiliy_ratios = torch.clamp( 145 | probability_ratios, 1 - epsilon_clip, 1 + epsilon_clip 146 | ) 147 | 148 | surrogate_1 = probability_ratios * advantages 149 | surrogate_2 = clipped_probabiliy_ratios * advantages 150 | 151 | return -torch.min(surrogate_1, surrogate_2) 152 | 153 | 154 | def train_policy_network( 155 | policy_model, policy_optimizer, data_loader, epochs=4, clip=0.2 156 | ): 157 | epochs_losses = [] 158 | 159 | c1 = 0.01 160 | 161 | for i in range(epochs): 162 | 163 | losses = [] 164 | 165 | for observations, actions, advantages, log_probabilities, _ in data_loader: 166 | observations = observations.float().to(device) 167 | actions = actions.long().to(device) 168 | advantages = advantages.float().to(device) 169 | old_log_probabilities = log_probabilities.float().to(device) 170 | 171 | policy_optimizer.zero_grad() 172 | 173 | new_log_probabilities, entropy = policy_model.evaluate_actions( 174 | observations, actions 175 | ) 176 | 177 | loss = ( 178 | ac_loss( 179 | new_log_probabilities, 180 | old_log_probabilities, 181 | advantages, 182 | epsilon_clip=clip, 183 | ).mean() 184 | - c1 * entropy.mean() 185 | ) 186 | 187 | loss.backward() 188 | 189 | policy_optimizer.step() 190 | 191 | losses.append(loss.item()) 192 | 193 | mean_loss = np.mean(losses) 194 | 195 | epochs_losses.append(mean_loss) 196 | 197 | return epochs_losses 198 | -------------------------------------------------------------------------------- /tests/PPO/test_model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import torch 4 | import torch.optim as optim 5 | from torch.utils.data import DataLoader 6 | 7 | from PPO.model import ( 8 | PolicyNetwork, 9 | ValueNetwork, 10 | device, 11 | train_value_network, 12 | train_policy_network, 13 | ) 14 | from PPO.replay import Episode, History 15 | 16 | 17 | def test_model_1(): 18 | env = gym.make("LunarLander-v2") 19 | 20 | observation = env.reset() 21 | 22 | n_actions = env.action_space.n 23 | feature_dim = observation.size 24 | 25 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 26 | 27 | policy_model.to(device) 28 | 29 | action, log_probability = policy_model.sample_action(observation) 30 | 31 | assert action in list(range(n_actions)) 32 | 33 | 34 | def test_model_2(): 35 | env = gym.make("LunarLander-v2") 36 | 37 | observation = env.reset() 38 | 39 | n_actions = env.action_space.n 40 | feature_dim = observation.size 41 | 42 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 43 | policy_model.to(device) 44 | 45 | observations = [observation / i for i in range(1, 11)] 46 | 47 | observations = torch.from_numpy(np.array(observations)).to(device) 48 | 49 | probs = policy_model(observations) 50 | 51 | assert list(probs.size()) == [10, n_actions] 52 | 53 | assert abs(probs[0, :].sum().item() - 1) < 1e-3 54 | 55 | 56 | def test_model_3(): 57 | env = gym.make("LunarLander-v2") 58 | 59 | observation = env.reset() 60 | 61 | n_actions = env.action_space.n 62 | feature_dim = observation.size 63 | 64 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim) 65 | policy_model.to(device) 66 | 67 | observations = [observation / i for i in range(1, 11)] 68 | 69 | actions = [i % 4 for i in range(1, 11)] 70 | 71 | observations = 
torch.from_numpy(np.array(observations)).to(device) 72 | actions = torch.IntTensor(actions).to(device) 73 | 74 | log_probabilities, entropy = policy_model.evaluate_actions(observations, actions) 75 | 76 | assert list(log_probabilities.size()) == [10] 77 | assert list(entropy.size()) == [10] 78 | 79 | 80 | def test_history_episode_model(): 81 | reward_scale = 20 82 | 83 | env = gym.make("LunarLander-v2") 84 | observation = env.reset() 85 | 86 | n_actions = env.action_space.n 87 | feature_dim = observation.size 88 | 89 | policy_model = PolicyNetwork(n=n_actions, in_dim=feature_dim).to(device) 90 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 91 | 92 | max_episodes = 10 93 | max_timesteps = 100 94 | 95 | reward_sum = 0 96 | ite = 0 97 | 98 | history = History() 99 | 100 | for episode_i in range(max_episodes): 101 | 102 | observation = env.reset() 103 | episode = Episode() 104 | 105 | for timestep in range(max_timesteps): 106 | 107 | action, log_probability = policy_model.sample_action(observation) 108 | value = value_model.state_value(observation) 109 | 110 | new_observation, reward, done, info = env.step(action) 111 | 112 | episode.append( 113 | observation=observation, 114 | action=action, 115 | reward=reward, 116 | value=value, 117 | log_probability=log_probability, 118 | reward_scale=reward_scale, 119 | ) 120 | 121 | observation = new_observation 122 | 123 | reward_sum += reward 124 | ite += 1 125 | 126 | if done: 127 | episode.end_episode(last_value=np.random.uniform()) 128 | break 129 | 130 | if timestep == max_timesteps - 1: 131 | episode.end_episode(last_value=0) 132 | 133 | history.add_episode(episode) 134 | 135 | history.build_dataset() 136 | 137 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5 138 | 139 | assert len(history.rewards) == ite 140 | 141 | assert abs(np.mean(history.advantages)) <= 1e-10 142 | 143 | assert abs(np.std(history.advantages) - 1) <= 1e-3 144 | 145 | 146 | def test_value_network(): 147 | env = gym.make("LunarLander-v2") 148 | observation = env.reset() 149 | 150 | n_actions = env.action_space.n 151 | feature_dim = observation.size 152 | 153 | n_epoch = 4 154 | 155 | max_episodes = 10 156 | max_timesteps = 100 157 | 158 | reward_sum = 0 159 | ite = 0 160 | 161 | history = History() 162 | 163 | for episode_i in range(max_episodes): 164 | 165 | observation = env.reset() 166 | episode = Episode() 167 | 168 | for timestep in range(max_timesteps): 169 | 170 | action = env.action_space.sample() 171 | 172 | new_observation, reward, done, info = env.step(action) 173 | 174 | episode.append( 175 | observation=observation, 176 | action=action, 177 | reward=reward, 178 | value=ite, 179 | log_probability=np.log(1 / n_actions), 180 | ) 181 | 182 | observation = new_observation 183 | 184 | reward_sum += reward 185 | ite += 1 186 | 187 | if done: 188 | episode.end_episode(last_value=np.random.uniform()) 189 | break 190 | 191 | if timestep == max_timesteps - 1: 192 | episode.end_episode(last_value=0) 193 | 194 | history.add_episode(episode) 195 | 196 | history.build_dataset() 197 | 198 | value_model = ValueNetwork(in_dim=feature_dim).to(device) 199 | value_optimizer = optim.Adam(value_model.parameters(), lr=0.001) 200 | 201 | data_loader = DataLoader(history, batch_size=64, shuffle=True) 202 | 203 | epochs_losses = train_value_network( 204 | value_model, value_optimizer, data_loader, epochs=n_epoch 205 | ) 206 | 207 | assert epochs_losses[0] > epochs_losses[-1] 208 | 209 | 210 | def test_policy_network(): 211 | env = 
gym.make("LunarLander-v2") 212 | observation = env.reset() 213 | 214 | n_actions = env.action_space.n 215 | feature_dim = observation.size 216 | 217 | n_epoch = 4 218 | 219 | max_episodes = 10 220 | max_timesteps = 100 221 | 222 | reward_sum = 0 223 | ite = 0 224 | 225 | history = History() 226 | 227 | for episode_i in range(max_episodes): 228 | 229 | observation = env.reset() 230 | episode = Episode() 231 | 232 | for timestep in range(max_timesteps): 233 | 234 | action = env.action_space.sample() 235 | 236 | new_observation, reward, done, info = env.step(action) 237 | 238 | episode.append( 239 | observation=observation, 240 | action=action, 241 | reward=reward, 242 | value=ite, 243 | log_probability=np.log(1 / n_actions), 244 | ) 245 | 246 | observation = new_observation 247 | 248 | reward_sum += reward 249 | ite += 1 250 | 251 | if done: 252 | episode.end_episode(last_value=np.random.uniform()) 253 | break 254 | 255 | if timestep == max_timesteps - 1: 256 | episode.end_episode(last_value=0) 257 | 258 | history.add_episode(episode) 259 | 260 | history.build_dataset() 261 | 262 | policy_model = PolicyNetwork(in_dim=feature_dim).to(device) 263 | policy_optimizer = optim.Adam(policy_model.parameters(), lr=0.01) 264 | 265 | data_loader = DataLoader(history, batch_size=64, shuffle=True) 266 | 267 | epochs_losses = train_policy_network( 268 | policy_model, policy_optimizer, data_loader, epochs=n_epoch 269 | ) 270 | 271 | assert epochs_losses[0] > epochs_losses[-1] 272 | -------------------------------------------------------------------------------- /tests/PPO/test_replay.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from PPO.replay import cumulative_sum, Episode, History 5 | 6 | 7 | def test_cumulative_sum_1(): 8 | array = [0, 1, 2, 3, 4, 5] 9 | 10 | cumulative_array = cumulative_sum(array) 11 | 12 | expected_cumulative_array = [15, 15, 14, 12, 9, 5] 13 | 14 | assert cumulative_array == expected_cumulative_array 15 | 16 | 17 | def test_cumulative_sum_2(): 18 | array = [0, 1, 2, 3, 4, 5] 19 | 20 | cumulative_array = cumulative_sum(array, gamma=0.99) 21 | 22 | expected_cumulative_array = [ 23 | 14.458431289499998, 24 | 14.604476049999999, 25 | 13.741895, 26 | 11.8605, 27 | 8.95, 28 | 5.0, 29 | ] 30 | 31 | assert cumulative_array == expected_cumulative_array 32 | 33 | 34 | def test_episode_1(): 35 | episode = Episode(gamma=0.99, lambd=0.95) 36 | 37 | reward_scale = 20 38 | 39 | episode.append( 40 | observation=0, 41 | action=1, 42 | reward=0, 43 | value=0, 44 | log_probability=-1, 45 | reward_scale=reward_scale, 46 | ) 47 | episode.append( 48 | observation=0, 49 | action=1, 50 | reward=1, 51 | value=0, 52 | log_probability=-1, 53 | reward_scale=reward_scale, 54 | ) 55 | episode.append( 56 | observation=0, 57 | action=1, 58 | reward=2, 59 | value=0, 60 | log_probability=-1, 61 | reward_scale=reward_scale, 62 | ) 63 | episode.append( 64 | observation=0, 65 | action=1, 66 | reward=3, 67 | value=0, 68 | log_probability=-1, 69 | reward_scale=reward_scale, 70 | ) 71 | episode.append( 72 | observation=0, 73 | action=1, 74 | reward=4, 75 | value=0, 76 | log_probability=-1, 77 | reward_scale=reward_scale, 78 | ) 79 | episode.append( 80 | observation=0, 81 | action=1, 82 | reward=5, 83 | value=0, 84 | log_probability=-1, 85 | reward_scale=reward_scale, 86 | ) 87 | episode.end_episode(last_value=0) 88 | 89 | expected_rewards_to_go = [ 90 | 0.722921564475, 91 | 0.7302238025, 92 | 0.68709475, 93 | 0.593025, 94 | 0.4475, 95 | 
0.25, 96 | ] 97 | 98 | assert episode.rewards_to_go == expected_rewards_to_go 99 | 100 | 101 | def test_episode_2(): 102 | episode = Episode(gamma=0.99, lambd=0.95) 103 | 104 | reward_scale = 20 105 | 106 | episode.append( 107 | observation=0, 108 | action=1, 109 | reward=0, 110 | value=0, 111 | log_probability=-1, 112 | reward_scale=reward_scale, 113 | ) 114 | episode.append( 115 | observation=0, 116 | action=1, 117 | reward=1, 118 | value=0, 119 | log_probability=-1, 120 | reward_scale=reward_scale, 121 | ) 122 | episode.append( 123 | observation=0, 124 | action=1, 125 | reward=2, 126 | value=1, 127 | log_probability=-1, 128 | reward_scale=reward_scale, 129 | ) 130 | episode.append( 131 | observation=0, 132 | action=1, 133 | reward=3, 134 | value=2, 135 | log_probability=-1, 136 | reward_scale=reward_scale, 137 | ) 138 | episode.append( 139 | observation=0, 140 | action=1, 141 | reward=4, 142 | value=3, 143 | log_probability=-1, 144 | reward_scale=reward_scale, 145 | ) 146 | episode.append( 147 | observation=0, 148 | action=1, 149 | reward=5, 150 | value=5, 151 | log_probability=-1, 152 | reward_scale=reward_scale, 153 | ) 154 | episode.end_episode(last_value=5) 155 | 156 | expected_advantages = [ 157 | 4.694519008033593, 158 | 4.991514096792763, 159 | 4.201503558525, 160 | 3.3189830500000004, 161 | 2.3381000000000007, 162 | 0.20000000000000018, 163 | ] 164 | assert episode.advantages == expected_advantages 165 | 166 | 167 | def test_history_1(): 168 | episode1 = Episode(gamma=0.99, lambd=0.95) 169 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 170 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1) 171 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1) 172 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 173 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1) 174 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1) 175 | episode1.end_episode(last_value=5) 176 | 177 | episode2 = Episode(gamma=0.99, lambd=0.95) 178 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 179 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1) 180 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1) 181 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 182 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1) 183 | episode2.end_episode(last_value=0) 184 | 185 | history = History() 186 | 187 | history.add_episode(episode1) 188 | history.add_episode(episode2) 189 | 190 | history.build_dataset() 191 | 192 | assert len(history) == 11 193 | assert abs(np.mean(history.advantages)) <= 1e-10 194 | assert abs(np.std(history.advantages) - 1) <= 1e-3 195 | 196 | 197 | def test_history_2(): 198 | episode1 = Episode(gamma=0.99, lambd=0.95) 199 | episode1.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 200 | episode1.append(observation=0, action=1, reward=1, value=0, log_probability=-1) 201 | episode1.append(observation=0, action=1, reward=2, value=1, log_probability=-1) 202 | episode1.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 203 | episode1.append(observation=0, action=1, reward=4, value=3, log_probability=-1) 204 | episode1.append(observation=0, action=1, reward=5, value=5, log_probability=-1) 205 | episode1.end_episode(last_value=5) 
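    # A second, shorter episode with mixed negative rewards; after build_dataset()
    # the test calls free_memory() and expects every buffer to be empty again.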
206 | 207 | episode2 = Episode(gamma=0.99, lambd=0.95) 208 | episode2.append(observation=0, action=1, reward=0, value=0, log_probability=-1) 209 | episode2.append(observation=0, action=1, reward=-1, value=0, log_probability=-1) 210 | episode2.append(observation=0, action=1, reward=-2, value=-1, log_probability=-1) 211 | episode2.append(observation=0, action=1, reward=3, value=2, log_probability=-1) 212 | episode2.append(observation=0, action=1, reward=-4, value=-3, log_probability=-1) 213 | episode2.end_episode(last_value=0) 214 | 215 | history = History() 216 | 217 | history.add_episode(episode1) 218 | history.add_episode(episode2) 219 | 220 | history.build_dataset() 221 | 222 | history.free_memory() 223 | 224 | assert len(history) == 0 225 | assert len(history.rewards) == 0 226 | assert len(history.advantages) == 0 227 | assert len(history.log_probabilities) == 0 228 | assert len(history.rewards_to_go) == 0 229 | assert len(history.episodes) == 0 230 | 231 | 232 | def test_history_episode(): 233 | reward_scale = 20 234 | 235 | env = gym.make("LunarLander-v2") 236 | observation = env.reset() 237 | 238 | n_actions = env.action_space.n 239 | feature_dim = observation.size 240 | 241 | max_episodes = 10 242 | max_timesteps = 100 243 | 244 | reward_sum = 0 245 | ite = 0 246 | 247 | history = History() 248 | 249 | for episode_i in range(max_episodes): 250 | 251 | observation = env.reset() 252 | episode = Episode() 253 | 254 | for timestep in range(max_timesteps): 255 | 256 | action = env.action_space.sample() 257 | 258 | new_observation, reward, done, info = env.step(action) 259 | 260 | episode.append( 261 | observation=observation, 262 | action=action, 263 | reward=reward, 264 | value=ite, 265 | log_probability=np.log(1 / n_actions), 266 | reward_scale=reward_scale, 267 | ) 268 | 269 | observation = new_observation 270 | 271 | reward_sum += reward 272 | ite += 1 273 | 274 | if done: 275 | episode.end_episode(last_value=np.random.uniform()) 276 | break 277 | 278 | if timestep == max_timesteps - 1: 279 | episode.end_episode(last_value=0) 280 | 281 | history.add_episode(episode) 282 | 283 | history.build_dataset() 284 | 285 | assert abs(np.sum(history.rewards) - reward_sum / reward_scale) < 1e-5 286 | 287 | assert len(history.rewards) == ite 288 | 289 | assert abs(np.mean(history.advantages)) <= 1e-10 290 | 291 | assert abs(np.std(history.advantages) - 1) <= 1e-3 292 | 293 | assert np.std(history.log_probabilities) <= 1e-3 294 | 295 | assert ( 296 | abs( 297 | sum([v for episode in history.episodes for v in episode.values]) 298 | - ite * (ite - 1) / 2 299 | ) 300 | <= 1e-3 301 | ) 302 | 303 | assert history.observations[-1].shape[0] == feature_dim 304 | 305 | assert ( 306 | abs( 307 | len([a for a in history.actions if a == 0]) 308 | - len(history.actions) / n_actions 309 | ) 310 | < 30 311 | ) 312 | --------------------------------------------------------------------------------