├── .gitattributes ├── .gitignore ├── OpenAI ├── BipedalWalker-v2 │ ├── BipedalWalker-v2.html │ ├── BipedalWalker-v2.ipynb │ ├── Model.py │ ├── README.md │ ├── ReplayBuffer.py │ └── preTrained │ │ ├── TD3_BipedalWalker-v2_0_solved_actor.pth │ │ ├── TD3_BipedalWalker-v2_0_solved_actor_target.pth │ │ ├── TD3_BipedalWalker-v2_0_solved_critic_1_target.pth │ │ ├── TD3_BipedalWalker-v2_0_solved_critic_2_target.pth │ │ ├── TD3_BipedalWalker-v2_0_solved_crtic_1.pth │ │ └── TD3_BipedalWalker-v2_0_solved_crtic_2.pth ├── BipedalWalker-v3 │ ├── Agent.py │ ├── BipedalWalker-v3.html │ ├── BipedalWalker-v3.ipynb │ ├── README.md │ ├── ReplayBuffer.py │ └── preTrained │ │ ├── TD3_BipedalWalker-v3_0_solved_actor.pth │ │ ├── TD3_BipedalWalker-v3_0_solved_actor_target.pth │ │ ├── TD3_BipedalWalker-v3_0_solved_critic_1_target.pth │ │ ├── TD3_BipedalWalker-v3_0_solved_critic_2_target.pth │ │ ├── TD3_BipedalWalker-v3_0_solved_crtic_1.pth │ │ └── TD3_BipedalWalker-v3_0_solved_crtic_2.pth ├── CartPole-v0 │ ├── .gitignore │ ├── CartPole-v0.ipynb │ ├── README.md │ ├── agents │ │ ├── DDQN.py │ │ └── __init__.py │ ├── assets │ │ ├── cartpole-v0.jpg │ │ └── game_reward.png │ ├── memory.py │ ├── model.h5 │ └── requirements.txt ├── HumanoidPyBulletEnv-v0 │ ├── HumanoidPyBulletEnv-v0.ipynb │ ├── README.md │ ├── multiprocessing_env.py │ └── pretrained │ │ ├── actor_HumanoidPyBulletEnv-v0_checkpoint.pt │ │ ├── actor_HumanoidPyBulletEnv-v0_final.pt │ │ ├── critic_HumanoidPyBulletEnv-v0_checkpoint.pt │ │ └── critic_HumanoidPyBulletEnv-v0_final.pt ├── LunarLander-v2 │ ├── LunarLanderContinuous-v2 (DDPG).html │ ├── LunarLanderContinuous-v2 (DDPG).ipynb │ ├── README.md │ ├── checkpoint_actor.pth │ ├── checkpoint_critic.pth │ ├── ddpg_agent.py │ └── model.py ├── MountainCarContinuous-v0 │ ├── Agent.py │ ├── Model.py │ ├── MountainCarContinuous-v0 (DDPG).html │ ├── MountainCarContinuous-v0 (DDPG).ipynb │ ├── Noise.py │ ├── README.md │ ├── checkpoint_actor.pth │ └── checkpoint_critic.pth ├── Taxi-v2 │ ├── README.md │ ├── agent.py │ ├── main.py │ └── monitor.py └── Taxi-v3 │ ├── Reinforcement Learning.ppsx │ └── Taxi-v3.ipynb ├── README.md └── Unity-ML └── Soccer ├── Agent.py ├── Model.py ├── Noise.py ├── Soccer.ipynb ├── Soccer_Windows_x86_64 ├── .DS_Store └── Soccer_Data │ ├── MonoBleedingEdge │ └── etc │ │ └── mono │ │ ├── 2.0 │ │ ├── Browsers │ │ │ └── Compat.browser │ │ ├── DefaultWsdlHelpGenerator.aspx │ │ ├── machine.config │ │ ├── settings.map │ │ └── web.config │ │ ├── 4.0 │ │ ├── Browsers │ │ │ └── Compat.browser │ │ ├── DefaultWsdlHelpGenerator.aspx │ │ ├── machine.config │ │ ├── settings.map │ │ └── web.config │ │ ├── 4.5 │ │ ├── Browsers │ │ │ └── Compat.browser │ │ ├── DefaultWsdlHelpGenerator.aspx │ │ ├── machine.config │ │ ├── settings.map │ │ └── web.config │ │ ├── browscap.ini │ │ ├── config │ │ └── mconfig │ │ └── config.xml │ ├── Resources │ ├── unity default resources │ └── unity_builtin_extra │ ├── app.info │ ├── boot.config │ ├── globalgamemanagers │ ├── globalgamemanagers.assets │ ├── level0 │ ├── resources.assets │ ├── sharedassets0.assets │ └── sharedassets0.assets.resS ├── checkpoint_goalie_actor.pth ├── checkpoint_goalie_critic.pth ├── checkpoint_striker_actor.pth └── checkpoint_striker_critic.pth /.gitattributes: -------------------------------------------------------------------------------- 1 | Unity-ML/Soccer/Soccer_Windows_x86_64/* linguist-vendored 2 | *.html linguist-vendored 3 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # IntelliJ 107 | .idea/ -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/Model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | 6 | # Hyperparameters 7 | gamma = 0.99 # discount for future rewards 8 | batch_size = 100 # num of transitions sampled from replay buffer 9 | polyak = 0.995 # target policy update parameter (1-tau) 10 | policy_noise = 0.2 # target policy smoothing noise 11 | noise_clip = 0.5 12 | policy_delay = 2 # delayed policy updates parameter 13 | 14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 15 | 16 | class Actor(nn.Module): 17 | def __init__(self, state_dim, action_dim, max_action): 18 | super(Actor, self).__init__() 19 | 20 | self.l1 = nn.Linear(state_dim, 400) 21 | self.l2 = nn.Linear(400, 300) 22 | self.l3 = nn.Linear(300, action_dim) 23 | 24 | self.max_action = max_action 25 | 26 | def forward(self, state): 27 | a = F.relu(self.l1(state)) 28 | a = F.relu(self.l2(a)) 29 | a = torch.tanh(self.l3(a)) * self.max_action 30 | return a 31 | 32 | class Critic(nn.Module): 33 | def __init__(self, state_dim, action_dim): 34 | super(Critic, self).__init__() 35 | 36 | self.l1 = nn.Linear(state_dim + action_dim, 400) 37 | self.l2 = nn.Linear(400, 300) 38 | self.l3 = nn.Linear(300, 1) 39 | 40 | def forward(self, state, action): 41 | state_action = torch.cat([state, action], 1) 42 | 43 | q = F.relu(self.l1(state_action)) 44 | q = F.relu(self.l2(q)) 45 | q = self.l3(q) 46 | return q 47 | 48 | class TD3: 49 | def __init__(self, state_dim, action_dim, max_action): 50 | 51 | self.actor = 
Actor(state_dim, action_dim, max_action).to(device) 52 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 53 | self.actor_target.load_state_dict(self.actor.state_dict()) 54 | self.actor_optimizer = optim.Adam(self.actor.parameters()) 55 | 56 | self.critic_1 = Critic(state_dim, action_dim).to(device) 57 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 58 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 59 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters()) 60 | 61 | self.critic_2 = Critic(state_dim, action_dim).to(device) 62 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 63 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 64 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters()) 65 | 66 | self.max_action = max_action 67 | 68 | def select_action(self, state): 69 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 70 | return self.actor(state).cpu().data.numpy().flatten() 71 | 72 | def update(self, replay_buffer, n_iter): 73 | 74 | for i in range(n_iter): 75 | # Sample a batch of transitions from replay buffer: 76 | state, action_, reward, next_state, done = replay_buffer.sample(batch_size) 77 | state = torch.FloatTensor(state).to(device) 78 | action = torch.FloatTensor(action_).to(device) 79 | reward = torch.FloatTensor(reward).reshape((batch_size,1)).to(device) 80 | next_state = torch.FloatTensor(next_state).to(device) 81 | done = torch.FloatTensor(done).reshape((batch_size,1)).to(device) 82 | 83 | # Select next action according to target policy: 84 | noise = torch.FloatTensor(action_).data.normal_(0, policy_noise).to(device) 85 | noise = noise.clamp(-noise_clip, noise_clip) 86 | next_action = (self.actor_target(next_state) + noise) 87 | next_action = next_action.clamp(-self.max_action, self.max_action) 88 | 89 | # Compute target Q-value: 90 | target_Q1 = self.critic_1_target(next_state, next_action) 91 | target_Q2 = self.critic_2_target(next_state, next_action) 92 | target_Q = torch.min(target_Q1, target_Q2) 93 | target_Q = reward + ((1-done) * gamma * target_Q).detach() 94 | 95 | # Optimize Critic 1: 96 | current_Q1 = self.critic_1(state, action) 97 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 98 | self.critic_1_optimizer.zero_grad() 99 | loss_Q1.backward() 100 | self.critic_1_optimizer.step() 101 | 102 | # Optimize Critic 2: 103 | current_Q2 = self.critic_2(state, action) 104 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 105 | self.critic_2_optimizer.zero_grad() 106 | loss_Q2.backward() 107 | self.critic_2_optimizer.step() 108 | 109 | # Delayed policy updates: 110 | if i % policy_delay == 0: 111 | # Compute actor loss: 112 | actor_loss = -self.critic_1(state, self.actor(state)).mean() 113 | 114 | # Optimize the actor 115 | self.actor_optimizer.zero_grad() 116 | actor_loss.backward() 117 | self.actor_optimizer.step() 118 | 119 | # Polyak averaging update: 120 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 121 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 122 | 123 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 124 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 125 | 126 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 127 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 128 | 129 | 130 | def save(self, 
directory, name): 131 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, name)) 132 | torch.save(self.actor_target.state_dict(), '%s/%s_actor_target.pth' % (directory, name)) 133 | 134 | torch.save(self.critic_1.state_dict(), '%s/%s_crtic_1.pth' % (directory, name)) 135 | torch.save(self.critic_1_target.state_dict(), '%s/%s_critic_1_target.pth' % (directory, name)) 136 | 137 | torch.save(self.critic_2.state_dict(), '%s/%s_crtic_2.pth' % (directory, name)) 138 | torch.save(self.critic_2_target.state_dict(), '%s/%s_critic_2_target.pth' % (directory, name)) 139 | 140 | def load(self, directory, name): 141 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage)) 142 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 143 | 144 | self.critic_1.load_state_dict(torch.load('%s/%s_crtic_1.pth' % (directory, name), map_location=lambda storage, loc: storage)) 145 | self.critic_1_target.load_state_dict(torch.load('%s/%s_critic_1_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 146 | 147 | self.critic_2.load_state_dict(torch.load('%s/%s_crtic_2.pth' % (directory, name), map_location=lambda storage, loc: storage)) 148 | self.critic_2_target.load_state_dict(torch.load('%s/%s_critic_2_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 149 | 150 | 151 | def load_actor(self, directory, name): 152 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage)) 153 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 154 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/README.md: -------------------------------------------------------------------------------- 1 | # BipedalWalker Problem 2 | 3 | ### Getting Started 4 | The environment to the BipedalWalker is described [here](https://github.com/openai/gym/wiki/BipedalWalker-v2). 5 | 6 | ### Solution Video 7 | [![BipedalWalker-v2](http://img.youtube.com/vi/QW6fWP5FDoU/0.jpg)](https://www.youtube.com/watch?v=QW6fWP5FDoU "BipedalWalker-v2") 8 | 9 | The video shows in the first part the behaviour of the untrained agent and then in comparison the behaviour of the trained agent. 10 | 11 | ### Solution Info 12 | My learning algorithm is a [Twin Delayed Deep Deterministic Policy Gradient algorithm (TD3)]([https://arxiv.org/pdf/1802.09477.pdf]). 13 | 14 | ### Instructions 15 | 16 | start Jupyter Notebook `BipedalWalker-v2.ipynb` and follow the instructions. 
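For orientation, the sketch below shows how the `TD3` class from `Model.py` and the `ReplayBuffer` are typically wired together in a training loop. It is a minimal illustration, not the notebook's actual code: the exploration-noise level, the episode/timestep limits, and the older `gym` `reset()`/`step()` API are assumptions.

```python
# Minimal TD3 training-loop sketch using the classes in this folder.
# Hyperparameters here (exploration noise, episode limits) are illustrative only.
import gym
import numpy as np

from Model import TD3
from ReplayBuffer import ReplayBuffer

env = gym.make("BipedalWalker-v2")   # older gym API: reset() -> state, step() -> 4-tuple
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

agent = TD3(state_dim, action_dim, max_action)
buffer = ReplayBuffer()

exploration_noise = 0.1              # assumed value for illustration
max_episodes, max_timesteps = 1000, 2000

for episode in range(1, max_episodes + 1):
    state = env.reset()
    episode_reward = 0
    for t in range(max_timesteps):
        # Deterministic actor output plus Gaussian exploration noise, clipped to the action bounds.
        action = agent.select_action(state)
        action = action + np.random.normal(0, exploration_noise, size=action_dim)
        action = action.clip(env.action_space.low, env.action_space.high)

        next_state, reward, done, _ = env.step(action)
        buffer.add((state, action, reward, next_state, float(done)))

        state = next_state
        episode_reward += reward
        if done:
            break

    # One round of critic/actor updates per environment step taken this episode.
    agent.update(buffer, t)
    print(f"Episode {episode}\treward {episode_reward:.1f}")
```

The checkpoints under `preTrained/` follow exactly the naming scheme produced by `TD3.save()` and consumed by `TD3.load()` (including the `crtic` spelling used in the code).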
17 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self): 5 | self.buffer = [] 6 | 7 | def add(self, transition): 8 | self.buffer.append(transition) 9 | 10 | def sample(self, batch_size): 11 | indexes = np.random.randint(0, len(self.buffer), size=batch_size) 12 | state, action, reward, next_state, done = [], [], [], [], [] 13 | 14 | for i in indexes: 15 | s, a, r, s_, d = self.buffer[i] 16 | state.append(np.array(s, copy=False)) 17 | action.append(np.array(a, copy=False)) 18 | reward.append(np.array(r, copy=False)) 19 | next_state.append(np.array(s_, copy=False)) 20 | done.append(np.array(d, copy=False)) 21 | 22 | return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done) 23 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_1_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_2_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_1.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_2.pth: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_2.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/Agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.optim as optim 5 | import numpy as np 6 | 7 | # Hyperparameters 8 | gamma = 0.99 # discount for future rewards 9 | batch_size = 100 # num of transitions sampled from replay buffer 10 | polyak = 0.995 # target policy update parameter (1-tau) 11 | policy_noise = 0.2 # target policy smoothing noise 12 | noise_clip = 0.5 13 | exploration_noise = 0.1 14 | 15 | policy_delay = 2 # delayed policy updates parameter 16 | LR_ACTOR = 0.001 17 | LR_CRITIC = 0.001 18 | 19 | WEIGHT_DECAY = 0.0 20 | 21 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 22 | 23 | class Actor(nn.Module): 24 | def __init__(self, state_dim, action_dim, max_action): 25 | super(Actor, self).__init__() 26 | 27 | self.l1 = nn.Linear(state_dim, 400) 28 | self.l2 = nn.Linear(400, 300) 29 | self.l3 = nn.Linear(300, action_dim) 30 | 31 | self.max_action = max_action 32 | 33 | def forward(self, state): 34 | a = F.relu(self.l1(state)) 35 | a = F.relu(self.l2(a)) 36 | a = torch.tanh(self.l3(a)) * self.max_action 37 | return a 38 | 39 | class Critic(nn.Module): 40 | def __init__(self, state_dim, action_dim): 41 | super(Critic, self).__init__() 42 | 43 | self.l1 = nn.Linear(state_dim + action_dim, 400) 44 | self.l2 = nn.Linear(400, 300) 45 | self.l3 = nn.Linear(300, 1) 46 | 47 | def forward(self, state, action): 48 | state_action = torch.cat([state, action], 1) 49 | 50 | q = F.relu(self.l1(state_action)) 51 | q = F.relu(self.l2(q)) 52 | q = self.l3(q) 53 | return q 54 | 55 | class TD3: 56 | def __init__(self, env): 57 | self.env = env 58 | 59 | state_dim = env.observation_space.shape[0] 60 | action_dim = env.action_space.shape[0] 61 | max_action = float(env.action_space.high[0]) 62 | 63 | self.actor = Actor(state_dim, action_dim, max_action).to(device) 64 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device) 65 | self.actor_target.load_state_dict(self.actor.state_dict()) 66 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR) 67 | 68 | self.critic_1 = Critic(state_dim, action_dim).to(device) 69 | self.critic_1_target = Critic(state_dim, action_dim).to(device) 70 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 71 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) 72 | 73 | self.critic_2 = Critic(state_dim, action_dim).to(device) 74 | self.critic_2_target = Critic(state_dim, action_dim).to(device) 75 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 76 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) 77 | 78 | self.max_action = max_action 79 | 80 | def select_action(self, state): 81 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 82 | action = self.actor(state).cpu().data.numpy().flatten() 83 | 84 | if exploration_noise != 0: 85 | action = (action + np.random.normal(0, exploration_noise, size=self.env.action_space.shape[0])) 86 | 87 | return action.clip(self.env.action_space.low, self.env.action_space.high) 88 | 89 | def update(self, 
replay_buffer, n_iter): 90 | for i in range(n_iter): 91 | state, action, reward, next_state, done = replay_buffer.sample(batch_size) 92 | 93 | state = torch.FloatTensor(state).to(device) 94 | action = torch.FloatTensor(action).to(device) 95 | reward = torch.FloatTensor(reward).to(device) 96 | next_state = torch.FloatTensor(next_state).to(device) 97 | done = torch.FloatTensor(done).to(device) 98 | 99 | # Select next action according to target policy: 100 | noise = torch.empty_like(action).data.normal_(0, policy_noise).to(device) 101 | noise = noise.clamp(-noise_clip, noise_clip) 102 | next_action = (self.actor_target(next_state) + noise) 103 | next_action = next_action.clamp(-self.max_action, self.max_action) 104 | 105 | # Compute target Q-value: 106 | target_Q1 = self.critic_1_target(next_state, next_action) 107 | target_Q2 = self.critic_2_target(next_state, next_action) 108 | target_Q = torch.min(target_Q1, target_Q2) 109 | target_Q = reward + ((1-done) * gamma * target_Q).detach() 110 | 111 | # Optimize Critic 1: 112 | current_Q1 = self.critic_1(state, action) 113 | loss_Q1 = F.mse_loss(current_Q1, target_Q) 114 | 115 | self.critic_1_optimizer.zero_grad() 116 | loss_Q1.backward() 117 | self.critic_1_optimizer.step() 118 | 119 | # Optimize Critic 2: 120 | current_Q2 = self.critic_2(state, action) 121 | loss_Q2 = F.mse_loss(current_Q2, target_Q) 122 | 123 | self.critic_2_optimizer.zero_grad() 124 | loss_Q2.backward() 125 | self.critic_2_optimizer.step() 126 | 127 | # Delayed policy updates: 128 | if i % policy_delay == 0: 129 | # Compute actor loss: 130 | actor_loss = -self.critic_1(state, self.actor(state)).mean() 131 | 132 | # Optimize the actor 133 | self.actor_optimizer.zero_grad() 134 | actor_loss.backward() 135 | self.actor_optimizer.step() 136 | 137 | # Polyak averaging update: 138 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 139 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 140 | 141 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 142 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 143 | 144 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 145 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data)) 146 | 147 | 148 | def save(self, directory, name): 149 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, name)) 150 | torch.save(self.actor_target.state_dict(), '%s/%s_actor_target.pth' % (directory, name)) 151 | 152 | torch.save(self.critic_1.state_dict(), '%s/%s_crtic_1.pth' % (directory, name)) 153 | torch.save(self.critic_1_target.state_dict(), '%s/%s_critic_1_target.pth' % (directory, name)) 154 | 155 | torch.save(self.critic_2.state_dict(), '%s/%s_crtic_2.pth' % (directory, name)) 156 | torch.save(self.critic_2_target.state_dict(), '%s/%s_critic_2_target.pth' % (directory, name)) 157 | 158 | def load(self, directory, name): 159 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage)) 160 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 161 | 162 | self.critic_1.load_state_dict(torch.load('%s/%s_crtic_1.pth' % (directory, name), map_location=lambda storage, loc: storage)) 163 | self.critic_1_target.load_state_dict(torch.load('%s/%s_critic_1_target.pth' % (directory, 
name), map_location=lambda storage, loc: storage)) 164 | 165 | self.critic_2.load_state_dict(torch.load('%s/%s_crtic_2.pth' % (directory, name), map_location=lambda storage, loc: storage)) 166 | self.critic_2_target.load_state_dict(torch.load('%s/%s_critic_2_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 167 | 168 | 169 | def load_actor(self, directory, name): 170 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage)) 171 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage)) 172 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/README.md: -------------------------------------------------------------------------------- 1 | # BipedalWalker Problem 2 | 3 | ### Getting Started 4 | The environment to the BipedalWalker is described [here](https://github.com/openai/gym/wiki/BipedalWalker-v2). 5 | 6 | ### Solution Video 7 | [![BipedalWalker-v3](http://img.youtube.com/vi/14yGAsIG-Rs/0.jpg)](https://www.youtube.com/watch?v=14yGAsIG-Rs "BipedalWalker-v3") 8 | 9 | The video shows in the first part the behaviour of the untrained agent and then in comparison the behaviour of the trained agent. 10 | 11 | ### Solution Info 12 | My learning algorithm is a [Twin Delayed Deep Deterministic Policy Gradient algorithm (TD3)]([https://arxiv.org/pdf/1802.09477.pdf]). 13 | 14 | ### Instructions 15 | 16 | start Jupyter Notebook `BipedalWalker-v3.ipynb` and follow the instructions. 17 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/ReplayBuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer: 4 | def __init__(self, max_size=1000000): 5 | self.buffer = [] 6 | self.max_size = int(max_size) 7 | self.size = 0 8 | 9 | def add(self, transition): 10 | self.size +=1 11 | # transiton is tuple of (state, action, reward, next_state, done) 12 | self.buffer.append(transition) 13 | 14 | def sample(self, batch_size): 15 | # delete 1/5th of the buffer when full 16 | if self.size > self.max_size: 17 | del self.buffer[0:int(self.size/5)] 18 | self.size = len(self.buffer) 19 | 20 | indexes = np.random.randint(0, len(self.buffer), size=batch_size) 21 | state, action, reward, next_state, done = [], [], [], [], [] 22 | 23 | for i in indexes: 24 | s, a, r, s_, d = self.buffer[i] 25 | state.append(np.array(s, copy=False)) 26 | action.append(np.array(a, copy=False)) 27 | reward.append(np.array(r, copy=False)) 28 | next_state.append(np.array(s_, copy=False)) 29 | done.append(np.array(d, copy=False)) 30 | 31 | return np.array(state), np.array(action), np.array(reward).reshape(-1, 1), np.array(next_state), np.array(done).reshape(-1, 1) 32 | -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor_target.pth: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_1_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_1_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_2_target.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_2_target.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_1.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_1.pth -------------------------------------------------------------------------------- /OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_2.pth -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | /logs 3 | -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/CartPole-v0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import gym\n", 18 | "import sys\n", 19 | "import numpy as np\n", 20 | "import random as rn\n", 21 | "import cv2\n", 22 | "import time\n", 23 | "import functools\n", 24 | "import datetime\n", 25 | "import tensorflow as tf\n", 26 | "from agents.DDQN import *\n", 27 | "from IPython import display\n", 28 | "import matplotlib\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "from matplotlib import gridspec\n", 31 | "%matplotlib inline\n", 32 | "\n", 33 | "np.set_printoptions(threshold=sys.maxsize)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "#### Set seed" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "SEED = 789325\n", 50 | "\n", 51 | "rn.seed(SEED)\n", 52 | "np.random.seed(SEED)\n", 53 | 
"tf.random.set_seed(SEED)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "#### Environment" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def build_environment(envName=\"CartPole-v0\", seed=None):\n", 70 | " env = gym.make(envName) \n", 71 | " if seed is not None:\n", 72 | " env.seed(seed) \n", 73 | " \n", 74 | " return env" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "#### Show Environment information" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": { 88 | "scrolled": true 89 | }, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "Actions: 2\n", 96 | "Size of state: 4\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "env = build_environment(seed=SEED)\n", 102 | "\n", 103 | "# size of each action\n", 104 | "action_size = env.action_space.n\n", 105 | "print('Actions: ', action_size)\n", 106 | "if hasattr(env.env, 'get_action_meanings'):\n", 107 | " print(env.env.get_action_meanings())\n", 108 | "\n", 109 | "# examine the state space \n", 110 | "states = env.observation_space.shape\n", 111 | "state_size = states[0]\n", 112 | "print('Size of state:', state_size)\n", 113 | "\n", 114 | "env.close()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "# Training" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "def build_agent(pre_trained=None):\n", 131 | " return DDQNAgent(state_size,\n", 132 | " action_size,\n", 133 | " buffer_size=2000,\n", 134 | " epsilon_start=0.5,\n", 135 | " epsilon_steps_to_min=3500,\n", 136 | " mode=\"DuelingDQN\",\n", 137 | " use_PER=True,\n", 138 | " pre_trained=pre_trained)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "logdir = \"logs/\" + time.strftime(\"%Y%m%d_%H%M%S\")\n", 148 | "writer = tf.summary.create_file_writer(logdir)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 7, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "SAVE_EVERY_EPISODES = 100\n", 158 | "LEARNING_START_AFTER_STEPS = 500\n", 159 | "EPISODES = 80\n", 160 | "SCORE_TO_SOLVE = 195.0\n", 161 | "\n", 162 | "UPDATE_MODE = 'soft'\n", 163 | "UPDATE_TARGET_FREQUENCY = 10" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "Training started: 2019-12-29 13:29:18.235840\n", 176 | "Episode 1: Step 15 reward 15.0: \n", 177 | "Save model...\n", 178 | "Episode 5: Step 80 reward 30.0: \n", 179 | "Save model...\n", 180 | "Episode 6: Step 123 reward 43.0: \n", 181 | "Save model...\n", 182 | "Episode 7: Step 180 reward 57.0: \n", 183 | "Save model...\n" 184 | ] 185 | }, 186 | { 187 | "name": "stderr", 188 | "output_type": "stream", 189 | "text": [ 190 | "D:\\Deep Learning\\Reinforcement-Learning\\OpenAI\\CartPole-v0\\memory.py:47: RuntimeWarning: divide by zero encountered in double_scalars\n", 191 | " max_weight = (p_min * n) ** (-self.PER_b)\n", 192 | "D:\\Deep Learning\\Reinforcement-Learning\\OpenAI\\CartPole-v0\\memory.py:47: RuntimeWarning: divide by zero encountered in 
double_scalars\n", 193 | " max_weight = (p_min * n) ** (-self.PER_b)\n" 194 | ] 195 | }, 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "Episode 46: Step 860 reward 58.0: \n", 201 | "Save model...\n", 202 | "Episode 47: Step 947 reward 87.0: \n", 203 | "Save model...\n", 204 | "Episode 61: Step 1757 reward 92.0: \n", 205 | "Save model...\n", 206 | "Episode 65: Step 2039 reward 105.0: \n", 207 | "Save model...\n", 208 | "Episode 67: Step 2269 reward 133.0: \n", 209 | "Save model...\n", 210 | "Episode 68: Step 2469 reward 200.0: \n", 211 | "Save model...\n", 212 | "Save model...\n", 213 | "Training finished\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "def train():\n", 219 | " env = build_environment(seed=SEED)\n", 220 | " agent = build_agent()\n", 221 | " \n", 222 | " max_reward = -9999999 \n", 223 | " game_rewards_deque = deque(maxlen=100) \n", 224 | " frame_count = 0\n", 225 | " \n", 226 | " print(\"Training started: \" + str(datetime.datetime.now()))\n", 227 | " \n", 228 | " frame_count = 0\n", 229 | " \n", 230 | " for i_episode in range(1, EPISODES+1):\n", 231 | " state = env.reset()\n", 232 | " \n", 233 | " game_reward = 0\n", 234 | " steps = 0\n", 235 | " \n", 236 | " while True:\n", 237 | " frame_count += 1\n", 238 | " steps += 1\n", 239 | " \n", 240 | " state = agent.preprocess(state) \n", 241 | " action = agent.act(state) \n", 242 | " \n", 243 | " next_state, reward, done, info = env.step(action) \n", 244 | " game_reward += reward\n", 245 | " \n", 246 | " agent.remember(state[0], action, reward, next_state, done)\n", 247 | " \n", 248 | " state = next_state\n", 249 | " \n", 250 | " if frame_count % 10000 == 0:\n", 251 | " print(\"Step count: {}\".format(frame_count))\n", 252 | " \n", 253 | " if done:\n", 254 | " break \n", 255 | " \n", 256 | " if frame_count > LEARNING_START_AFTER_STEPS: \n", 257 | " agent.train()\n", 258 | " if UPDATE_MODE == \"soft\":\n", 259 | " agent.soft_update_target_network()\n", 260 | " \n", 261 | " \n", 262 | " if UPDATE_MODE == \"hard\" and frame_count % UPDATE_TARGET_FREQUENCY == 0:\n", 263 | " agent.hard_update_target_network()\n", 264 | " \n", 265 | " # Log episode reward\n", 266 | " with writer.as_default():\n", 267 | " tf.summary.scalar(\"epsilon\", agent.epsilon, step=i_episode)\n", 268 | " tf.summary.scalar(\"game_reward\", game_reward, step=i_episode) \n", 269 | " \n", 270 | " if i_episode % SAVE_EVERY_EPISODES == 0:\n", 271 | " print(\"Save after {} episodes.\".format(i_episode))\n", 272 | " agent.save() \n", 273 | " \n", 274 | " game_rewards_deque.append(game_reward)\n", 275 | " \n", 276 | " if game_reward > max_reward:\n", 277 | " print(\"Episode {}: Step {} reward {}: \".format(i_episode, frame_count, game_reward))\n", 278 | " max_reward = game_reward\n", 279 | " agent.save() \n", 280 | " \n", 281 | " if np.mean(game_rewards_deque) >= SCORE_TO_SOLVE:\n", 282 | " agent.save()\n", 283 | " print(\"Solved in Episode {} Step {} reward {}: \".format(i_episode, frame_count, game_reward))\n", 284 | " break \n", 285 | " \n", 286 | " env.close()\n", 287 | " agent.save()\n", 288 | " \n", 289 | "train()\n", 290 | "print(\"Training finished\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "metadata": {}, 296 | "source": [ 297 | "# Show Result" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 9, 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "Episode finished with score: 161.0\n" 310 | ] 311 
| }, 312 | { 313 | "data": { 314 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAARYUlEQVR4nO3df6zddX3H8edrpVYjJMK4kNofo3M1GZhZ3E1nwrIwcdKxH8U/WEoy0z9Iyh+QaGaygSZD/mjiFn/sn2FWB7HZ1K6JEjrCNmunMSaO0mLBllK5SoVrm7bojLA/6lre++N+Ow7ltvf03nu8/ZzzfCQn5/v9fL/fc95vQl98+ZzP6UlVIUlqx68sdAGSpAtjcEtSYwxuSWqMwS1JjTG4JakxBrckNWZgwZ1kXZJDSSaS3DOo95GkUZNBrONOsgj4PvAHwCTwBHB7VT0z728mSSNmUHfca4GJqvphVf0C2AasH9B7SdJIuWRAr7sMeLFnfxL4nXOdfOWVV9Y111wzoFIkqT2HDx/mpZdeynTHBhXc073Z6+ZkkmwCNgGsXLmSPXv2DKgUSWrP+Pj4OY8NaqpkEljRs78cONJ7QlVtqarxqhofGxsbUBmSNHwGFdxPAKuTrEryJmADsGNA7yVJI2UgUyVVdSrJ3cB/AIuAh6rqwCDeS5JGzaDmuKmqx4DHBvX6kjSq/OakJDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGzOmny5IcBl4GTgOnqmo8yRXAvwDXAIeBP6uq/55bmZKkM+bjjvv3q2pNVY13+/cAu6pqNbCr25ckzZNBTJWsB7Z221uBWwfwHpI0suYa3AV8LcneJJu6saur6ihA93zVHN9DktRjTnPcwA1VdSTJVcDOJM/2e2EX9JsAVq5cOccyJGl0zOmOu6qOdM/HgYeBtcCxJEsBuufj57h2S1WNV9X42NjYXMqQpJEy6+BO8tYkl53ZBj4A7Ad2ABu70zYCj8y1SEnSa+YyVXI18HCSM6/zpar69yRPANuT3AG8ANw29zIlSWfMOrir6ofAu6cZ/wlw01yKkiSdm9+clKTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhozY3AneSjJ8ST7e8auSLIzyXPd8+U9x+5NMpHkUJKbB1W4JI2qfu64vwCsO2vsHmBXVa0GdnX7JLkW2ABc113zQJJF81atJGnm4K6qbwE/PWt4PbC1294K3Nozvq2qTlbV88AEsHaeapUkMfs57qur6ihA93xVN74MeLHnvMlu7A2SbEqyJ8meEydOzLIMSRo98/3hZKYZq+lOrKotVTVeVeNjY2PzXIYkDa/ZBvexJEsBuufj3fgksKLnvOXAkdmXJ0k622yDewewsdveCDzSM74hyZIkq4DVwO65lShJ6nXJTCck+TJwI3BlkkngPuCTwPYkdwAvALcBVNWBJNuBZ4BTwF1VdXpAtUvSSJoxuKvq9nMcuukc528GNs+lKEnSufnNSUlqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjZkxuJM8lOR4kv09Y59I8uMk+7rHLT3H7k0ykeRQkpsHVbgkjap+7ri/AKybZvyzVbWmezwGkORaYANwXXfNA0kWzVexkqQ+gruqvgX8tM/XWw9sq6qTVfU8MAGsnUN9kqSzzGWO++4kT3dTKZd3Y8uAF3vOmezG3iDJpiR7kuw5ceLEHMqQpNEy2+D+HPAOYA1wFPh0N55pzq3pXqCqtlTVeFWNj42NzbIMSRo9swruqjpWVaer6lXg87w2HTIJrOg5dTlwZG4lSpJ6zSq4kyzt2f0gcGbFyQ5gQ5IlSVYBq4HdcytRktTrkplOSPJl4EbgyiSTwH3AjUnWMDUNchi4E6CqDiTZDjwDnALuqqrTgyldkkbTjMFdVbdPM/zgec7fDGyeS1GSpHPzm5OS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDVmxnXcUiv2brnzdfu/vekfFqgSabC849bQOjvIpWFhcEtSYwxuSWqMwa2hMN20iHPcGlYGtyQ1xuBW8/wQUqPG4JakxhjcktQYg1uSGmNwayi5okTDbMbgTrIiyTeSHExyIMmHu/ErkuxM8lz3fHnPNfcmmUhyKMnNg2xAkkZNP3fcp4CPVtVvAu8F7kpyLXAPsKuqVgO7un26YxuA64B1wANJFg2ieMkVJRpFMwZ3VR2tqie77ZeBg8AyYD2wtTttK3Brt70e2FZVJ6vqeWACWDvfhUvSqLqgOe4k1wDXA48DV1fVUZgKd+Cq7rRlwIs9l012Y2e/1qYke5LsOXHixIVXLkkjqu/gTnIp8BXgI1X18/OdOs1YvWGgaktVjVfV+NjYWL9lSDPyg0kNu76CO8lipkL7i1X11W74WJKl3fGlwPFufBJY0XP5cuDI/JQrSepnVUmAB4GDVfWZnkM7gI3d9kbgkZ7xDUmWJFkFrAZ2z1/J0hQ/mNSo6ucXcG4APgR8L8m+buxjwCeB7UnuAF4AbgOoqgNJtgPPMLUi5a6qOj3vlUvSiJoxuKvq20w/bw1w0zmu2QxsnkNdkqRz8JuTktQYg1tDxRUlGgUGtyQ1xuBWk1xRolFmcEtSYwxuSWqMwa2h4QeTGhUGtyQ1xuBWc/xgUqPO4JakxhjcktQYg1tDwQ8mNUoMbklqjMEtSY0xuNWU6VaUOE2iUWNwS1JjDG41w/Xb0hSDW5Ia08+PBa9I8o0kB5McSPLhbvwTSX6cZF/3uKXnmnuTTCQ5lOTmQTYgSaOmnx8LPgV8tKqeTHIZsDfJzu7YZ6vqU70nJ7kW2ABcB7wd+HqSd/qDwRoEP5jUKJrxjruqjlbVk932y8BBYNl5LlkPbKuqk1X1PDABrJ2PYiVJFzjHneQa4Hrg8W7o7iRPJ3koyeXd2DLgxZ7LJjl/0Esz8oNJ6TV9B3eSS4GvAB+pqp8DnwPeAawBjgKfPnPqNJfXNK+3KcmeJHtOnDhxwYVL0qjqK7iTLGYqtL9YVV8FqKpjVXW6ql4FPs9r0yGTwIqey5cDR85+zaraUlXjVTU+NjY2lx4kaaT0s6okwIPAwar6TM/40p7TPgjs77Z3ABuSLEmyClgN7J6/kiVptPWzquQG4EPA95Ls68Y+BtyeZA1T0yCHgTsBqupAku3AM0ytSLnLFSUaBFeUaFTNGNxV9W2mn7d+7DzXbAY2z6EuSdI5+M1JXfRcUSK9nsEtSY0xuCWpMQa3muQHkxplBrckNcbg1kXNDyalNz
K4JakxBrckNcbgVnP8YFKjzuCWpMYY3JLUGINbFy1XlEjTM7glqTEGt35pklzQYy6vIw0zg1tNGb9zy0KXIC24fn5IQVoQ/3pk0/9v/8nbDWzpDO+4dVG67749r9vvDXFp1BncaobTJNKUfn4s+M1Jdid5KsmBJPd341ck2Znkue758p5r7k0ykeRQkpsH2YAkjZp+7rhPAu+rqncDa4B1Sd4L3APsqqrVwK5unyTXAhuA64B1wANJFg2ieA2vs+e0neOWXtPPjwUX8Eq3u7h7FLAeuLEb3wp8E/irbnxbVZ0Enk8yAawFvjOfhWu4TU2LvBbW9y9cKdJFp69VJd0d817gN4C/r6rHk1xdVUcBqupokqu605cB/9Vz+WQ3dk579+517a3mlf8+aZj1FdxVdRpYk+RtwMNJ3nWe06f7E1NvOCnZBGwCWLlyJT/60Y/6KUUN+2WG6dT/KErtGh8fP+exC1pVUlU/Y2pKZB1wLMlSgO75eHfaJLCi57LlwJFpXmtLVY1X1fjY2NiFlCFJI62fVSVj3Z02Sd4CvB94FtgBbOxO2wg80m3vADYkWZJkFbAa2D3fhUvSqOpnqmQpsLWb5/4VYHtVPZrkO8D2JHcALwC3AVTVgSTbgWeAU8Bd3VSLJGke9LOq5Gng+mnGfwLcdI5rNgOb51ydJOkN/OakJDXG4JakxhjcktQY/1pX/dK4tlqaH95xS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTG9PNjwW9OsjvJU0kOJLm/G/9Ekh8n2dc9bum55t4kE0kOJbl5kA1I0qjp5+/jPgm8r6peSbIY+HaSf+uOfbaqPtV7cpJrgQ3AdcDbga8neac/GCxJ82PGO+6a8kq3u7h7nO9vxF8PbKuqk1X1PDABrJ1zpZIkoM857iSLkuwDjgM7q+rx7tDdSZ5O8lCSy7uxZcCLPZdPdmOSpHnQV3BX1emqWgMsB9YmeRfwOeAdwBrgKPDp7vRM9xJnDyTZlGRPkj0nTpyYVfGSNIouaFVJVf0M+CawrqqOdYH+KvB5XpsOmQRW9Fy2HDgyzWttqarxqhofGxubVfGSNIr6WVUyluRt3fZbgPcDzyZZ2nPaB4H93fYOYEOSJUlWAauB3fNbtiSNrn5WlSwFtiZZxFTQb6+qR5P8U5I1TE2DHAbuBKiqA0m2A88Ap4C7XFEiSfNnxuCuqqeB66cZ/9B5rtkMbJ5baZKk6fjNSUlqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1JhU1ULXQJITwP8ALy10LQNwJfbVmmHtzb7a8mtVNTbdgYsiuAGS7Kmq8YWuY77ZV3uGtTf7Gh5OlUhSYwxuSWrMxRTcWxa6gAGxr/YMa2/2NSQumjluSVJ/LqY7bklSHxY8uJOsS3IoyUSSexa6nguV5KEkx5Ps7xm7IsnOJM91z5f3HLu36/VQkpsXpuqZJVmR5BtJDiY5kOTD3XjTvSV5c5LdSZ7q+rq/G2+6rzOSLEry3SSPdvvD0tfhJN9Lsi/Jnm5sKHqblapasAewCPgB8OvAm4CngGsXsqZZ9PB7wHuA/T1jfwvc023fA/xNt31t1+MSYFXX+6KF7uEcfS0F3tNtXwZ8v6u/6d6AAJd224uBx4H3tt5XT39/AXwJeHRY/l3s6j0MXHnW2FD0NpvHQt9xrwUmquqHVfULYBuwfoFruiBV9S3gp2cNrwe2dttbgVt7xrdV1cmqeh6YYOqfwUWnqo5W1ZPd9svAQWAZjfdWU17pdhd3j6LxvgCSLAf+CPjHnuHm+zqPYe7tvBY6uJcBL/bsT3Zjrbu6qo7CVAACV3XjTfab5BrgeqbuTpvvrZtO2AccB3ZW1VD0Bfwd8JfAqz1jw9AXTP3H9WtJ9ibZ1I0NS28X7JIFfv9MMzbMy1ya6zfJpcBXgI9U1c+T6VqYOnWasYuyt6o6DaxJ8jbg4STvOs/pTfSV5I+B41W1N8mN/VwyzdhF11ePG6rqSJKrgJ1Jnj3Pua31dsEW+o57EljRs78cOLJAtcynY0mWAnTPx7vxpvpNspip0P5iVX21Gx6K3gCq6mfAN4F1tN/XDcCfJjnM1JTj+5L8M+33BUBVHemejwMPMzX1MRS9zcZCB/cTwOokq5K8CdgA7FjgmubDDmBjt70ReKRnfEOSJUlWAauB3QtQ34wydWv9IHCwqj7Tc6jp3pKMdXfaJHkL8H7gWRrvq6rurarlVXUNU3+O/rOq/pzG+wJI8tYkl53ZBj4A7GcIepu1hf50FLiFqRULPwA+vtD1zKL+LwNHgf9l6r/0dwC/CuwCnuuer+g5/+Ndr4eAP1zo+s/T1+8y9b+XTwP7usctrfcG/Bbw3a6v/cBfd+NN93VWjzfy2qqS5vtiatXZU93jwJmcGIbeZvvwm5OS1JiFniqRJF0gg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMb8H8EOG9Pp82HgAAAAAElFTkSuQmCC\n", 315 | "text/plain": [ 316 | "
" 317 | ] 318 | }, 319 | "metadata": { 320 | "needs_background": "light" 321 | }, 322 | "output_type": "display_data" 323 | } 324 | ], 325 | "source": [ 326 | "env = build_environment(seed=SEED)\n", 327 | "agent = build_agent(pre_trained='model.h5')\n", 328 | "\n", 329 | "state = env.reset()\n", 330 | "final_reward = 0\n", 331 | "\n", 332 | "img = plt.imshow(env.render(mode='rgb_array'))\n", 333 | "while True:\n", 334 | " img.set_data(env.render(mode='rgb_array'))\n", 335 | " display.display(plt.gcf())\n", 336 | " display.clear_output(wait=True)\n", 337 | "\n", 338 | " state = np.reshape(state, [1, state_size])\n", 339 | " action = agent.act(state) \n", 340 | " next_state, reward, done, info = env.step(action)\n", 341 | " final_reward += reward \n", 342 | " \n", 343 | " state = next_state\n", 344 | " \n", 345 | " if done:\n", 346 | " print(\"Episode finished with score: {}\".format(final_reward))\n", 347 | " break\n", 348 | "env.close() " 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [] 357 | } 358 | ], 359 | "metadata": { 360 | "kernelspec": { 361 | "display_name": "Python 3", 362 | "language": "python", 363 | "name": "python3" 364 | }, 365 | "language_info": { 366 | "codemirror_mode": { 367 | "name": "ipython", 368 | "version": 3 369 | }, 370 | "file_extension": ".py", 371 | "mimetype": "text/x-python", 372 | "name": "python", 373 | "nbconvert_exporter": "python", 374 | "pygments_lexer": "ipython3", 375 | "version": "3.6.9" 376 | } 377 | }, 378 | "nbformat": 4, 379 | "nbformat_minor": 2 380 | } -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/README.md: -------------------------------------------------------------------------------- 1 | # CartPole-v0 2 | Reinforcement Learning project to train a neural network to play the 3 | [OpenAI](https://openai.com/) environment [CartPole-v0](https://github.com/openai/gym/wiki/CartPole-v0). 4 | ![CartPole-0](assets/cartpole-v0.jpg "CartPole-v0") 5 | 6 | ### Objectives 7 | 8 | 9 | ## Additional Information 10 | Tensorflow Version: GPU 2.0.0 11 | 12 | ## Installation 13 | 1. Create and activate a new environment. 14 | ``` 15 | conda create -n openai python=3.6 16 | source activate openai 17 | ``` 18 | 2. Install Dependencies. 19 | ``` 20 | pip install -r requirements.txt 21 | pip install gym[atari] 22 | ``` 23 | 24 | ### Launch Jupyter notebook 25 | ``` 26 | jupyter notebook CartPole-v0.ipynb 27 | ``` 28 | 29 | ### Additional commands 30 | Starts Tensorboard Visualisation. 
31 | ``` 32 | tensorboard --logdir=logs/ 33 | ``` 34 | 35 | #### Rewards 36 | ![Reward](assets/game_reward.png "Reward") 37 | 38 | -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/agents/DDQN.py: -------------------------------------------------------------------------------- 1 | from memory import * 2 | from keras.layers import * 3 | from keras.models import * 4 | from keras.optimizers import * 5 | from keras.initializers import * 6 | 7 | class DDQNAgent(object): 8 | def __init__(self, 9 | state_size, 10 | action_size, 11 | buffer_size=10000, 12 | batch_size=32, 13 | gamma=0.99, 14 | epsilon_start=1.0, 15 | epsilon_min=0.1, 16 | epsilon_steps_to_min=1000, 17 | tau=0.1, 18 | mode='QNetwork', 19 | use_PER=True, 20 | pre_trained=None): 21 | 22 | self.state_size = state_size 23 | self.action_size = action_size 24 | 25 | 26 | self.batch_size = batch_size 27 | self.gamma = gamma 28 | self.epsilon = epsilon_start 29 | self.epsilon_min = epsilon_min 30 | self.epsilon_step = (self.epsilon - self.epsilon_min) / epsilon_steps_to_min 31 | self.tau = tau 32 | 33 | self.model = self.build_model(mode, pre_trained) 34 | self.target_model = self.build_model(mode, pre_trained) 35 | self.hard_update_target_network() 36 | 37 | self.use_PER = use_PER 38 | 39 | if self.use_PER: 40 | self.replay_buffer = PrioritizedReplayBuffer(capacity=buffer_size) 41 | else: 42 | self.replay_buffer = Memory(max_size=buffer_size) 43 | 44 | def build_model(self, mode, pre_trained): 45 | model = Sequential() 46 | model.add(Dense(64, input_dim=self.state_size, activation='relu')) 47 | model.add(Dense(64, activation='relu')) 48 | 49 | if mode == "QNetwork": 50 | model.add(Dense(self.action_size, activation='linear')) 51 | 52 | if mode == "DuelingDQN": 53 | model.add(Dense(self.action_size + 1, activation='linear')) 54 | model.add(Lambda(lambda i: K.expand_dims(i[:,0],-1) + i[:,1:] - K.mean(i[:,1:], keepdims=True), 55 | output_shape=(self.action_size,))) 56 | 57 | if pre_trained: 58 | model = load_model(pre_trained) 59 | 60 | model.compile(optimizer=Adam(lr=0.001), loss='mse') 61 | return model 62 | 63 | def hard_update_target_network(self): 64 | pars = self.model.get_weights() 65 | self.target_model.set_weights(pars) 66 | 67 | def soft_update_target_network(self): 68 | pars_behavior = self.model.get_weights() 69 | pars_target = self.target_model.get_weights() 70 | 71 | ctr = 0 72 | for par_behavior,par_target in zip(pars_behavior,pars_target): 73 | par_target = par_target*(1-self.tau) + par_behavior*self.tau 74 | pars_target[ctr] = par_target 75 | ctr += 1 76 | 77 | self.target_model.set_weights(pars_target) 78 | 79 | def remember(self, state, action, reward, next_state, done): 80 | self.replay_buffer.add((state, action, reward, next_state, done)) 81 | 82 | def preprocess(self, state): 83 | return np.reshape(state, [1, self.state_size]) 84 | 85 | def act(self, state): 86 | # Update exploration rate 87 | if self.epsilon > self.epsilon_min: 88 | self.epsilon -= self.epsilon_step 89 | 90 | # Choose Action 91 | if np.random.rand() <= self.epsilon: 92 | action = np.random.choice(self.action_size) 93 | else: 94 | Qs = self.model.predict(state)[0] 95 | action = np.argmax(Qs) 96 | 97 | return action 98 | 99 | def train(self): 100 | indices, mini_batch, importance = self.replay_buffer.sample(self.batch_size) 101 | 102 | states = [] 103 | actions = [] 104 | rewards = [] 105 | next_states = [] 106 | dones = [] 107 | 108 | Q_wants = [] 109 | td_errors = np.zeros(self.batch_size) 110 | 111 | 
for i in range(len(mini_batch)): 112 | if not self.use_PER: 113 | state, action, reward, next_state, done = mini_batch[i] 114 | else: 115 | state = mini_batch[i][0][0] 116 | action = mini_batch[i][0][1] 117 | reward = mini_batch[i][0][2] 118 | next_state = mini_batch[i][0][3] 119 | done = mini_batch[i][0][4] 120 | 121 | states.append(state) 122 | actions.append(action) 123 | rewards.append(reward) 124 | next_states.append(next_state) 125 | dones.append(done) 126 | 127 | states_tensor = np.reshape(states,(self.batch_size,len(states[0]))) 128 | Q_wants_pred = self.model.predict(states_tensor) 129 | 130 | next_states_tensor = np.reshape(next_states,(self.batch_size,len(next_states[0]))) 131 | Q_next_state_vecs = self.model.predict(next_states_tensor) 132 | Q_target_next_state_vecs = self.target_model.predict(next_states_tensor) 133 | 134 | for i in range(len(mini_batch)): 135 | action = actions[i] 136 | reward = rewards[i] 137 | done = dones[i] 138 | 139 | Q_want = Q_wants_pred[i] 140 | Q_want_old = Q_want[action] 141 | 142 | if done: 143 | Q_want[action] = reward 144 | else: 145 | Q_next_state_vec = Q_next_state_vecs[i] 146 | action_max = np.argmax(Q_next_state_vec) 147 | 148 | Q_target_next_state_vec = Q_target_next_state_vecs[i] 149 | Q_target_next_state_max = Q_target_next_state_vec[action_max] 150 | 151 | Q_want[action] = reward + self.gamma*Q_target_next_state_max 152 | Q_want_tensor = np.reshape(Q_want,(1,len(Q_want))) 153 | 154 | Q_wants.append(Q_want) 155 | td_errors[i] = abs(Q_want_old - Q_want[action]) 156 | 157 | states = np.array(states) 158 | Q_wants = np.array(Q_wants) 159 | self.model.fit(states, Q_wants, verbose=False, epochs=1) 160 | 161 | # update replay buffer 162 | self.replay_buffer.batch_update(indices, np.array(td_errors)) 163 | 164 | def save(self, file='model.h5'): 165 | print('Save model...') 166 | self.model.save(file) 167 | -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/CartPole-v0/agents/__init__.py -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/assets/cartpole-v0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/CartPole-v0/assets/cartpole-v0.jpg -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/assets/game_reward.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/CartPole-v0/assets/game_reward.png -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | 4 | class Memory(object): 5 | def __init__(self, max_size=2000): 6 | self.max_size = max_size 7 | self.buffer = deque(maxlen=max_size) 8 | 9 | def add(self, experience): 10 | if len(self.buffer) <= self.max_size: 11 | self.buffer.append(experience) 12 | else: 13 | self.buffer[0] = experience 14 | 15 | def sample(self, batch_size): 16 | return 
[], rn.sample(self.buffer, batch_size), [] 17 | 18 | def batch_update(self, indices, td_errors): 19 | pass 20 | 21 | class PrioritizedReplayBuffer(object): 22 | PER_e = 0.01 23 | PER_a = 0.6 24 | PER_b = 0.4 25 | 26 | PER_b_increment_per_sampling = 0.001 27 | 28 | absolute_error_upper = 1. 29 | 30 | def __init__(self, capacity): 31 | self.tree = SumTree(capacity) 32 | 33 | def add(self, experience): 34 | max_priority = np.max(self.tree.tree[-self.tree.capacity:]) 35 | 36 | if max_priority == 0: 37 | max_priority = self.absolute_error_upper 38 | 39 | self.tree.add(max_priority, experience) 40 | 41 | def sample(self, n): 42 | memory_b = [] 43 | b_idx, b_ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32) 44 | priority_segment = self.tree.total_priority / n 45 | self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling]) 46 | p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority 47 | max_weight = (p_min * n) ** (-self.PER_b) 48 | 49 | for i in range(n): 50 | a, b = priority_segment * i, priority_segment * (i + 1) 51 | value = np.random.uniform(a, b) 52 | 53 | index, priority, data = self.tree.get_leaf(value) 54 | 55 | sampling_probabilities = priority / self.tree.total_priority 56 | 57 | b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b)/ max_weight 58 | 59 | b_idx[i]= index 60 | 61 | experience = [data] 62 | 63 | memory_b.append(experience) 64 | 65 | return b_idx, memory_b, b_ISWeights 66 | 67 | def batch_update(self, tree_idx, abs_errors): 68 | abs_errors += self.PER_e 69 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper) 70 | ps = np.power(clipped_errors, self.PER_a) 71 | 72 | for ti, p in zip(tree_idx, ps): 73 | self.tree.update(ti, p) 74 | 75 | class SumTree(object): 76 | data_pointer = 0 77 | 78 | def __init__(self, capacity): 79 | self.capacity = capacity 80 | self.tree = np.zeros(2 * capacity - 1) 81 | self.data = np.zeros(capacity, dtype=object) 82 | 83 | def add(self, priority, data): 84 | tree_index = self.data_pointer + self.capacity - 1 85 | self.data[self.data_pointer] = data 86 | self.update (tree_index, priority) 87 | self.data_pointer += 1 88 | 89 | if self.data_pointer >= self.capacity: 90 | self.data_pointer = 0 91 | 92 | def update(self, tree_index, priority): 93 | change = priority - self.tree[tree_index] 94 | self.tree[tree_index] = priority 95 | 96 | while tree_index != 0: 97 | tree_index = (tree_index - 1) // 2 98 | self.tree[tree_index] += change 99 | 100 | def get_leaf(self, v): 101 | parent_index = 0 102 | 103 | while True: 104 | left_child_index = 2 * parent_index + 1 105 | right_child_index = left_child_index + 1 106 | 107 | if left_child_index >= len(self.tree): 108 | leaf_index = parent_index 109 | break 110 | 111 | else: 112 | if v <= self.tree[left_child_index]: 113 | parent_index = left_child_index 114 | 115 | else: 116 | v -= self.tree[left_child_index] 117 | parent_index = right_child_index 118 | 119 | data_index = leaf_index - self.capacity + 1 120 | 121 | return leaf_index, self.tree[leaf_index], self.data[data_index] 122 | 123 | @property 124 | def total_priority(self): 125 | return self.tree[0] 126 | -------------------------------------------------------------------------------- /OpenAI/CartPole-v0/model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/CartPole-v0/model.h5 
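As a stand-alone illustration of the `PrioritizedReplayBuffer` defined in `memory.py` above (in the project it is driven by `agents/DDQN.py`), a minimal usage sketch with dummy transitions and dummy TD errors might look like this:

```python
# Dummy-data usage sketch for memory.PrioritizedReplayBuffer (not project code).
import numpy as np
from memory import PrioritizedReplayBuffer

buffer = PrioritizedReplayBuffer(capacity=64)

# New experiences enter with the current maximum leaf priority, so every
# transition is sampled at least once before its priority is refined.
for _ in range(64):
    state, next_state = np.random.rand(4), np.random.rand(4)
    buffer.add((state, 0, 1.0, next_state, False))

# sample() returns SumTree leaf indices, the experiences (each wrapped as
# [transition], which is why DDQN.py indexes mini_batch[i][0]), and the
# importance-sampling weights that correct for the non-uniform sampling.
indices, batch, is_weights = buffer.sample(16)

# After computing TD errors for this batch, feed back their absolute values;
# batch_update() adds PER_e, clips them, and raises them to the PER_a power
# to form the new priorities.
td_errors = np.abs(np.random.randn(16))
buffer.batch_update(indices, td_errors)
```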
-------------------------------------------------------------------------------- /OpenAI/CartPole-v0/requirements.txt: -------------------------------------------------------------------------------- 1 | prompt_toolkit==2.0.10 2 | matplotlib 3 | numpy==1.16.4 4 | pandas 5 | opencv-python 6 | pillow 7 | imutils 8 | scikit-image 9 | tqdm 10 | tensorflow-gpu>=2.4.0 11 | Keras==2.3.1 12 | h5py 13 | ipykernel 14 | jupyter 15 | gym 16 | gym[atari] 17 | -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/HumanoidPyBulletEnv-v0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## HumanoidPyBulletEnv-v0\n", 8 | "\n", 9 | "In this notebook, you will implement a PPO agent with OpenAI Gym's HumanoidPyBulletEnv-v0 environment." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import math\n", 19 | "import random\n", 20 | "import sys\n", 21 | "import pathlib\n", 22 | "\n", 23 | "import gym\n", 24 | "import pybullet\n", 25 | "import pybulletgym\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "import torch\n", 29 | "import torch.nn as nn\n", 30 | "import torch.optim as optim\n", 31 | "import torch.nn.functional as F\n", 32 | "from torch.distributions import Normal" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "from IPython.display import clear_output\n", 42 | "import matplotlib.pyplot as plt\n", 43 | "%matplotlib inline" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "

## Use CUDA
" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "use_cuda = torch.cuda.is_available()\n", 60 | "device = torch.device(\"cuda\" if use_cuda else \"cpu\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "

## Create Environments
" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from multiprocessing_env import SubprocVecEnv\n", 77 | "import time\n", 78 | "\n", 79 | "LOAD_CHECKPOINT = True\n", 80 | "DO_TRAINING = False\n", 81 | "\n", 82 | "num_envs = 8\n", 83 | "env_name = \"HumanoidPyBulletEnv-v0\"\n", 84 | "\n", 85 | "hidden_size = 64\n", 86 | "\n", 87 | "\n", 88 | "policy_optimizer_lr = 0.00005\n", 89 | "policy_stopping_kl = 0.02\n", 90 | "\n", 91 | "value_optimizer_lr = 0.00015\n", 92 | "value_stopping_mse = 25\n", 93 | "\n", 94 | "entropy_loss_weight = 0.01\n", 95 | "\n", 96 | "num_steps = 1024\n", 97 | "mini_batch_size = 64\n", 98 | "ppo_epochs = 15\n", 99 | "threshold_reward = 6000\n", 100 | "\n", 101 | "\n", 102 | "ACTOR_CHECKPOINT_PATH = pathlib.Path(\"./pretrained/\" + \"actor_\" + env_name + \"_checkpoint.pt\")\n", 103 | "ACTOR_FINAL_PATH = pathlib.Path(\"./pretrained/\" + \"actor_\" + env_name + \"_final.pt\")\n", 104 | "\n", 105 | "CRITIC_CHECKPOINT_PATH = pathlib.Path(\"./pretrained/\" + \"critic_\" + env_name + \"_checkpoint.pt\")\n", 106 | "CRITIC_FINAL_PATH = pathlib.Path(\"./pretrained/\" + \"critic_\" + env_name + \"_final.pt\")\n", 107 | "\n", 108 | "def make_env():\n", 109 | " def _thunk():\n", 110 | " env = gym.make(env_name)\n", 111 | " return env\n", 112 | "\n", 113 | " return _thunk\n", 114 | "\n", 115 | "envs = [make_env() for i in range(num_envs)]\n", 116 | "envs = SubprocVecEnv(envs)\n", 117 | "\n", 118 | "env = gym.make(env_name)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "

## Neural Network
" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "class BaseModule(nn.Module):\n", 135 | " def __init__(self):\n", 136 | " super(BaseModule, self).__init__()\n", 137 | "\n", 138 | " def _build_network(self, num_inputs, num_outputs, hidden_size):\n", 139 | " \n", 140 | " if isinstance(hidden_size, int):\n", 141 | " return nn.Sequential(\n", 142 | " nn.Linear(num_inputs, hidden_size),\n", 143 | " nn.ReLU(),\n", 144 | " nn.Linear(hidden_size, num_outputs)\n", 145 | " )\n", 146 | " \n", 147 | " else:\n", 148 | " return nn.Sequential(\n", 149 | " nn.Linear(num_inputs, hidden_size[0]),\n", 150 | " nn.ReLU(),\n", 151 | " *self._build_hidden(hidden_size),\n", 152 | " nn.Linear(hidden_size[-1], num_outputs)\n", 153 | " ) \n", 154 | " \n", 155 | " def _build_hidden(self, hidden_size):\n", 156 | " hidden_layers = []\n", 157 | " for i in range(len(hidden_size)-1): \n", 158 | " hidden_layers.append(nn.Linear(hidden_size[i], hidden_size[i+1]))\n", 159 | " hidden_layers.append(nn.ReLU())\n", 160 | " return hidden_layers \n", 161 | " \n", 162 | "class Actor(BaseModule):\n", 163 | " def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):\n", 164 | " super(Actor, self).__init__()\n", 165 | " self.model = self._build_network(num_inputs, num_outputs, hidden_size) \n", 166 | " self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std, requires_grad=True)\n", 167 | " \n", 168 | " def forward(self, x):\n", 169 | " mu = self.model(x) \n", 170 | " std = self.log_std.exp().expand_as(mu)\n", 171 | " dist = Normal(mu, std)\n", 172 | " return dist\n", 173 | " \n", 174 | "class Critic(BaseModule):\n", 175 | " def __init__(self, num_inputs, hidden_size):\n", 176 | " super(Critic, self).__init__()\n", 177 | " self.model = self._build_network(num_inputs, 1, hidden_size)\n", 178 | " \n", 179 | " def forward(self, x):\n", 180 | " return self.model(x)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "def plot(frame_idx, rewards):\n", 190 | " clear_output(True)\n", 191 | " plt.figure(figsize=(20,5))\n", 192 | " plt.subplot(131)\n", 193 | " plt.title('frame %s. 
reward: %s' % (frame_idx, rewards[-1]))\n", 194 | " \n", 195 | " mean = []\n", 196 | " for x in range(len(rewards)):\n", 197 | " mean.append(np.array(rewards[:x]).mean())\n", 198 | " \n", 199 | " plt.plot(rewards, label=\"Reward\")\n", 200 | " plt.plot(mean, label=\"mean\")\n", 201 | " plt.legend()\n", 202 | " plt.show()\n", 203 | " \n", 204 | "def test_env(vis=False):\n", 205 | " state = env.reset()\n", 206 | " if vis: env.render()\n", 207 | " done = False\n", 208 | " total_reward = 0\n", 209 | " while not done:\n", 210 | " state = torch.FloatTensor(state).unsqueeze(0).to(device)\n", 211 | " dist = policy_model(state)\n", 212 | " next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])\n", 213 | " state = next_state\n", 214 | " if vis: env.render()\n", 215 | " total_reward += reward\n", 216 | " return total_reward" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.97):\n", 226 | " values = values + [next_value]\n", 227 | " gae = 0\n", 228 | " returns = []\n", 229 | " for step in reversed(range(len(rewards))):\n", 230 | " delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]\n", 231 | " gae = delta + gamma * tau * masks[step] * gae\n", 232 | " returns.insert(0, gae + values[step])\n", 233 | " return returns" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):\n", 243 | " batch_size = states.size(0)\n", 244 | " for _ in range(batch_size // mini_batch_size):\n", 245 | " rand_ids = np.random.randint(0, batch_size, mini_batch_size)\n", 246 | " yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]\n", 247 | "\n", 248 | "\n", 249 | "def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, \n", 250 | " clip_param=0.1, value_loss_coef=0.5, entropy_coef=0.01, max_grad_norm=0.5):\n", 251 | " # Policy\n", 252 | " for _ in range(ppo_epochs):\n", 253 | " for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):\n", 254 | " dist = policy_model(state)\n", 255 | " \n", 256 | " entropy = dist.entropy().mean()\n", 257 | " new_log_probs = dist.log_prob(action)\n", 258 | "\n", 259 | " ratio = (new_log_probs - old_log_probs).exp()\n", 260 | " surr1 = ratio * advantage\n", 261 | " surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage\n", 262 | "\n", 263 | " policy_loss = -torch.min(surr1, surr2).mean()\n", 264 | " entropy_loss = -entropy.mean() * entropy_loss_weight\n", 265 | " \n", 266 | " policy_optimizer.zero_grad()\n", 267 | " \n", 268 | " if max_grad_norm:\n", 269 | " torch.nn.utils.clip_grad_norm_(policy_model.model.parameters(), max_grad_norm)\n", 270 | " \n", 271 | " (policy_loss + entropy_loss).backward()\n", 272 | " policy_optimizer.step()\n", 273 | " \n", 274 | " with torch.no_grad():\n", 275 | " dist = policy_model(state) \n", 276 | " logpas_pred_all = dist.log_prob(action) \n", 277 | " kl = (new_log_probs - logpas_pred_all).mean()\n", 278 | " if kl.item() > policy_stopping_kl:\n", 279 | " break\n", 280 | " \n", 281 | " # Value \n", 282 | " for _ in range(ppo_epochs):\n", 283 | " for state, action, old_log_probs, return_, 
advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):\n", 284 | " value = value_model.model(state)\n", 285 | " \n", 286 | " value_loss = 0.5 * (return_ - value).pow(2).mean()\n", 287 | " \n", 288 | " value_optimizer.zero_grad()\n", 289 | " \n", 290 | " if max_grad_norm:\n", 291 | " torch.nn.utils.clip_grad_norm_(value_model.model.parameters(), max_grad_norm) \n", 292 | " \n", 293 | " value_loss.backward()\n", 294 | " value_optimizer.step()\n", 295 | " \n", 296 | " with torch.no_grad():\n", 297 | " values_pred_all = value_model.model(state)\n", 298 | " mse = 0.5 * (value - values_pred_all).pow(2).mean()\n", 299 | " if mse.item() > value_stopping_mse:\n", 300 | " break" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "def loadCheckpoint(filename, model):\n", 310 | " checkpoint = torch.load(filename)\n", 311 | " model.load_state_dict(checkpoint['model_state_dict'])\n", 312 | " model.to(device)\n", 313 | " model.eval()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "def saveCheckpoint(filename, epoch, model, optimizer):\n", 323 | " checkpoint = {\n", 324 | " 'epoch': epoch,\n", 325 | " 'model_state_dict': model.state_dict(),\n", 326 | " 'optimizer_state_dict': optimizer.state_dict(),\n", 327 | " }\n", 328 | "\n", 329 | " torch.save(checkpoint, filename)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "num_inputs = envs.observation_space.shape[0]\n", 339 | "num_outputs = envs.action_space.shape[0]\n", 340 | "\n", 341 | "policy_model = Actor(num_inputs, num_outputs, hidden_size).to(device)\n", 342 | "policy_optimizer = optim.Adam(policy_model.parameters(), lr=policy_optimizer_lr) \n", 343 | "\n", 344 | "value_model = Critic(num_inputs, hidden_size).to(device)\n", 345 | "value_optimizer = optim.Adam(value_model.parameters(), lr=value_optimizer_lr)\n", 346 | "\n", 347 | "if LOAD_CHECKPOINT: \n", 348 | " loadCheckpoint(ACTOR_CHECKPOINT_PATH, policy_model)\n", 349 | " loadCheckpoint(CRITIC_CHECKPOINT_PATH, value_model) \n", 350 | " \n", 351 | "print(policy_model)\n", 352 | "print(value_model)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "def train():\n", 362 | " frame_idx = 0\n", 363 | " train_epoch = 0\n", 364 | "\n", 365 | " test_rewards = []\n", 366 | " best_reward = None\n", 367 | "\n", 368 | " state = envs.reset()\n", 369 | " early_stop = False\n", 370 | "\n", 371 | " while not early_stop:\n", 372 | " state = envs.reset()\n", 373 | "\n", 374 | " log_probs = []\n", 375 | " values = []\n", 376 | " states = []\n", 377 | " actions = []\n", 378 | " rewards = []\n", 379 | " masks = []\n", 380 | "\n", 381 | " for _ in range(num_steps):\n", 382 | " state = torch.FloatTensor(state).to(device)\n", 383 | " dist = policy_model(state)\n", 384 | " value = value_model(state)\n", 385 | "\n", 386 | " action = dist.sample()\n", 387 | " next_state, reward, done, _ = envs.step(action.cpu().numpy())\n", 388 | "\n", 389 | " log_prob = dist.log_prob(action)\n", 390 | "\n", 391 | " log_probs.append(log_prob)\n", 392 | " values.append(value)\n", 393 | " rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))\n", 394 | " masks.append(torch.FloatTensor(1 - 
done).unsqueeze(1).to(device))\n", 395 | "\n", 396 | " states.append(state)\n", 397 | " actions.append(action)\n", 398 | "\n", 399 | " state = next_state\n", 400 | " frame_idx += 1\n", 401 | "\n", 402 | " next_state = torch.FloatTensor(next_state).to(device)\n", 403 | " next_value = value_model(next_state)\n", 404 | " returns = compute_gae(next_value, rewards, masks, values)\n", 405 | "\n", 406 | " returns = torch.cat(returns).detach()\n", 407 | " log_probs = torch.cat(log_probs).detach()\n", 408 | " values = torch.cat(values).detach()\n", 409 | " states = torch.cat(states)\n", 410 | " actions = torch.cat(actions)\n", 411 | " advantage = returns - values\n", 412 | "\n", 413 | " ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)\n", 414 | " train_epoch += 1 \n", 415 | "\n", 416 | " if train_epoch % 10 == 0:\n", 417 | " test_reward = np.mean([test_env() for _ in range(10)]) \n", 418 | " test_rewards.append(test_reward)\n", 419 | " plot(train_epoch, test_rewards)\n", 420 | "\n", 421 | " if best_reward is None or best_reward < test_reward: \n", 422 | " if best_reward is not None: \n", 423 | " saveCheckpoint(ACTOR_FINAL_PATH, train_epoch, policy_model, policy_optimizer)\n", 424 | " saveCheckpoint(CRITIC_FINAL_PATH, train_epoch, value_model, value_optimizer)\n", 425 | "\n", 426 | " best_reward = test_reward\n", 427 | "\n", 428 | " if test_reward > threshold_reward: \n", 429 | " early_stop = True \n", 430 | "\n", 431 | " if train_epoch % 100 == 0:\n", 432 | " saveCheckpoint(ACTOR_CHECKPOINT_PATH, train_epoch, policy_model, policy_optimizer)\n", 433 | " saveCheckpoint(CRITIC_CHECKPOINT_PATH, train_epoch, value_model, value_optimizer)\n", 434 | " \n", 435 | "if DO_TRAINING:\n", 436 | " train()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### Replay" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "env = gym.make(env_name)\n", 453 | "env.render(mode=\"human\")\n", 454 | "\n", 455 | "for i_episode in range(5):\n", 456 | " \n", 457 | " state = env.reset()\n", 458 | " done = False\n", 459 | " total_reward = 0\n", 460 | " \n", 461 | " frame_idx = 0\n", 462 | " \n", 463 | " distance = 3\n", 464 | " yaw = 0\n", 465 | " \n", 466 | " humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)\n", 467 | " pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) \n", 468 | " \n", 469 | " while not done:\n", 470 | " frame_idx += 1\n", 471 | " \n", 472 | " state = torch.FloatTensor(state).unsqueeze(0).to(device)\n", 473 | " dist = policy_model(state)\n", 474 | " action = dist.sample().cpu().numpy()[0]\n", 475 | " next_state, reward, done, _ = env.step(action)\n", 476 | " \n", 477 | " state = next_state\n", 478 | " total_reward += reward\n", 479 | " \n", 480 | " time.sleep(1/30)\n", 481 | " \n", 482 | " if frame_idx % 150 == 0:\n", 483 | " humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)\n", 484 | " pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) \n", 485 | " \n", 486 | " print(\"episode:\", i_episode, \"reward:\", total_reward, \"frames\", frame_idx)\n", 487 | "\n", 488 | "env.close()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [] 504 | } 505 
| ], 506 | "metadata": { 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "language": "python", 510 | "name": "python3" 511 | }, 512 | "language_info": { 513 | "codemirror_mode": { 514 | "name": "ipython", 515 | "version": 3 516 | }, 517 | "file_extension": ".py", 518 | "mimetype": "text/x-python", 519 | "name": "python", 520 | "nbconvert_exporter": "python", 521 | "pygments_lexer": "ipython3", 522 | "version": "3.8.8" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/README.md: -------------------------------------------------------------------------------- 1 | # Humanoid Walker Problem 2 | 3 | ### Getting Started 4 | The environment to the Humanoid is described [here](https://github.com/benelot/pybullet-gym/blob/master/README.md). 5 | 6 | ### Solution Video 7 | [![HumanoidPyBulletEnv-v0](http://img.youtube.com/vi/dxZP1icxsMw/0.jpg)](https://www.youtube.com/watch?v=dxZP1icxsMw "BipedalWalker-v3") 8 | 9 | The video shows in the first part the behaviour of the untrained agent and then in comparison the behaviour of the trained agent. 10 | 11 | ### Solution Info 12 | My learning algorithm is a [Proximal Policy Optimization(PPO)]([https://arxiv.org/pdf/1707.06347.pdf]). 13 | 14 | ### Instructions 15 | 16 | start Jupyter Notebook `HumanoidPyBulletEnv-v0.ipynb` and follow the instructions. 17 | -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/multiprocessing_env.py: -------------------------------------------------------------------------------- 1 | #This code is from openai baseline 2 | #https://github.com/openai/baselines/tree/master/baselines/common/vec_env 3 | 4 | import numpy as np 5 | from multiprocessing import Process, Pipe 6 | import gym 7 | import pybullet 8 | import pybulletgym 9 | 10 | def worker(remote, parent_remote, env_fn_wrapper): 11 | parent_remote.close() 12 | env = env_fn_wrapper.x() 13 | while True: 14 | cmd, data = remote.recv() 15 | if cmd == 'step': 16 | ob, reward, done, info = env.step(data) 17 | if done: 18 | ob = env.reset() 19 | remote.send((ob, reward, done, info)) 20 | elif cmd == 'reset': 21 | ob = env.reset() 22 | remote.send(ob) 23 | elif cmd == 'reset_task': 24 | ob = env.reset_task() 25 | remote.send(ob) 26 | elif cmd == 'close': 27 | remote.close() 28 | break 29 | elif cmd == 'get_spaces': 30 | remote.send((env.observation_space, env.action_space)) 31 | else: 32 | raise NotImplementedError 33 | 34 | class VecEnv(object): 35 | """ 36 | An abstract asynchronous, vectorized environment. 37 | """ 38 | def __init__(self, num_envs, observation_space, action_space): 39 | self.num_envs = num_envs 40 | self.observation_space = observation_space 41 | self.action_space = action_space 42 | 43 | def reset(self): 44 | """ 45 | Reset all the environments and return an array of 46 | observations, or a tuple of observation arrays. 47 | If step_async is still doing work, that work will 48 | be cancelled and step_wait() should not be called 49 | until step_async() is invoked again. 50 | """ 51 | pass 52 | 53 | def step_async(self, actions): 54 | """ 55 | Tell all the environments to start taking a step 56 | with the given actions. 57 | Call step_wait() to get the results of the step. 58 | You should not call this if a step_async run is 59 | already pending. 60 | """ 61 | pass 62 | 63 | def step_wait(self): 64 | """ 65 | Wait for the step taken with step_async(). 
66 | Returns (obs, rews, dones, infos): 67 | - obs: an array of observations, or a tuple of 68 | arrays of observations. 69 | - rews: an array of rewards 70 | - dones: an array of "episode done" booleans 71 | - infos: a sequence of info objects 72 | """ 73 | pass 74 | 75 | def close(self): 76 | """ 77 | Clean up the environments' resources. 78 | """ 79 | pass 80 | 81 | def step(self, actions): 82 | self.step_async(actions) 83 | return self.step_wait() 84 | 85 | 86 | class CloudpickleWrapper(object): 87 | """ 88 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 89 | """ 90 | def __init__(self, x): 91 | self.x = x 92 | def __getstate__(self): 93 | import cloudpickle 94 | return cloudpickle.dumps(self.x) 95 | def __setstate__(self, ob): 96 | import pickle 97 | self.x = pickle.loads(ob) 98 | 99 | 100 | class SubprocVecEnv(VecEnv): 101 | def __init__(self, env_fns, spaces=None): 102 | """ 103 | envs: list of gym environments to run in subprocesses 104 | """ 105 | self.waiting = False 106 | self.closed = False 107 | nenvs = len(env_fns) 108 | self.nenvs = nenvs 109 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 110 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 111 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 112 | for p in self.ps: 113 | p.daemon = True # if the main process crashes, we should not cause things to hang 114 | p.start() 115 | for remote in self.work_remotes: 116 | remote.close() 117 | 118 | self.remotes[0].send(('get_spaces', None)) 119 | observation_space, action_space = self.remotes[0].recv() 120 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 121 | 122 | def step_async(self, actions): 123 | for remote, action in zip(self.remotes, actions): 124 | remote.send(('step', action)) 125 | self.waiting = True 126 | 127 | def step_wait(self): 128 | results = [remote.recv() for remote in self.remotes] 129 | self.waiting = False 130 | obs, rews, dones, infos = zip(*results) 131 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 132 | 133 | def reset(self): 134 | for remote in self.remotes: 135 | remote.send(('reset', None)) 136 | return np.stack([remote.recv() for remote in self.remotes]) 137 | 138 | def reset_idx(self, idx): 139 | self.remotes[idx].send(('reset', None)) 140 | return self.remotes[idx].recv() 141 | 142 | def reset_task(self): 143 | for remote in self.remotes: 144 | remote.send(('reset_task', None)) 145 | return np.stack([remote.recv() for remote in self.remotes]) 146 | 147 | def close(self): 148 | if self.closed: 149 | return 150 | if self.waiting: 151 | for remote in self.remotes: 152 | remote.recv() 153 | for remote in self.remotes: 154 | remote.send(('close', None)) 155 | for p in self.ps: 156 | p.join() 157 | self.closed = True 158 | 159 | def __len__(self): 160 | return self.nenvs -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_checkpoint.pt -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_final.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_final.pt -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_checkpoint.pt -------------------------------------------------------------------------------- /OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_final.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_final.pt -------------------------------------------------------------------------------- /OpenAI/LunarLander-v2/README.md: -------------------------------------------------------------------------------- 1 | # LunarLander Problem 2 | 3 | ### Getting Started 4 | The environment to the LunarLanderContinuous is described [here](https://gym.openai.com/envs/LunarLanderContinuous-v2/). 5 | 6 | ### Solution Video 7 | [![LunarLanderContinuous-v2](http://img.youtube.com/vi/615X49z3u6o/0.jpg)](https://www.youtube.com/watch?v=615X49z3u6o "LunarLanderContinuous-v2") 8 | 9 | The video shows in the first part the behaviour of the untrained agent and then in comparison the behaviour of the trained agent. 10 | 11 | ### Solution Info 12 | My learning algorithm is a Deep Deterministic Policy Gradient. 13 | 14 | DDPG is an actor-critic algorithm and primarily uses two neural networks. 15 | One for the actor and one for the critic. These networks calculate action vectors for the current state and and generate a temporal-difference error signal each time step. 16 | 17 | DDPG uses a stochastic behavioral policy for good exploration and a deterministic target policy for estimating. 18 | 19 | The current state is the input of the actuator network and the output is a single value representing the action. The deterministic policy gradient theorem provides the update rule for the weights of the actor network. 20 | 21 | The critic's output is simply the estimated Q-value of the current state and the action given by the actor. The critic network is updated from the gradients obtained from the TD error signal. 22 | 23 | More general information about DDPG in [this](https://arxiv.org/pdf/1509.02971.pdf) paper. 24 | 25 | ### Instructions 26 | 27 | start Jupyter Notebook `LunarLanderContinuous-v2 (DDPG).ipynb` and follow the instructions. 
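To make the update rules described above concrete, here is a condensed, self-contained sketch of a single DDPG learning step on a random mini-batch: the critic is regressed onto the TD target r + gamma * Q'(s', mu'(s')), the actor is improved by maximising Q(s, mu(s)) (the deterministic policy gradient), and the target networks slowly track the learned ones through a soft update. The small networks, dimensions and hyperparameter values below are placeholders for illustration; the agent actually used in this folder (`ddpg_agent.py`) additionally samples mini-batches from a replay buffer, adds Ornstein-Uhlenbeck exploration noise and clips the critic gradients.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

state_dim, action_dim, gamma, tau = 8, 2, 0.99, 1e-3

def mlp(in_dim, out_dim, tanh=False):
    layers = [nn.Linear(in_dim, 64), nn.ReLU(), nn.Linear(64, out_dim)]
    if tanh:
        layers.append(nn.Tanh())  # actions are bounded to [-1, 1]
    return nn.Sequential(*layers)

actor, actor_target = mlp(state_dim, action_dim, tanh=True), mlp(state_dim, action_dim, tanh=True)
critic, critic_target = mlp(state_dim + action_dim, 1), mlp(state_dim + action_dim, 1)
actor_target.load_state_dict(actor.state_dict())      # start the targets as exact copies
critic_target.load_state_dict(critic.state_dict())
actor_opt = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_opt = torch.optim.Adam(critic.parameters(), lr=1e-3)

# Random tensors stand in for a mini-batch sampled from the replay buffer.
states, actions = torch.randn(64, state_dim), torch.randn(64, action_dim)
rewards, dones = torch.randn(64, 1), torch.zeros(64, 1)
next_states = torch.randn(64, state_dim)

# Critic update: regress Q(s, a) onto the TD target r + gamma * Q'(s', mu'(s')).
with torch.no_grad():
    next_actions = actor_target(next_states)
    q_next = critic_target(torch.cat([next_states, next_actions], dim=1))
    q_target = rewards + gamma * q_next * (1 - dones)
critic_loss = F.mse_loss(critic(torch.cat([states, actions], dim=1)), q_target)
critic_opt.zero_grad()
critic_loss.backward()
critic_opt.step()

# Actor update: follow the deterministic policy gradient by maximising Q(s, mu(s)).
actor_loss = -critic(torch.cat([states, actor(states)], dim=1)).mean()
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()

# Soft update: the targets slowly track the learned networks.
for target, local in ((actor_target, actor), (critic_target, critic)):
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
```

The slow-moving targets with a small tau are what keep the TD target stable between updates, which is the same role the `TAU` constant plays in `ddpg_agent.py`.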
-------------------------------------------------------------------------------- /OpenAI/LunarLander-v2/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/LunarLander-v2/checkpoint_actor.pth -------------------------------------------------------------------------------- /OpenAI/LunarLander-v2/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/LunarLander-v2/checkpoint_critic.pth -------------------------------------------------------------------------------- /OpenAI/LunarLander-v2/ddpg_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | from collections import namedtuple, deque 5 | 6 | from model import Actor, Critic 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | import torch.optim as optim 11 | 12 | BUFFER_SIZE = int(1e6) # replay buffer size 13 | BATCH_SIZE = 64 # minibatch size 14 | GAMMA = 0.99 # discount factor 15 | TAU = 1e-3 # for soft update of target parameters 16 | LR_ACTOR = 1e-4 # learning rate of the actor 17 | LR_CRITIC = 1e-3 # learning rate of the critic 18 | WEIGHT_DECAY = 0.0001 # L2 weight decay 19 | EPSILON = 1.0 20 | EPSILON_MIN = 0.1 21 | EPSILON_DECAY = 1e-6 22 | 23 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 24 | 25 | class Agent(): 26 | """Interacts with and learns from the environment.""" 27 | 28 | def __init__(self, state_size, action_size, random_seed): 29 | """Initialize an Agent object. 
30 | 31 | Params 32 | ====== 33 | state_size (int): dimension of each state 34 | action_size (int): dimension of each action 35 | random_seed (int): random seed 36 | """ 37 | self.state_size = state_size 38 | self.action_size = action_size 39 | self.seed = random.seed(random_seed) 40 | self.epsilon = EPSILON 41 | 42 | # Actor Network (w/ Target Network) 43 | self.actor_local = Actor(state_size, action_size, random_seed).to(device) 44 | self.actor_target = Actor(state_size, action_size, random_seed).to(device) 45 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) 46 | 47 | # Critic Network (w/ Target Network) 48 | self.critic_local = Critic(state_size, action_size, random_seed).to(device) 49 | self.critic_target = Critic(state_size, action_size, random_seed).to(device) 50 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) 51 | 52 | # Noise process 53 | self.noise = OUNoise(action_size, random_seed) 54 | 55 | # Replay memory 56 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) 57 | 58 | # Make sure target is with the same weight as the source 59 | self.hard_update(self.actor_target, self.actor_local) 60 | self.hard_update(self.critic_target, self.critic_local) 61 | 62 | def step(self, state, action, reward, next_state, done, timestep): 63 | """Save experience in replay memory, and use random sample from buffer to learn.""" 64 | # Save experience / reward 65 | self.memory.add(state, action, reward, next_state, done) 66 | 67 | # Learn, if enough samples are available in memory 68 | if len(self.memory) > BATCH_SIZE and timestep % 20 == 0: 69 | for _ in range(10): 70 | experiences = self.memory.sample() 71 | self.learn(experiences, GAMMA) 72 | 73 | def act(self, state, add_noise=True): 74 | """Returns actions for given state as per current policy.""" 75 | 76 | state = torch.from_numpy(state).float().to(device) 77 | 78 | self.actor_local.eval() 79 | with torch.no_grad(): 80 | action = self.actor_local(state).cpu().data.numpy() 81 | self.actor_local.train() 82 | 83 | if add_noise: 84 | action += self.epsilon * self.noise.sample() 85 | 86 | return action 87 | 88 | def reset(self): 89 | self.noise.reset() 90 | 91 | def learn(self, experiences, gamma): 92 | """Update policy and value parameters using given batch of experience tuples. 93 | Q_targets = r + ? 
* critic_target(next_state, actor_target(next_state)) 94 | where: 95 | actor_target(state) -> action 96 | critic_target(state, action) -> Q-value 97 | 98 | Params 99 | ====== 100 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 101 | gamma (float): discount factor 102 | """ 103 | states, actions, rewards, next_states, dones = experiences 104 | 105 | # ---------------------------- update critic ---------------------------- # 106 | # Get predicted next-state actions and Q values from target models 107 | actions_next = self.actor_target(next_states) 108 | Q_targets_next = self.critic_target(next_states, actions_next) 109 | 110 | # Compute Q targets for current states (y_i) 111 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 112 | 113 | # Compute critic loss 114 | Q_expected = self.critic_local(states, actions) 115 | critic_loss = F.mse_loss(Q_expected, Q_targets) 116 | 117 | # Minimize the loss 118 | self.critic_optimizer.zero_grad() 119 | critic_loss.backward() 120 | torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) 121 | self.critic_optimizer.step() 122 | 123 | # ---------------------------- update actor ---------------------------- # 124 | # Compute actor loss 125 | actions_pred = self.actor_local(states) 126 | actor_loss = -self.critic_local(states, actions_pred).mean() 127 | 128 | # Minimize the loss 129 | self.actor_optimizer.zero_grad() 130 | actor_loss.backward() 131 | self.actor_optimizer.step() 132 | 133 | # ----------------------- update target networks ----------------------- # 134 | self.soft_update(self.critic_local, self.critic_target, TAU) 135 | self.soft_update(self.actor_local, self.actor_target, TAU) 136 | 137 | # ---------------------------- update noise ---------------------------- # 138 | if self.epsilon - EPSILON_DECAY > EPSILON_MIN: 139 | self.epsilon -= EPSILON_DECAY 140 | 141 | self.noise.reset() 142 | 143 | def soft_update(self, local_model, target_model, tau): 144 | """Soft update model parameters. 
145 | ?_target = t*?_local + (1 - t)*?_target 146 | 147 | Params 148 | ====== 149 | local_model: PyTorch model (weights will be copied from) 150 | target_model: PyTorch model (weights will be copied to) 151 | tau (float): interpolation parameter 152 | """ 153 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 154 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 155 | 156 | def hard_update(self, target, source): 157 | for target_param, param in zip(target.parameters(), source.parameters()): 158 | target_param.data.copy_(param.data) 159 | 160 | class OUNoise: 161 | """Ornstein-Uhlenbeck process.""" 162 | 163 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.3): 164 | """Initialize parameters and noise process.""" 165 | self.mu = mu * np.ones(size) 166 | self.theta = theta 167 | self.sigma = sigma 168 | self.seed = random.seed(seed) 169 | self.reset() 170 | 171 | def reset(self): 172 | """Reset the internal state (= noise) to mean (mu).""" 173 | self.state = copy.copy(self.mu) 174 | 175 | def sample(self): 176 | """Update internal state and return it as a noise sample.""" 177 | x = self.state 178 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))]) 179 | self.state = x + dx 180 | return self.state 181 | 182 | class ReplayBuffer: 183 | """Fixed-size buffer to store experience tuples.""" 184 | 185 | def __init__(self, action_size, buffer_size, batch_size, seed): 186 | """Initialize a ReplayBuffer object. 187 | Params 188 | ====== 189 | buffer_size (int): maximum size of buffer 190 | batch_size (int): size of each training batch 191 | """ 192 | self.action_size = action_size 193 | self.memory = deque(maxlen=buffer_size) # internal memory (deque) 194 | self.batch_size = batch_size 195 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 196 | self.seed = random.seed(seed) 197 | 198 | def add(self, state, action, reward, next_state, done): 199 | """Add a new experience to memory.""" 200 | e = self.experience(state, action, reward, next_state, done) 201 | self.memory.append(e) 202 | 203 | def sample(self): 204 | """Randomly sample a batch of experiences from memory.""" 205 | experiences = random.sample(self.memory, k=self.batch_size) 206 | 207 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 208 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device) 209 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 210 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 211 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 212 | 213 | return (states, actions, rewards, next_states, dones) 214 | 215 | def __len__(self): 216 | """Return the current size of internal memory.""" 217 | return len(self.memory) -------------------------------------------------------------------------------- /OpenAI/LunarLander-v2/model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | def hidden_init(layer): 8 | fan_in = layer.weight.data.size()[0] 9 | lim = 1. 
/ np.sqrt(fan_in) 10 | return (-lim, lim) 11 | 12 | class Actor(nn.Module): 13 | """Actor (Policy) Model.""" 14 | 15 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64): 16 | """Initialize parameters and build model. 17 | Params 18 | ====== 19 | state_size (int): Dimension of each state 20 | action_size (int): Dimension of each action 21 | seed (int): Random seed 22 | fc1_units (int): Number of nodes in first hidden layer 23 | fc2_units (int): Number of nodes in second hidden layer 24 | """ 25 | super(Actor, self).__init__() 26 | self.seed = torch.manual_seed(seed) 27 | 28 | self.fc1 = nn.Linear(state_size, fc1_units) 29 | self.fc2 = nn.Linear(fc1_units, fc2_units) 30 | self.fc3 = nn.Linear(fc2_units, action_size) 31 | self.reset_parameters() 32 | 33 | def reset_parameters(self): 34 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 35 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 36 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 37 | 38 | def forward(self, state): 39 | """Build an actor (policy) network that maps states -> actions.""" 40 | x = state 41 | x = F.relu(self.fc1(x)) 42 | x = F.relu(self.fc2(x)) 43 | return torch.tanh(self.fc3(x)) 44 | 45 | class Critic(nn.Module): 46 | """Critic (Value) Model.""" 47 | 48 | def __init__(self, state_size, action_size, seed, fcs1_units=128, fc2_units=64): 49 | """Initialize parameters and build model. 50 | Params 51 | ====== 52 | state_size (int): Dimension of each state 53 | action_size (int): Dimension of each action 54 | seed (int): Random seed 55 | fcs1_units (int): Number of nodes in the first hidden layer 56 | fc2_units (int): Number of nodes in the second hidden layer 57 | fc3_units (int): Number of nodes in the third hidden layer 58 | """ 59 | super(Critic, self).__init__() 60 | self.seed = torch.manual_seed(seed) 61 | self.bn0 = nn.BatchNorm1d(state_size) 62 | self.fcs1 = nn.Linear(state_size, fcs1_units) 63 | self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units) 64 | self.fc3 = nn.Linear(fc2_units, 1) 65 | self.reset_parameters() 66 | 67 | def reset_parameters(self): 68 | self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1)) 69 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 70 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 71 | 72 | def forward(self, state, action): 73 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 74 | state = self.bn0(state) 75 | xs = F.relu(self.fcs1(state)) 76 | x = torch.cat((xs, action), dim=1) 77 | x = F.relu(self.fc2(x)) 78 | return self.fc3(x) -------------------------------------------------------------------------------- /OpenAI/MountainCarContinuous-v0/Agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | from collections import namedtuple, deque 5 | 6 | from Model import Actor, Critic 7 | from Noise import OUNoise 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | 13 | BUFFER_SIZE = int(1e6) # replay buffer size 14 | BATCH_SIZE = 64 # minibatch size 15 | GAMMA = 0.99 # discount factor 16 | TAU = 1e-3 # for soft update of target parameters 17 | LR_ACTOR = 1e-2 # learning rate of the actor 18 | LR_CRITIC = 5e-3 # learning rate of the critic 19 | WEIGHT_DECAY = 0 # L2 weight decay 20 | EPSILON_MAX = 1.0 21 | EPSILON_MIN = 0.1 22 | EPSILON_DECAY = 1e-6 23 | LEARN_START = 20000 24 | UPDATE_EVERY = 1 25 | UPDATES_PER_STEP = 1 26 | 27 | device = torch.device("cuda:0" if 
torch.cuda.is_available() else "cpu") 28 | 29 | class Agent(): 30 | """Interacts with and learns from the environment.""" 31 | 32 | def __init__(self, state_size, action_size, random_seed): 33 | """Initialize an Agent object. 34 | 35 | Params 36 | ====== 37 | state_size (int): dimension of each state 38 | action_size (int): dimension of each action 39 | random_seed (int): random seed 40 | """ 41 | self.state_size = state_size 42 | self.action_size = action_size 43 | self.seed = random.seed(random_seed) 44 | self.epsilon = EPSILON_MAX 45 | 46 | # Actor Network (w/ Target Network) 47 | self.actor_local = Actor(state_size, action_size, random_seed).to(device) 48 | self.actor_target = Actor(state_size, action_size, random_seed).to(device) 49 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) 50 | 51 | # Critic Network (w/ Target Network) 52 | self.critic_local = Critic(state_size, action_size, random_seed).to(device) 53 | self.critic_target = Critic(state_size, action_size, random_seed).to(device) 54 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) 55 | 56 | # Noise process 57 | self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2) 58 | 59 | # Replay memory 60 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) 61 | 62 | # Make sure target is with the same weight as the source 63 | self.hard_update(self.actor_target, self.actor_local) 64 | self.hard_update(self.critic_target, self.critic_local) 65 | 66 | self.t_step = 0 67 | 68 | def step(self, state, action, reward, next_state, done, timestep): 69 | """Save experience in replay memory, and use random sample from buffer to learn.""" 70 | # Save experience / reward 71 | self.memory.add(state, action, reward, next_state, done) 72 | 73 | if len(self.memory) > LEARN_START: 74 | # Learn every UPDATE_EVERY time steps. 75 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 76 | if self.t_step == 0: 77 | # Learn, if enough samples are available in memory 78 | if len(self.memory) > BATCH_SIZE: 79 | for _ in range(UPDATES_PER_STEP): 80 | experiences = self.memory.sample() 81 | self.learn(experiences, GAMMA) 82 | 83 | def act(self, state, add_noise=True): 84 | """Returns actions for given state as per current policy.""" 85 | 86 | state = torch.from_numpy(state).float().to(device) 87 | 88 | self.actor_local.eval() 89 | with torch.no_grad(): 90 | action = self.actor_local(state).cpu().data.numpy() 91 | 92 | self.actor_local.train() 93 | 94 | if add_noise: 95 | action += self.epsilon * self.noise.sample() 96 | 97 | return np.clip(action, -1, 1) 98 | 99 | def reset(self): 100 | self.noise.reset() 101 | 102 | def learn(self, experiences, gamma): 103 | """Update policy and value parameters using given batch of experience tuples. 104 | Q_targets = r + ? 
* critic_target(next_state, actor_target(next_state)) 105 | where: 106 | actor_target(state) -> action 107 | critic_target(state, action) -> Q-value 108 | 109 | Params 110 | ====== 111 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 112 | gamma (float): discount factor 113 | """ 114 | states, actions, rewards, next_states, dones = experiences 115 | 116 | # ---------------------------- update critic ---------------------------- # 117 | # Get predicted next-state actions and Q values from target models 118 | actions_next = self.actor_target(next_states) 119 | Q_targets_next = self.critic_target(next_states, actions_next) 120 | 121 | # Compute Q targets for current states (y_i) 122 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 123 | 124 | # Compute critic loss 125 | Q_expected = self.critic_local(states, actions) 126 | critic_loss = F.mse_loss(Q_expected, Q_targets) 127 | 128 | # Minimize the loss 129 | self.critic_optimizer.zero_grad() 130 | critic_loss.backward() 131 | #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) 132 | self.critic_optimizer.step() 133 | 134 | # ---------------------------- update actor ---------------------------- # 135 | # Compute actor loss 136 | actions_pred = self.actor_local(states) 137 | actor_loss = -self.critic_local(states, actions_pred).mean() 138 | 139 | # Minimize the loss 140 | self.actor_optimizer.zero_grad() 141 | actor_loss.backward() 142 | self.actor_optimizer.step() 143 | 144 | # ----------------------- update target networks ----------------------- # 145 | self.soft_update(self.critic_local, self.critic_target, TAU) 146 | self.soft_update(self.actor_local, self.actor_target, TAU) 147 | 148 | # ---------------------------- update noise ---------------------------- # 149 | if self.epsilon - EPSILON_DECAY > EPSILON_MIN: 150 | self.epsilon -= EPSILON_DECAY 151 | else: 152 | self.epsilon = EPSILON_MIN 153 | 154 | self.noise.reset() 155 | 156 | def soft_update(self, local_model, target_model, tau): 157 | """Soft update model parameters. 158 | ?_target = t*?_local + (1 - t)*?_target 159 | 160 | Params 161 | ====== 162 | local_model: PyTorch model (weights will be copied from) 163 | target_model: PyTorch model (weights will be copied to) 164 | tau (float): interpolation parameter 165 | """ 166 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 167 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 168 | 169 | def hard_update(self, target, source): 170 | for target_param, param in zip(target.parameters(), source.parameters()): 171 | target_param.data.copy_(param.data) 172 | 173 | class ReplayBuffer: 174 | """Fixed-size buffer to store experience tuples.""" 175 | 176 | def __init__(self, action_size, buffer_size, batch_size, seed): 177 | """Initialize a ReplayBuffer object. 
178 | Params 179 | ====== 180 | buffer_size (int): maximum size of buffer 181 | batch_size (int): size of each training batch 182 | """ 183 | self.action_size = action_size 184 | self.buffer_size = buffer_size 185 | self.batch_size = batch_size 186 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 187 | self.seed = random.seed(seed) 188 | 189 | self.reset() 190 | 191 | def add(self, state, action, reward, next_state, done): 192 | """Add a new experience to memory.""" 193 | e = self.experience(state, action, reward, next_state, done) 194 | self.memory.append(e) 195 | 196 | def reset(self): 197 | self.memory = deque(maxlen=self.buffer_size) 198 | 199 | def sample(self): 200 | """Randomly sample a batch of experiences from memory.""" 201 | experiences = random.sample(self.memory, k=self.batch_size) 202 | 203 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 204 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device) 205 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 206 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 207 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 208 | 209 | return states, actions, rewards, next_states, dones 210 | 211 | def __len__(self): 212 | """Return the current size of internal memory.""" 213 | return len(self.memory) -------------------------------------------------------------------------------- /OpenAI/MountainCarContinuous-v0/Model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | def hidden_init(layer): 8 | fan_in = layer.weight.data.size()[0] 9 | lim = 1. / np.sqrt(fan_in) 10 | return (-lim, lim) 11 | 12 | 13 | class Actor(nn.Module): 14 | """Actor (Policy) Model.""" 15 | 16 | def __init__(self, state_size, action_size, seed, fc1_units=5, fc2_units=5): 17 | """Initialize parameters and build model. 
18 | Params 19 | ====== 20 | state_size (int): Dimension of each state 21 | action_size (int): Dimension of each action 22 | seed (int): Random seed 23 | fc1_units (int): Number of nodes in first hidden layer 24 | fc2_units (int): Number of nodes in second hidden layer 25 | """ 26 | super(Actor, self).__init__() 27 | self.seed = torch.manual_seed(seed) 28 | 29 | self.fc1 = nn.Linear(state_size, fc1_units) 30 | self.ln1 = nn.LayerNorm(fc1_units) 31 | self.fc2 = nn.Linear(fc1_units, fc2_units) 32 | self.ln2 = nn.LayerNorm(fc2_units) 33 | self.fc3 = nn.Linear(fc2_units, action_size) 34 | self.reset_parameters() 35 | 36 | def reset_parameters(self): 37 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 38 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 39 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 40 | 41 | def forward(self, state): 42 | """Build an actor (policy) network that maps states -> actions.""" 43 | x = state 44 | x = self.fc1(x) 45 | x = self.ln1(x) 46 | x = F.relu(x) 47 | x = self.fc2(x) 48 | x = self.ln2(x) 49 | x = F.relu(x) 50 | x = self.fc3(x) 51 | return torch.tanh(x) 52 | 53 | 54 | class Critic(nn.Module): 55 | """Critic (Value) Model.""" 56 | 57 | def __init__(self, state_size, action_size, seed, fc1_units=20, fc2_units=10): 58 | """Initialize parameters and build model. 59 | Params 60 | ====== 61 | state_size (int): Dimension of each state 62 | action_size (int): Dimension of each action 63 | seed (int): Random seed 64 | fcs1_units (int): Number of nodes in the first hidden layer 65 | fc2_units (int): Number of nodes in the second hidden layer 66 | fc3_units (int): Number of nodes in the third hidden layer 67 | """ 68 | super(Critic, self).__init__() 69 | self.seed = torch.manual_seed(seed) 70 | self.fc1 = nn.Linear(state_size, fc1_units) 71 | self.bn1 = nn.BatchNorm1d(fc1_units) 72 | self.fc2 = nn.Linear(fc1_units+action_size, fc2_units) 73 | self.fc3 = nn.Linear(fc2_units, 1) 74 | self.reset_parameters() 75 | 76 | def reset_parameters(self): 77 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 78 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 79 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 80 | 81 | def forward(self, state, action): 82 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 83 | xs = self.fc1(state) 84 | xs = self.bn1(xs) 85 | xs = F.leaky_relu(xs) 86 | x = torch.cat((xs, action), dim=1) 87 | x = self.fc2(x) 88 | x = F.leaky_relu(x) 89 | return self.fc3(x) 90 | -------------------------------------------------------------------------------- /OpenAI/MountainCarContinuous-v0/Noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | 5 | class OUNoise: 6 | """Ornstein-Uhlenbeck process.""" 7 | 8 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2): 9 | """Initialize parameters and noise process.""" 10 | self.mu = mu * np.ones(size) 11 | self.theta = theta 12 | self.sigma = sigma 13 | self.seed = random.seed(seed) 14 | self.reset() 15 | 16 | def reset(self): 17 | """Reset the internal state (= noise) to mean (mu).""" 18 | self.state = copy.copy(self.mu) 19 | 20 | def sample(self): 21 | """Update internal state and return it as a noise sample.""" 22 | x = self.state 23 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))]) 24 | self.state = x + dx 25 | return self.state 26 | -------------------------------------------------------------------------------- 
/OpenAI/MountainCarContinuous-v0/README.md: -------------------------------------------------------------------------------- 1 | # MountainCarContinuous Problem 2 | 3 | ### Getting Started 4 | The environment to the MountainCarContinuous is described [here](https://github.com/openai/gym/wiki/MountainCarContinuous-v0). 5 | 6 | ### Solution Video 7 | [![LunarLanderContinuous-v2](http://img.youtube.com/vi/RGKRfxfEFEA/0.jpg)](https://www.youtube.com/watch?v=RGKRfxfEFEA "MountainCarContinuous-v0") 8 | 9 | The video shows the solution of the environment after 32 episodes. 10 | 11 | ### Solution Info 12 | My learning algorithm is a Deep Deterministic Policy Gradient. 13 | 14 | DDPG is an actor-critic algorithm and primarily uses two neural networks. 15 | One for the actor and one for the critic. These networks calculate action vectors for the current state and and generate a temporal-difference error signal each time step. 16 | 17 | DDPG uses a stochastic behavioral policy for good exploration and a deterministic target policy for estimating. 18 | 19 | The current state is the input of the actuator network and the output is a single value representing the action. The deterministic policy gradient theorem provides the update rule for the weights of the actor network. 20 | 21 | The critic's output is simply the estimated Q-value of the current state and the action given by the actor. The critic network is updated from the gradients obtained from the TD error signal. 22 | 23 | More general information about DDPG in [this](https://arxiv.org/pdf/1509.02971.pdf) paper. 24 | 25 | ### Instructions 26 | 27 | start Jupyter Notebook `MountainCarContinuous-v0 (DDPG).ipynb` and follow the instructions. -------------------------------------------------------------------------------- /OpenAI/MountainCarContinuous-v0/checkpoint_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/MountainCarContinuous-v0/checkpoint_actor.pth -------------------------------------------------------------------------------- /OpenAI/MountainCarContinuous-v0/checkpoint_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/MountainCarContinuous-v0/checkpoint_critic.pth -------------------------------------------------------------------------------- /OpenAI/Taxi-v2/README.md: -------------------------------------------------------------------------------- 1 | # Taxi Problem 2 | 3 | ### Getting Started 4 | 5 | Read the description of the environment in subsection 3.1 of [this paper](https://arxiv.org/pdf/cs/9905014.pdf). You can verify that the description in the paper matches the OpenAI Gym environment by peeking at the code [here](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py). 6 | 7 | 8 | ### Instructions 9 | 10 | The repository contains three files: 11 | - `agent.py`: Develop your reinforcement learning agent here. This is the only file that you should modify. 12 | - `monitor.py`: The `interact` function tests how well your agent learns from interaction with the environment. 13 | - `main.py`: Run this file in the terminal to check the performance of your agent. 
14 | 15 | Begin by running the following command in the terminal: 16 | ``` 17 | python main.py 18 | ``` 19 | 20 | When you run `main.py`, the agent that you specify in `agent.py` interacts with the environment for 20,000 episodes. The details of the interaction are specified in `monitor.py`, which returns two variables: `avg_rewards` and `best_avg_reward`. 21 | - `avg_rewards` is a deque where `avg_rewards[i]` is the average (undiscounted) return collected by the agent from episodes `i+1` to episode `i+100`, inclusive. So, for instance, `avg_rewards[0]` is the average return collected by the agent over the first 100 episodes. 22 | - `best_avg_reward` is the largest entry in `avg_rewards`. This is the final score that you should use when determining how well your agent performed in the task. 23 | 24 | Your assignment is to modify the `agents.py` file to improve the agent's performance. 25 | - Use the `__init__()` method to define any needed instance variables. Currently, we define the number of actions available to the agent (`nA`) and initialize the action values (`Q`) to an empty dictionary of arrays. Feel free to add more instance variables; for example, you may find it useful to define the value of epsilon if the agent uses an epsilon-greedy policy for selecting actions. 26 | - The `select_action()` method accepts the environment state as input and returns the agent's choice of action. The default code that we have provided randomly selects an action. 27 | - The `step()` method accepts a (`state`, `action`, `reward`, `next_state`) tuple as input, along with the `done` variable, which is `True` if the episode has ended. The default code (which you should certainly change!) increments the action value of the previous state-action pair by 1. You should change this method to use the sampled tuple of experience to update the agent's knowledge of the problem. 28 | 29 | Once you have modified the function, you need only run `python main.py` to test your new agent. 30 | 31 | OpenAI Gym [defines "solving"](https://gym.openai.com/envs/Taxi-v1/) this task as getting average return of 9.7 over 100 consecutive trials. 32 | -------------------------------------------------------------------------------- /OpenAI/Taxi-v2/agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | class Agent: 5 | 6 | def __init__(self, nA=6): 7 | """ Initialize agent. 8 | 9 | Params 10 | ====== 11 | - nA: number of actions available to the agent 12 | """ 13 | self.i_episode = 1 14 | self.alpha = .01 15 | self.gamma = 1.0 16 | self.nA = nA 17 | self.Q = defaultdict(lambda: np.zeros(self.nA)) 18 | 19 | def epsilon_greedy_probs(self, Q_s, eps=None): 20 | """ obtains the action probabilities corresponding to epsilon-greedy policy """ 21 | epsilon = 1.0 / self.i_episode 22 | if eps is not None: 23 | epsilon = eps 24 | policy_s = np.ones(self.nA) * epsilon / self.nA 25 | policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA) 26 | return policy_s 27 | 28 | def update_Q(self, Qsa, Qsa_next, reward, alpha, gamma): 29 | """ updates the action-value function estimate using the most recent time step """ 30 | return Qsa + (alpha * (reward + (gamma * Qsa_next) - Qsa)) 31 | 32 | def select_action(self, state): 33 | """ Given the state, select an action. 
34 | 35 | Params 36 | ====== 37 | - state: the current state of the environment 38 | 39 | Returns 40 | ======= 41 | - action: an integer, compatible with the task's action space 42 | """ 43 | # get epsilon-greedy action probabilities 44 | policy_s = self.epsilon_greedy_probs(self.Q[state]) 45 | 46 | # pick next action A 47 | return np.random.choice(np.arange(self.nA), p=policy_s) 48 | 49 | def step(self, state, action, reward, next_state, done): 50 | """ Update the agent's knowledge, using the most recently sampled tuple. 51 | 52 | Params 53 | ====== 54 | - state: the previous state of the environment 55 | - action: the agent's previous choice of action 56 | - reward: last reward received 57 | - next_state: the current state of the environment 58 | - done: whether the episode is complete (True or False) 59 | """ 60 | # update Q 61 | self.Q[state][action] = self.update_Q(self.Q[state][action], np.max(self.Q[next_state]), reward, self.alpha, 62 | self.gamma) 63 | 64 | self.i_episode += 1 65 | -------------------------------------------------------------------------------- /OpenAI/Taxi-v2/main.py: -------------------------------------------------------------------------------- 1 | from agent import Agent 2 | from monitor import interact 3 | import gym 4 | import numpy as np 5 | 6 | env = gym.make('Taxi-v2') 7 | agent = Agent() 8 | avg_rewards, best_avg_reward = interact(env, agent) -------------------------------------------------------------------------------- /OpenAI/Taxi-v2/monitor.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import sys 3 | import math 4 | import numpy as np 5 | 6 | def interact(env, agent, num_episodes=20000, window=100): 7 | """ Monitor agent's performance. 
8 | 9 | Params 10 | ====== 11 | - env: instance of OpenAI Gym's Taxi-v1 environment 12 | - agent: instance of class Agent (see Agent.py for details) 13 | - num_episodes: number of episodes of agent-environment interaction 14 | - window: number of episodes to consider when calculating average rewards 15 | 16 | Returns 17 | ======= 18 | - avg_rewards: deque containing average rewards 19 | - best_avg_reward: largest value in the avg_rewards deque 20 | """ 21 | # initialize average rewards 22 | avg_rewards = deque(maxlen=num_episodes) 23 | # initialize best average reward 24 | best_avg_reward = -math.inf 25 | # initialize monitor for most recent rewards 26 | samp_rewards = deque(maxlen=window) 27 | # for each episode 28 | for i_episode in range(1, num_episodes+1): 29 | # begin the episode 30 | state = env.reset() 31 | # initialize the sampled reward 32 | samp_reward = 0 33 | while True: 34 | # agent selects an action 35 | action = agent.select_action(state) 36 | # agent performs the selected action 37 | next_state, reward, done, _ = env.step(action) 38 | # agent performs internal updates based on sampled experience 39 | agent.step(state, action, reward, next_state, done) 40 | # update the sampled reward 41 | samp_reward += reward 42 | # update the state (s <- s') to next time step 43 | state = next_state 44 | if done: 45 | # save final sampled reward 46 | samp_rewards.append(samp_reward) 47 | break 48 | if (i_episode >= 100): 49 | # get average reward from last 100 episodes 50 | avg_reward = np.mean(samp_rewards) 51 | # append to deque 52 | avg_rewards.append(avg_reward) 53 | # update best average reward 54 | if avg_reward > best_avg_reward: 55 | best_avg_reward = avg_reward 56 | # monitor progress 57 | print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="") 58 | sys.stdout.flush() 59 | # check if task is solved (according to OpenAI Gym) 60 | if best_avg_reward >= 9.7: 61 | print('\nEnvironment solved in {} episodes.'.format(i_episode), end="") 62 | break 63 | if i_episode == num_episodes: print('\n') 64 | return avg_rewards, best_avg_reward -------------------------------------------------------------------------------- /OpenAI/Taxi-v3/Reinforcement Learning.ppsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/OpenAI/Taxi-v3/Reinforcement Learning.ppsx -------------------------------------------------------------------------------- /OpenAI/Taxi-v3/Taxi-v3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "commercial-proportion", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Import dependencies" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "favorite-cathedral", 15 | "metadata": { 16 | "scrolled": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import gym\n", 21 | "import random\n", 22 | "import numpy as np\n", 23 | "import time\n", 24 | "from IPython import display\n", 25 | "import matplotlib.pyplot as plt\n", 26 | "from collections import defaultdict\n", 27 | "import pylab as pl\n", 28 | "\n", 29 | "%matplotlib inline" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "reduced-prime", 35 | "metadata": {}, 36 | "source": [ 37 | "### Load Environment" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "id": 
"three-flood", 44 | "metadata": { 45 | "scrolled": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "env = gym.make(\"Taxi-v3\")" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "stupid-thailand", 55 | "metadata": {}, 56 | "source": [ 57 | "### Inspect Environment" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "id": "integral-sharing", 64 | "metadata": { 65 | "scrolled": true 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "Action size 6\n", 73 | "State size 500\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "# There are 6 discrete deterministic actions:\n", 79 | "# - 0: move south\n", 80 | "# - 1: move north\n", 81 | "# - 2: move east\n", 82 | "# - 3: move west\n", 83 | "# - 4: pickup passenger\n", 84 | "# - 5: drop off passenger\n", 85 | "\n", 86 | "action_size = env.action_space.n\n", 87 | "print(\"Action size \", action_size)\n", 88 | "\n", 89 | "# There are 500 discrete states since there are 25 taxi positions\n", 90 | "# 5 possible locations of the passenger (including the case when the passenger is in the taxi)\n", 91 | "# and 4 destination locations.\n", 92 | "# Start-Position is random\n", 93 | "state_size = env.observation_space.n\n", 94 | "print(\"State size \", state_size)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "id": "rocky-seventh", 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "+---------+\n", 108 | "|\u001b[35mR\u001b[0m: | : :G|\n", 109 | "| : | : : |\n", 110 | "|\u001b[43m \u001b[0m: : : : |\n", 111 | "| | : | : |\n", 112 | "|Y| : |\u001b[34;1mB\u001b[0m: |\n", 113 | "+---------+\n", 114 | "\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "env.reset()\n", 120 | "env.render()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "measured-invalid", 126 | "metadata": {}, 127 | "source": [ 128 | "### Agent" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "id": "alternate-greek", 135 | "metadata": { 136 | "scrolled": true 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "class Agent():\n", 141 | " def __init__(self, n_actions, n_states, gamma=0.9):\n", 142 | " self.n_actions = n_actions\n", 143 | " \n", 144 | " self.gamma = gamma\n", 145 | " self.Q = np.zeros((n_states, n_actions))\n", 146 | " \n", 147 | " def decay_schedule(self, init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):\n", 148 | " decay_steps = int(max_steps * decay_ratio)\n", 149 | " rem_steps = max_steps - decay_steps\n", 150 | " values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]\n", 151 | " values = (values - values.min()) / (values.max() - values.min())\n", 152 | " values = (init_value - min_value) * values + min_value\n", 153 | " values = np.pad(values, (0, rem_steps), 'edge')\n", 154 | " return values \n", 155 | " \n", 156 | " def act(self, state, eps=0):\n", 157 | " if random.uniform(0, 1) < eps:\n", 158 | " return random.choice(np.arange(self.n_actions)) \n", 159 | " else:\n", 160 | " return np.argmax(self.Q[state])\n", 161 | " \n", 162 | " def learn(self, state, action, reward, next_state, done, alpha, algo='qlearn'): \n", 163 | " if algo == 'qlearn': \n", 164 | " # Q-Learning\n", 165 | " td_target = reward + self.gamma * np.max(self.Q[next_state, :]) * (not done)\n", 166 | " \n", 167 | " else: \n", 168 | " # SARSA\n", 169 | " td_target = reward + self.gamma 
* self.Q[next_state, self.act(next_state)] * (not done)\n", 170 | " \n", 171 | " td_error = td_target - self.Q[state, action] \n", 172 | " \n", 173 | " self.Q[state, action] = self.Q[state, action] + alpha * td_error" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "english-label", 179 | "metadata": {}, 180 | "source": [ 181 | "### Q - Learning" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "id": "brilliant-scenario", 188 | "metadata": { 189 | "scrolled": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "def learning(n_actions, n_states, episodes=50000, max_steps=500, print_every=5000):\n", 194 | " agent = Agent(n_actions, n_states)\n", 195 | " \n", 196 | " alphas = agent.decay_schedule(0.9, 0.01, 0.2, episodes)\n", 197 | " epsilons = agent.decay_schedule(1.0, 0.01, 0.5, episodes)\n", 198 | " \n", 199 | " for n_episode in range(episodes):\n", 200 | " state = env.reset() \n", 201 | " \n", 202 | " for n_step in range(max_steps):\n", 203 | " action = agent.act(state, epsilons[n_episode])\n", 204 | " next_state, reward, done, info = env.step(action) \n", 205 | " \n", 206 | " agent.learn(state, action, reward, next_state, done, alphas[n_episode])\n", 207 | " \n", 208 | " state = next_state\n", 209 | " \n", 210 | " if done: \n", 211 | " break\n", 212 | " \n", 213 | " if n_episode % print_every == 1:\n", 214 | " print('Episode: {0} done after {1} Steps.'.format(n_episode+1, n_step))\n", 215 | " \n", 216 | " print('Done.')\n", 217 | " env.close()\n", 218 | " \n", 219 | " return agent" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "historic-charger", 225 | "metadata": {}, 226 | "source": [ 227 | "#### Training" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 7, 233 | "id": "reasonable-dimension", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "Episode: 2 done after 199 Steps.\n", 241 | "Episode: 5002 done after 21 Steps.\n", 242 | "Episode: 10002 done after 12 Steps.\n", 243 | "Episode: 15002 done after 17 Steps.\n", 244 | "Episode: 20002 done after 15 Steps.\n", 245 | "Episode: 25002 done after 15 Steps.\n", 246 | "Episode: 30002 done after 11 Steps.\n", 247 | "Episode: 35002 done after 9 Steps.\n", 248 | "Episode: 40002 done after 11 Steps.\n", 249 | "Episode: 45002 done after 13 Steps.\n", 250 | "Done.\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "agent = learning(action_size, state_size)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "id": "thirty-truck", 261 | "metadata": {}, 262 | "source": [ 263 | "### Replay trained Agent" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 8, 269 | "id": "closed-sport", 270 | "metadata": { 271 | "scrolled": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "def replay(agent, max_steps=20): \n", 276 | " n_steps = 0\n", 277 | "\n", 278 | " state, done = env.reset(), False\n", 279 | " rewards = 0\n", 280 | "\n", 281 | " while not done and n_steps < max_steps:\n", 282 | " action = agent.act(state)\n", 283 | " next_state, reward, done, info = env.step(action) \n", 284 | " \n", 285 | " state = next_state\n", 286 | " rewards += reward\n", 287 | " \n", 288 | " display.clear_output(wait=True)\n", 289 | " env.render()\n", 290 | " time.sleep(.5)\n", 291 | "\n", 292 | " n_steps+=1\n", 293 | "\n", 294 | " print('Solved after {0} Steps.'.format(n_steps))" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | 
"execution_count": 9, 300 | "id": "african-output", 301 | "metadata": { 302 | "scrolled": false 303 | }, 304 | "outputs": [ 305 | { 306 | "name": "stdout", 307 | "output_type": "stream", 308 | "text": [ 309 | "+---------+\n", 310 | "|R: | : :G|\n", 311 | "| : | : : |\n", 312 | "| : : : : |\n", 313 | "| | : | : |\n", 314 | "|\u001b[35m\u001b[34;1m\u001b[43mY\u001b[0m\u001b[0m\u001b[0m| : |B: |\n", 315 | "+---------+\n", 316 | " (Dropoff)\n", 317 | "Solved after 12 Steps.\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "for _ in range(5):\n", 323 | " replay(agent)\n", 324 | " time.sleep(1)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "id": "42e3925d", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [] 334 | } 335 | ], 336 | "metadata": { 337 | "kernelspec": { 338 | "display_name": "Python 3", 339 | "language": "python", 340 | "name": "python3" 341 | }, 342 | "language_info": { 343 | "codemirror_mode": { 344 | "name": "ipython", 345 | "version": 3 346 | }, 347 | "file_extension": ".py", 348 | "mimetype": "text/x-python", 349 | "name": "python", 350 | "nbconvert_exporter": "python", 351 | "pygments_lexer": "ipython3", 352 | "version": "3.8.8" 353 | } 354 | }, 355 | "nbformat": 4, 356 | "nbformat_minor": 5 357 | } 358 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement-Learning 2 | 3 | ### Repository for OpenAI and Unity-ML Reinforcement Learning environments. 4 | 5 | #### Q - Learning 6 | [Taxi-v2 (Solution)](OpenAI/Taxi-v2) 7 | 8 | #### Q - Learning or SARSA 9 | [Taxi-v3 (Solution)](OpenAI/Taxi-v3) (Decay ε Greedy) 10 | 11 | #### Pytorch 12 | [LunarLanderContinuous-v2 (Solution DDPG)](OpenAI/LunarLander-v2) 13 | [MountainCarContinuous-v0 (Solution DDPG)](OpenAI/MountainCarContinuous-v0) 14 | [BipedalWalker-v2 (Solution TD3)](OpenAI/BipedalWalker-v2) solved after 1635 episodes 15 | [BipedalWalker-v3 (Solution TD3)](OpenAI/BipedalWalker-v3) solved after 678 episodes 16 | [HumanoidPyBulletEnv-v0 (Solution PPO)](OpenAI/HumanoidPyBulletEnv-v0) 17 | 18 | #### Tensorflow / Keras 19 | [CartPole-v0 (Solution DDQN, Duelling DQN (incl. Prioritized Replay Buffer (PER))](OpenAI/CartPole-v0) 20 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | from collections import namedtuple, deque 5 | 6 | from Model import Actor, Critic 7 | from Noise import OUNoise 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | 13 | BUFFER_SIZE = int(1e6) # replay buffer size 14 | BATCH_SIZE = 1024 # minibatch size 15 | GAMMA = 0.99 # discount factor 16 | TAU = 1e-3 # for soft update of target parameters 17 | LR_ACTOR = 1e-4 # learning rate of the actor 18 | LR_CRITIC = 1e-3 # learning rate of the critic 19 | WEIGHT_DECAY = 0 # L2 weight decay 20 | EPSILON_MAX = 1.0 21 | EPSILON_MIN = 0.1 22 | EPSILON_DECAY = 0.995 23 | LEARN_START = 0 24 | UPDATE_EVERY = 1 25 | UPDATES_PER_STEP = 1 26 | 27 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 28 | 29 | class Agent(): 30 | """Interacts with and learns from the environment.""" 31 | 32 | def __init__(self, state_size, action_size, num_agents, random_seed): 33 | """Initialize an Agent object. 
34 | 35 | Params 36 | ====== 37 | state_size (int): dimension of each state 38 | action_size (int): dimension of each action 39 | num_agents (int): number of agents 40 | random_seed (int): random seed 41 | """ 42 | self.state_size = state_size 43 | self.action_size = action_size 44 | self.num_agents = num_agents 45 | self.seed = random.seed(random_seed) 46 | self.epsilon = EPSILON_MAX 47 | 48 | # Actor Network (w/ Target Network) 49 | self.actor_local = Actor(state_size, action_size, random_seed).to(device) 50 | self.actor_target = Actor(state_size, action_size, random_seed).to(device) 51 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) 52 | 53 | # Critic Network (w/ Target Network) 54 | self.critic_local = Critic(state_size, action_size, random_seed).to(device) 55 | self.critic_target = Critic(state_size, action_size, random_seed).to(device) 56 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) 57 | 58 | # Noise process 59 | self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2) 60 | 61 | # Noise process 62 | self.noise = [OUNoise(action_size, random_seed) for i in range(self.num_agents)] 63 | 64 | # Replay memory 65 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) 66 | 67 | # Make sure target is with the same weight as the source 68 | self.hard_update(self.actor_target, self.actor_local) 69 | self.hard_update(self.critic_target, self.critic_local) 70 | 71 | self.t_step = 0 72 | 73 | def step(self, state, action, reward, next_state, done): 74 | """Save experience in replay memory, and use random sample from buffer to learn.""" 75 | # Save experience / reward 76 | self.memory.add(state, action, reward, next_state, done, self.num_agents) 77 | 78 | if len(self.memory) > LEARN_START: 79 | # Learn every UPDATE_EVERY time steps. 80 | self.t_step = (self.t_step + 1) % UPDATE_EVERY 81 | if self.t_step == 0: 82 | # Learn, if enough samples are available in memory 83 | if len(self.memory) > BATCH_SIZE: 84 | for _ in range(UPDATES_PER_STEP): 85 | experiences = self.memory.sample() 86 | self.learn(experiences, GAMMA) 87 | 88 | def act(self, state, add_noise=True): 89 | """Returns actions for given state as per current policy.""" 90 | #state = torch.from_numpy(state).float().unsqueeze(0).to(device) 91 | state = torch.from_numpy(state).float().to(device) 92 | 93 | self.actor_local.eval() 94 | with torch.no_grad(): 95 | action = self.actor_local(state).cpu().data.numpy() 96 | 97 | self.actor_local.train() 98 | 99 | if add_noise: 100 | for i in range(self.num_agents): 101 | agent_action = action[i] 102 | for j in agent_action: 103 | j += self.epsilon * self.noise[i].sample() 104 | 105 | actions = [] 106 | for i in range(len(action)): 107 | actions.append(np.argmax(action[i])) 108 | 109 | #print(action) 110 | return actions 111 | 112 | def reset(self): 113 | for i in range(self.num_agents): 114 | self.noise[i].reset() 115 | 116 | def learn(self, experiences, gamma): 117 | """Update policy and value parameters using given batch of experience tuples. 118 | Q_targets = r + ? 
* critic_target(next_state, actor_target(next_state)) 119 | where: 120 | actor_target(state) -> action 121 | critic_target(state, action) -> Q-value 122 | 123 | Params 124 | ====== 125 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 126 | gamma (float): discount factor 127 | """ 128 | states, actions, rewards, next_states, dones = experiences 129 | 130 | # ---------------------------- update critic ---------------------------- # 131 | # Get predicted next-state actions and Q values from target models 132 | actions_next = self.actor_target(next_states) 133 | Q_targets_next = self.critic_target(next_states, actions_next) 134 | 135 | # Compute Q targets for current states (y_i) 136 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) 137 | 138 | # Compute critic loss 139 | Q_expected = self.critic_local(states, actions) 140 | critic_loss = F.mse_loss(Q_expected, Q_targets) 141 | 142 | # Minimize the loss 143 | self.critic_optimizer.zero_grad() 144 | critic_loss.backward() 145 | torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) 146 | self.critic_optimizer.step() 147 | 148 | # ---------------------------- update actor ---------------------------- # 149 | # Compute actor loss 150 | actions_pred = self.actor_local(states) 151 | actor_loss = -self.critic_local(states, actions_pred).mean() 152 | 153 | # Minimize the loss 154 | self.actor_optimizer.zero_grad() 155 | actor_loss.backward() 156 | self.actor_optimizer.step() 157 | 158 | # ----------------------- update target networks ----------------------- # 159 | self.soft_update(self.critic_local, self.critic_target, TAU) 160 | self.soft_update(self.actor_local, self.actor_target, TAU) 161 | 162 | # ---------------------------- update noise ---------------------------- # 163 | if self.epsilon - EPSILON_DECAY > EPSILON_MIN: 164 | self.epsilon -= EPSILON_DECAY 165 | else: 166 | self.epsilon = EPSILON_MIN 167 | 168 | def soft_update(self, local_model, target_model, tau): 169 | """Soft update model parameters. 170 | ?_target = t*?_local + (1 - t)*?_target 171 | 172 | Params 173 | ====== 174 | local_model: PyTorch model (weights will be copied from) 175 | target_model: PyTorch model (weights will be copied to) 176 | tau (float): interpolation parameter 177 | """ 178 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): 179 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) 180 | 181 | def hard_update(self, target, source): 182 | for target_param, param in zip(target.parameters(), source.parameters()): 183 | target_param.data.copy_(param.data) 184 | 185 | class ReplayBuffer: 186 | """Fixed-size buffer to store experience tuples.""" 187 | 188 | def __init__(self, action_size, buffer_size, batch_size, seed): 189 | """Initialize a ReplayBuffer object. 
190 | Params 191 | ====== 192 | buffer_size (int): maximum size of buffer 193 | batch_size (int): size of each training batch 194 | """ 195 | self.action_size = action_size 196 | self.buffer_size = buffer_size 197 | self.batch_size = batch_size 198 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"]) 199 | self.seed = random.seed(seed) 200 | 201 | self.reset() 202 | 203 | def add(self, state, action, reward, next_state, done, num_agents): 204 | """Add a new experience to memory.""" 205 | for i in range(num_agents): 206 | e = self.experience(state[i], action[i], reward[i], next_state[i], done[i]) 207 | self.memory.append(e) 208 | 209 | def reset(self): 210 | self.memory = deque(maxlen=self.buffer_size) 211 | 212 | def sample(self): 213 | """Randomly sample a batch of experiences from memory.""" 214 | experiences = random.sample(self.memory, k=self.batch_size) 215 | 216 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device) 217 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device) 218 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device) 219 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device) 220 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device) 221 | 222 | return states, actions, rewards, next_states, dones 223 | 224 | def __len__(self): 225 | """Return the current size of internal memory.""" 226 | return len(self.memory) -------------------------------------------------------------------------------- /Unity-ML/Soccer/Model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | def hidden_init(layer): 8 | fan_in = layer.weight.data.size()[0] 9 | lim = 1. / np.sqrt(fan_in) 10 | return (-lim, lim) 11 | 12 | 13 | class Actor(nn.Module): 14 | """Actor (Policy) Model.""" 15 | 16 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64): 17 | """Initialize parameters and build model. 
18 | Params 19 | ====== 20 | state_size (int): Dimension of each state 21 | action_size (int): Dimension of each action 22 | seed (int): Random seed 23 | fc1_units (int): Number of nodes in first hidden layer 24 | fc2_units (int): Number of nodes in second hidden layer 25 | """ 26 | super(Actor, self).__init__() 27 | self.seed = torch.manual_seed(seed) 28 | 29 | self.fc1 = nn.Linear(state_size, fc1_units) 30 | self.bn1 = nn.BatchNorm1d(fc1_units) 31 | self.fc2 = nn.Linear(fc1_units, fc2_units) 32 | self.bn2 = nn.BatchNorm1d(fc2_units) 33 | self.fc3 = nn.Linear(fc2_units, action_size) 34 | self.bn3 = nn.BatchNorm1d(action_size) 35 | self.softmax = nn.Softmax(dim=1) 36 | self.reset_parameters() 37 | 38 | def reset_parameters(self): 39 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 40 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 41 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 42 | 43 | def forward(self, state): 44 | """Build an actor (policy) network that maps states -> actions.""" 45 | x = self.fc1(state) 46 | x = F.relu(x) 47 | x = self.bn1(x) 48 | x = self.fc2(x) 49 | x = F.relu(x) 50 | x = self.bn2(x) 51 | x = self.fc3(x) 52 | x = self.bn3(x) 53 | #return torch.tanh(x) 54 | #return self.softmax(x) 55 | 56 | # transform to logits 57 | return F.log_softmax(x) 58 | 59 | 60 | class Critic(nn.Module): 61 | """Critic (Value) Model.""" 62 | 63 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64): 64 | """Initialize parameters and build model. 65 | Params 66 | ====== 67 | state_size (int): Dimension of each state 68 | action_size (int): Dimension of each action 69 | seed (int): Random seed 70 | fcs1_units (int): Number of nodes in the first hidden layer 71 | fc2_units (int): Number of nodes in the second hidden layer 72 | fc3_units (int): Number of nodes in the third hidden layer 73 | """ 74 | super(Critic, self).__init__() 75 | self.seed = torch.manual_seed(seed) 76 | 77 | self.bn0 = nn.BatchNorm1d(state_size) 78 | self.fc1 = nn.Linear(state_size, fc1_units) 79 | self.bn1 = nn.BatchNorm1d(fc1_units) 80 | self.fc2 = nn.Linear(fc1_units+action_size, fc2_units) 81 | self.bn2 = nn.BatchNorm1d(fc2_units) 82 | self.fc3 = nn.Linear(fc2_units, 1) 83 | self.reset_parameters() 84 | 85 | def reset_parameters(self): 86 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1)) 87 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2)) 88 | self.fc3.weight.data.uniform_(-3e-3, 3e-3) 89 | 90 | def forward(self, state, action): 91 | """Build a critic (value) network that maps (state, action) pairs -> Q-values.""" 92 | state = self.bn0(state) 93 | xs = self.fc1(state) 94 | xs = self.bn1(xs) 95 | xs = F.leaky_relu(xs) 96 | x = torch.cat((xs, action), dim=1) 97 | x = self.fc2(x) 98 | x = self.bn2(x) 99 | x = F.leaky_relu(x) 100 | return self.fc3(x) 101 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import copy 4 | 5 | class OUNoise: 6 | """Ornstein-Uhlenbeck process.""" 7 | 8 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.1): 9 | """Initialize parameters and noise process.""" 10 | self.mu = mu * np.ones(size) 11 | self.theta = theta 12 | self.sigma = sigma 13 | self.seed = random.seed(seed) 14 | self.reset() 15 | 16 | def reset(self): 17 | """Reset the internal state (= noise) to mean (mu).""" 18 | self.state = copy.copy(self.mu) 19 | 20 | def sample(self): 21 | 
"""Update internal state and return it as a noise sample.""" 22 | x = self.state 23 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))]) 24 | self.state = x + dx 25 | return self.state 26 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/.DS_Store -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/Browsers/Compat.browser: -------------------------------------------------------------------------------- 1 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/settings.map: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 10 | 11 | 20 | 23 | 24 | 25 | 26 | 29 | 30 | 33 | 34 | 43 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/web.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 64 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 108 | 110 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 150 | 151 | 152 | 153 | 154 | 155 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/Browsers/Compat.browser: -------------------------------------------------------------------------------- 1 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/settings.map: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 10 | 11 | 20 | 23 | 24 | 25 | 26 | 29 | 30 | 33 | 34 | 43 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/web.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 
32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 94 | 95 | 98 | 99 | 100 | 103 | 104 | 105 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 123 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 197 | 199 | 201 | 202 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 228 | 229 | 230 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/Browsers/Compat.browser: -------------------------------------------------------------------------------- 1 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/settings.map: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 7 | 10 | 11 | 20 | 23 | 24 | 25 | 26 | 29 | 30 | 33 | 34 | 43 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/web.config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 79 | 80 | 81 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 94 | 95 | 98 | 99 | 100 | 103 | 104 | 105 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 123 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 197 | 199 | 201 | 202 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 228 | 229 | 230 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 249 | 250 | 251 | 252 | 253 | 254 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/browscap.ini: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/browscap.ini -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/config: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity default resources: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity default resources -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity_builtin_extra: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity_builtin_extra -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/app.info: -------------------------------------------------------------------------------- 1 | Unity Technologies 2 | Unity Environment -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/boot.config: -------------------------------------------------------------------------------- 1 | wait-for-native-debugger=0 2 | scripting-runtime-version=latest 3 | -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers.assets: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers.assets -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/level0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/level0 -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/resources.assets: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/resources.assets -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets -------------------------------------------------------------------------------- /Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets.resS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets.resS -------------------------------------------------------------------------------- /Unity-ML/Soccer/checkpoint_goalie_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/checkpoint_goalie_actor.pth -------------------------------------------------------------------------------- /Unity-ML/Soccer/checkpoint_goalie_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/checkpoint_goalie_critic.pth -------------------------------------------------------------------------------- /Unity-ML/Soccer/checkpoint_striker_actor.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/checkpoint_striker_actor.pth -------------------------------------------------------------------------------- /Unity-ML/Soccer/checkpoint_striker_critic.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/c3b1edbe4ef9470015041c9794e2198c25eaa4d7/Unity-ML/Soccer/checkpoint_striker_critic.pth --------------------------------------------------------------------------------
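The Soccer checkpoints above were produced by the DDPG-style agent implemented in `Unity-ML/Soccer/Agent.py` (actor and critic networks with target copies, an Ornstein-Uhlenbeck exploration noise process, and soft target updates). As a compact reference for that update rule, the sketch below restates the critic target Q_targets = r + gamma * critic_target(next_state, actor_target(next_state)) and the soft update theta_target = tau*theta_local + (1 - tau)*theta_target. It is an illustrative sketch with simplified names, not the code used to train the checkpoints.

```python
import torch
import torch.nn.functional as F

GAMMA = 0.99  # discount factor (same value as in Agent.py)
TAU = 1e-3    # soft-update interpolation factor (same value as in Agent.py)


def ddpg_update(actor_local, actor_target, critic_local, critic_target,
                actor_optimizer, critic_optimizer, batch):
    """One DDPG learning step on a sampled batch of transitions (illustrative sketch)."""
    states, actions, rewards, next_states, dones = batch

    # Critic: regress Q(s, a) onto the bootstrapped target r + gamma * Q'(s', mu'(s')).
    with torch.no_grad():
        next_actions = actor_target(next_states)
        q_next = critic_target(next_states, next_actions)
        q_targets = rewards + GAMMA * q_next * (1 - dones)
    critic_loss = F.mse_loss(critic_local(states, actions), q_targets)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor: maximize the critic's value of mu(s) by minimizing its negative.
    actor_loss = -critic_local(states, actor_local(states)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Soft-update the targets: theta_target <- tau*theta_local + (1 - tau)*theta_target.
    for target, local in ((actor_target, actor_local), (critic_target, critic_local)):
        for t_param, l_param in zip(target.parameters(), local.parameters()):
            t_param.data.copy_(TAU * l_param.data + (1.0 - TAU) * t_param.data)
```

In `Agent.py` the same steps additionally clip the critic's gradient norm to 1 and decay the exploration epsilon after each learning step.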