├── OpenAI
│   ├── CartPole-v0
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   └── DDQN.py
│   │   ├── .gitignore
│   │   ├── model.h5
│   │   ├── assets
│   │   │   ├── cartpole-v0.jpg
│   │   │   └── game_reward.png
│   │   ├── requirements.txt
│   │   ├── README.md
│   │   ├── memory.py
│   │   └── CartPole-v0.ipynb
│   ├── LunarLander-v2
│   │   ├── checkpoint_actor.pth
│   │   ├── checkpoint_critic.pth
│   │   ├── README.md
│   │   ├── model.py
│   │   └── ddpg_agent.py
│   ├── Taxi-v3
│   │   ├── Reinforcement Learning.ppsx
│   │   └── Taxi-v3.ipynb
│   ├── MountainCarContinuous-v0
│   │   ├── checkpoint_actor.pth
│   │   ├── checkpoint_critic.pth
│   │   ├── Noise.py
│   │   ├── README.md
│   │   ├── Model.py
│   │   └── Agent.py
│   ├── BipedalWalker-v2
│   │   ├── preTrained
│   │   │   ├── TD3_BipedalWalker-v2_0_solved_actor.pth
│   │   │   ├── TD3_BipedalWalker-v2_0_solved_crtic_1.pth
│   │   │   ├── TD3_BipedalWalker-v2_0_solved_crtic_2.pth
│   │   │   ├── TD3_BipedalWalker-v2_0_solved_actor_target.pth
│   │   │   ├── TD3_BipedalWalker-v2_0_solved_critic_1_target.pth
│   │   │   └── TD3_BipedalWalker-v2_0_solved_critic_2_target.pth
│   │   ├── README.md
│   │   ├── ReplayBuffer.py
│   │   └── Model.py
│   ├── BipedalWalker-v3
│   │   ├── preTrained
│   │   │   ├── TD3_BipedalWalker-v3_0_solved_actor.pth
│   │   │   ├── TD3_BipedalWalker-v3_0_solved_crtic_1.pth
│   │   │   ├── TD3_BipedalWalker-v3_0_solved_crtic_2.pth
│   │   │   ├── TD3_BipedalWalker-v3_0_solved_actor_target.pth
│   │   │   ├── TD3_BipedalWalker-v3_0_solved_critic_1_target.pth
│   │   │   └── TD3_BipedalWalker-v3_0_solved_critic_2_target.pth
│   │   ├── README.md
│   │   ├── ReplayBuffer.py
│   │   └── Agent.py
│   ├── HumanoidPyBulletEnv-v0
│   │   ├── pretrained
│   │   │   ├── actor_HumanoidPyBulletEnv-v0_final.pt
│   │   │   ├── critic_HumanoidPyBulletEnv-v0_final.pt
│   │   │   ├── actor_HumanoidPyBulletEnv-v0_checkpoint.pt
│   │   │   └── critic_HumanoidPyBulletEnv-v0_checkpoint.pt
│   │   ├── README.md
│   │   ├── multiprocessing_env.py
│   │   └── HumanoidPyBulletEnv-v0.ipynb
│   └── Taxi-v2
│       ├── main.py
│       ├── agent.py
│       ├── monitor.py
│       └── README.md
├── Unity-ML
│   └── Soccer
│       ├── Soccer_Windows_x86_64
│       │   ├── Soccer_Data
│       │   │   ├── app.info
│       │   │   ├── boot.config
│       │   │   ├── level0
│       │   │   ├── globalgamemanagers
│       │   │   ├── resources.assets
│       │   │   ├── sharedassets0.assets
│       │   │   ├── globalgamemanagers.assets
│       │   │   ├── sharedassets0.assets.resS
│       │   │   ├── Resources
│       │   │   │   ├── unity_builtin_extra
│       │   │   │   └── unity default resources
│       │   │   └── MonoBleedingEdge
│       │   │       └── etc
│       │   │           └── mono
│       │   │               ├── browscap.ini
│       │   │               ├── 2.0
│       │   │               │   ├── Browsers
│       │   │               │   │   └── Compat.browser
│       │   │               │   ├── settings.map
│       │   │               │   └── web.config
│       │   │               ├── 4.0
│       │   │               │   ├── Browsers
│       │   │               │   │   └── Compat.browser
│       │   │               │   ├── settings.map
│       │   │               │   └── web.config
│       │   │               ├── 4.5
│       │   │               │   ├── Browsers
│       │   │               │   │   └── Compat.browser
│       │   │               │   ├── settings.map
│       │   │               │   └── web.config
│       │   │               └── config
│       │   └── .DS_Store
│       ├── checkpoint_goalie_actor.pth
│       ├── checkpoint_goalie_critic.pth
│       ├── checkpoint_striker_actor.pth
│       ├── checkpoint_striker_critic.pth
│       ├── Noise.py
│       ├── Model.py
│       └── Agent.py
├── .gitattributes
├── README.md
└── .gitignore
/OpenAI/CartPole-v0/agents/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | /logs
3 |
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/app.info:
--------------------------------------------------------------------------------
1 | Unity Technologies
2 | Unity Environment
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | Unity-ML/Soccer/Soccer_Windows_x86_64/* linguist-vendored
2 | *.html linguist-vendored
3 |
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/CartPole-v0/model.h5
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/boot.config:
--------------------------------------------------------------------------------
1 | wait-for-native-debugger=0
2 | scripting-runtime-version=latest
3 |
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/assets/cartpole-v0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/CartPole-v0/assets/cartpole-v0.jpg
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/assets/game_reward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/CartPole-v0/assets/game_reward.png
--------------------------------------------------------------------------------
/OpenAI/LunarLander-v2/checkpoint_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/LunarLander-v2/checkpoint_actor.pth
--------------------------------------------------------------------------------
/OpenAI/LunarLander-v2/checkpoint_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/LunarLander-v2/checkpoint_critic.pth
--------------------------------------------------------------------------------
/OpenAI/Taxi-v3/Reinforcement Learning.ppsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/Taxi-v3/Reinforcement Learning.ppsx
--------------------------------------------------------------------------------
/Unity-ML/Soccer/checkpoint_goalie_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/checkpoint_goalie_actor.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/checkpoint_goalie_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/checkpoint_goalie_critic.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/checkpoint_striker_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/checkpoint_striker_actor.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/checkpoint_striker_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/checkpoint_striker_critic.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/.DS_Store
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/checkpoint_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/MountainCarContinuous-v0/checkpoint_actor.pth
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/checkpoint_critic.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/MountainCarContinuous-v0/checkpoint_critic.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/level0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/level0
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/resources.assets:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/resources.assets
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor.pth
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers.assets:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/globalgamemanagers.assets
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets.resS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/sharedassets0.assets.resS
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_1.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_crtic_2.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_1.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_1.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_2.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_crtic_2.pth
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_final.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_final.pt
--------------------------------------------------------------------------------
/OpenAI/Taxi-v2/main.py:
--------------------------------------------------------------------------------
1 | from agent import Agent
2 | from monitor import interact
3 | import gym
4 | import numpy as np
5 |
6 | env = gym.make('Taxi-v2')
7 | agent = Agent()
8 | avg_rewards, best_avg_reward = interact(env, agent)
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_final.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_final.pt
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity_builtin_extra:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity_builtin_extra
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_actor_target.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_actor_target.pth
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_checkpoint.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/HumanoidPyBulletEnv-v0/pretrained/actor_HumanoidPyBulletEnv-v0_checkpoint.pt
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity default resources:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/Resources/unity default resources
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_1_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_1_target.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_2_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v2/preTrained/TD3_BipedalWalker-v2_0_solved_critic_2_target.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_1_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_1_target.pth
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_2_target.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/BipedalWalker-v3/preTrained/TD3_BipedalWalker-v3_0_solved_critic_2_target.pth
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_checkpoint.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/OpenAI/HumanoidPyBulletEnv-v0/pretrained/critic_HumanoidPyBulletEnv-v0_checkpoint.pt
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/browscap.ini:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tobiassteidle/Reinforcement-Learning/HEAD/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/browscap.ini
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/requirements.txt:
--------------------------------------------------------------------------------
1 | prompt_toolkit==2.0.10
2 | matplotlib
3 | numpy==1.16.4
4 | pandas
5 | opencv-python
6 | pillow
7 | imutils
8 | scikit-image
9 | tqdm
10 | tensorflow-gpu>=2.4.0
11 | Keras==2.3.1
12 | h5py
13 | ipykernel
14 | jupyter
15 | gym
16 | gym[atari]
17 |
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/README.md:
--------------------------------------------------------------------------------
1 | # Humanoid Walker Problem
2 |
3 | ### Getting Started
4 | The Humanoid environment is described [here](https://github.com/benelot/pybullet-gym/blob/master/README.md).
5 |
6 | ### Solution Video
7 | [](https://www.youtube.com/watch?v=dxZP1icxsMw "HumanoidPyBulletEnv-v0")
8 |
9 | The first part of the video shows the behaviour of the untrained agent; for comparison, the second part shows the behaviour of the trained agent.
10 |
11 | ### Solution Info
12 | My learning algorithm is [Proximal Policy Optimization (PPO)](https://arxiv.org/pdf/1707.06347.pdf).
13 |
14 | ### Instructions
15 |
16 | Start the Jupyter notebook `HumanoidPyBulletEnv-v0.ipynb` and follow the instructions.
17 |
--------------------------------------------------------------------------------
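For reference, a minimal PyTorch sketch of the clipped surrogate objective that PPO optimises; the function and argument names here are illustrative and not taken from the notebook above.

```python
import torch

def ppo_clipped_loss(new_log_probs, old_log_probs, advantages, clip_eps=0.2):
    """Clipped surrogate objective from the PPO paper (illustrative sketch)."""
    # probability ratio r_t(theta) = pi_theta(a|s) / pi_theta_old(a|s)
    ratio = torch.exp(new_log_probs - old_log_probs)
    # unclipped and clipped surrogate terms
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # PPO maximises the minimum of the two terms; return a loss to minimise
    return -torch.min(surr1, surr2).mean()
```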
/OpenAI/BipedalWalker-v3/README.md:
--------------------------------------------------------------------------------
1 | # BipedalWalker Problem
2 |
3 | ### Getting Started
4 | The BipedalWalker environment is described [here](https://github.com/openai/gym/wiki/BipedalWalker-v2).
5 |
6 | ### Solution Video
7 | [](https://www.youtube.com/watch?v=14yGAsIG-Rs "BipedalWalker-v3")
8 |
9 | The first part of the video shows the behaviour of the untrained agent; for comparison, the second part shows the behaviour of the trained agent.
10 |
11 | ### Solution Info
12 | My learning algorithm is the [Twin Delayed Deep Deterministic Policy Gradient (TD3) algorithm](https://arxiv.org/pdf/1802.09477.pdf).
13 |
14 | ### Instructions
15 |
16 | Start the Jupyter notebook `BipedalWalker-v3.ipynb` and follow the instructions.
17 |
--------------------------------------------------------------------------------
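As a rough illustration of what a TD3 agent of this kind computes, here is a generic PyTorch sketch of the Bellman target with twin critics and target-policy smoothing. This is not the repository's `Agent.py` code; the target networks and hyperparameter values are assumptions.

```python
import torch

def td3_target(critic_1_target, critic_2_target, actor_target,
               next_state, reward, done, gamma=0.99,
               policy_noise=0.2, noise_clip=0.5, max_action=1.0):
    """Compute the TD3 Bellman target with target-policy smoothing (sketch)."""
    with torch.no_grad():
        # target action plus clipped noise (target-policy smoothing)
        next_action = actor_target(next_state)
        noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-max_action, max_action)
        # clipped double-Q: take the minimum of the two target critics
        q1 = critic_1_target(next_state, next_action)
        q2 = critic_2_target(next_state, next_action)
        target_q = reward + gamma * (1 - done) * torch.min(q1, q2)
    return target_q
```

In TD3 the actor (and the target networks) are then updated only every few critic updates, which is where the "delayed" in the name comes from.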
/OpenAI/BipedalWalker-v2/README.md:
--------------------------------------------------------------------------------
1 | # BipedalWalker Problem
2 |
3 | ### Getting Started
4 | The BipedalWalker environment is described [here](https://github.com/openai/gym/wiki/BipedalWalker-v2).
5 |
6 | ### Solution Video
7 | [](https://www.youtube.com/watch?v=QW6fWP5FDoU "BipedalWalker-v2")
8 |
9 | The first part of the video shows the behaviour of the untrained agent; for comparison, the second part shows the behaviour of the trained agent.
10 |
11 | ### Solution Info
12 | My learning algorithm is the [Twin Delayed Deep Deterministic Policy Gradient (TD3) algorithm](https://arxiv.org/pdf/1802.09477.pdf).
13 |
14 | ### Instructions
15 |
16 | Start the Jupyter notebook `BipedalWalker-v2.ipynb` and follow the instructions.
17 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Reinforcement-Learning
2 |
3 | ### Repository for OpenAI and Unity-ML Reinforcement Learning environments.
4 |
5 | #### Q - Learning
6 | [Taxi-v2 (Solution)](OpenAI/Taxi-v2)
7 |
8 | #### Q - Learning or SARSA
9 | [Taxi-v3 (Solution)](OpenAI/Taxi-v3) (decaying ε-greedy)
10 |
11 | #### Pytorch
12 | [LunarLanderContinuous-v2 (Solution DDPG)](OpenAI/LunarLander-v2)
13 | [MountainCarContinuous-v0 (Solution DDPG)](OpenAI/MountainCarContinuous-v0)
14 | [BipedalWalker-v2 (Solution TD3)](OpenAI/BipedalWalker-v2) solved after 1635 episodes
15 | [BipedalWalker-v3 (Solution TD3)](OpenAI/BipedalWalker-v3) solved after 678 episodes
16 | [HumanoidPyBulletEnv-v0 (Solution PPO)](OpenAI/HumanoidPyBulletEnv-v0)
17 |
18 | #### Tensorflow / Keras
19 | [CartPole-v0 (Solution DDQN, Duelling DQN incl. Prioritized Replay Buffer (PER))](OpenAI/CartPole-v0)
20 |
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Noise.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import copy
4 |
5 | class OUNoise:
6 | """Ornstein-Uhlenbeck process."""
7 |
8 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.1):
9 | """Initialize parameters and noise process."""
10 | self.mu = mu * np.ones(size)
11 | self.theta = theta
12 | self.sigma = sigma
13 | self.seed = random.seed(seed)
14 | self.reset()
15 |
16 | def reset(self):
17 | """Reset the internal state (= noise) to mean (mu)."""
18 | self.state = copy.copy(self.mu)
19 |
20 | def sample(self):
21 | """Update internal state and return it as a noise sample."""
22 | x = self.state
23 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
24 | self.state = x + dx
25 | return self.state
26 |
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/Noise.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import copy
4 |
5 | class OUNoise:
6 | """Ornstein-Uhlenbeck process."""
7 |
8 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
9 | """Initialize parameters and noise process."""
10 | self.mu = mu * np.ones(size)
11 | self.theta = theta
12 | self.sigma = sigma
13 | self.seed = random.seed(seed)
14 | self.reset()
15 |
16 | def reset(self):
17 | """Reset the internal state (= noise) to mean (mu)."""
18 | self.state = copy.copy(self.mu)
19 |
20 | def sample(self):
21 | """Update internal state and return it as a noise sample."""
22 | x = self.state
23 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
24 | self.state = x + dx
25 | return self.state
26 |
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/ReplayBuffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer:
4 | def __init__(self):
5 | self.buffer = []
6 |
7 | def add(self, transition):
8 | self.buffer.append(transition)
9 |
10 | def sample(self, batch_size):
11 | indexes = np.random.randint(0, len(self.buffer), size=batch_size)
12 | state, action, reward, next_state, done = [], [], [], [], []
13 |
14 | for i in indexes:
15 | s, a, r, s_, d = self.buffer[i]
16 | state.append(np.array(s, copy=False))
17 | action.append(np.array(a, copy=False))
18 | reward.append(np.array(r, copy=False))
19 | next_state.append(np.array(s_, copy=False))
20 | done.append(np.array(d, copy=False))
21 |
22 | return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)
23 |
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/README.md:
--------------------------------------------------------------------------------
1 | # CartPole-v0
2 | Reinforcement Learning project to train a neural network to play the
3 | [OpenAI](https://openai.com/) environment [CartPole-v0](https://github.com/openai/gym/wiki/CartPole-v0).
4 | 
5 |
6 | ### Objectives
7 |
8 |
9 | ## Additional Information
10 | Tensorflow Version: GPU 2.0.0
11 |
12 | ## Installation
13 | 1. Create and activate a new environment.
14 | ```
15 | conda create -n openai python=3.6
16 | source activate openai
17 | ```
18 | 2. Install Dependencies.
19 | ```
20 | pip install -r requirements.txt
21 | pip install gym[atari]
22 | ```
23 |
24 | ### Launch Jupyter notebook
25 | ```
26 | jupyter notebook CartPole-v0.ipynb
27 | ```
28 |
29 | ### Additional commands
30 | Start the TensorBoard visualisation.
31 | ```
32 | tensorboard --logdir=logs/
33 | ```
34 |
35 | #### Rewards
36 | 
37 |
38 |
--------------------------------------------------------------------------------
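The CartPole agent itself lives in `agents/DDQN.py`, which is not reproduced in this dump. As a hypothetical sketch of how a Double DQN target is typically formed with two Keras models (an online network that selects the next action and a target network that evaluates it); the function and argument names are illustrative only:

```python
import numpy as np

def ddqn_targets(online_model, target_model, states, actions, rewards,
                 next_states, dones, gamma=0.99):
    """Double DQN targets: the online net picks the next action, the target net scores it."""
    q_values = online_model.predict(states, verbose=0)        # (batch, n_actions)
    next_online = online_model.predict(next_states, verbose=0)
    next_target = target_model.predict(next_states, verbose=0)

    best_next = np.argmax(next_online, axis=1)                 # action selection (online net)
    target_q = next_target[np.arange(len(states)), best_next]  # action evaluation (target net)

    targets = q_values.copy()
    targets[np.arange(len(states)), actions] = rewards + gamma * (1.0 - dones) * target_q
    return targets  # fit the online model on (states, targets)
```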
/OpenAI/BipedalWalker-v3/ReplayBuffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer:
4 | def __init__(self, max_size=1000000):
5 | self.buffer = []
6 | self.max_size = int(max_size)
7 | self.size = 0
8 |
9 | def add(self, transition):
10 | self.size += 1
11 | # transition is a tuple of (state, action, reward, next_state, done)
12 | self.buffer.append(transition)
13 |
14 | def sample(self, batch_size):
15 | # delete 1/5th of the buffer when full
16 | if self.size > self.max_size:
17 | del self.buffer[0:int(self.size/5)]
18 | self.size = len(self.buffer)
19 |
20 | indexes = np.random.randint(0, len(self.buffer), size=batch_size)
21 | state, action, reward, next_state, done = [], [], [], [], []
22 |
23 | for i in indexes:
24 | s, a, r, s_, d = self.buffer[i]
25 | state.append(np.array(s, copy=False))
26 | action.append(np.array(a, copy=False))
27 | reward.append(np.array(r, copy=False))
28 | next_state.append(np.array(s_, copy=False))
29 | done.append(np.array(d, copy=False))
30 |
31 | return np.array(state), np.array(action), np.array(reward).reshape(-1, 1), np.array(next_state), np.array(done).reshape(-1, 1)
32 |
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/README.md:
--------------------------------------------------------------------------------
1 | # MountainCarContinuous Problem
2 |
3 | ### Getting Started
4 | The MountainCarContinuous environment is described [here](https://github.com/openai/gym/wiki/MountainCarContinuous-v0).
5 |
6 | ### Solution Video
7 | [](https://www.youtube.com/watch?v=RGKRfxfEFEA "MountainCarContinuous-v0")
8 |
9 | The video shows the solution of the environment after 32 episodes.
10 |
11 | ### Solution Info
12 | My learning algorithm is Deep Deterministic Policy Gradient (DDPG).
13 |
14 | DDPG is an actor-critic algorithm and primarily uses two neural networks:
15 | one for the actor and one for the critic. These networks calculate an action vector for the current state and generate a temporal-difference error signal at each time step.
16 |
17 | DDPG uses a stochastic behavioral policy for good exploration and a deterministic target policy for estimation.
18 |
19 | The current state is the input of the actor network, and its output represents the action. The deterministic policy gradient theorem provides the update rule for the weights of the actor network.
20 |
21 | The critic's output is simply the estimated Q-value of the current state and the action given by the actor. The critic network is updated from the gradients obtained from the TD error signal.
22 |
23 | More general information about DDPG in [this](https://arxiv.org/pdf/1509.02971.pdf) paper.
24 |
25 | ### Instructions
26 |
27 | Start the Jupyter notebook `MountainCarContinuous-v0 (DDPG).ipynb` and follow the instructions.
--------------------------------------------------------------------------------
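A compact PyTorch sketch of the critic and actor updates described in the README above. The networks, their target copies and the optimisers are assumed to exist already; this is an illustrative sketch, not the repository's exact `Agent.py` code.

```python
import torch
import torch.nn.functional as F

def ddpg_update(actor, critic, actor_target, critic_target,
                actor_opt, critic_opt, batch, gamma=0.99):
    """One DDPG learning step on a sampled batch (illustrative sketch)."""
    states, actions, rewards, next_states, dones = batch

    # critic: minimise the TD error against the bootstrapped target
    with torch.no_grad():
        next_actions = actor_target(next_states)
        q_targets = rewards + gamma * (1 - dones) * critic_target(next_states, next_actions)
    q_expected = critic(states, actions)
    critic_loss = F.mse_loss(q_expected, q_targets)
    critic_opt.zero_grad()
    critic_loss.backward()
    critic_opt.step()

    # actor: deterministic policy gradient, maximise Q(s, mu(s))
    actor_loss = -critic(states, actor(states)).mean()
    actor_opt.zero_grad()
    actor_loss.backward()
    actor_opt.step()
```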
/OpenAI/LunarLander-v2/README.md:
--------------------------------------------------------------------------------
1 | # LunarLander Problem
2 |
3 | ### Getting Started
4 | The LunarLanderContinuous environment is described [here](https://gym.openai.com/envs/LunarLanderContinuous-v2/).
5 |
6 | ### Solution Video
7 | [](https://www.youtube.com/watch?v=615X49z3u6o "LunarLanderContinuous-v2")
8 |
9 | The first part of the video shows the behaviour of the untrained agent; for comparison, the second part shows the behaviour of the trained agent.
10 |
11 | ### Solution Info
12 | My learning algorithm is Deep Deterministic Policy Gradient (DDPG).
13 |
14 | DDPG is an actor-critic algorithm and primarily uses two neural networks:
15 | one for the actor and one for the critic. These networks calculate an action vector for the current state and generate a temporal-difference error signal at each time step.
16 |
17 | DDPG uses a stochastic behavioral policy for good exploration and a deterministic target policy for estimation.
18 |
19 | The current state is the input of the actor network, and its output represents the action. The deterministic policy gradient theorem provides the update rule for the weights of the actor network.
20 |
21 | The critic's output is simply the estimated Q-value of the current state and the action given by the actor. The critic network is updated from the gradients obtained from the TD error signal.
22 |
23 | More general information about DDPG in [this](https://arxiv.org/pdf/1509.02971.pdf) paper.
24 |
25 | ### Instructions
26 |
27 | Start the Jupyter notebook `LunarLanderContinuous-v2 (DDPG).ipynb` and follow the instructions.
--------------------------------------------------------------------------------
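To complement the DDPG description above, here is a hedged sketch of two further ingredients commonly used with this kind of agent: exploration by adding noise from an OUNoise-style process (like the ones defined elsewhere in this repository) to the deterministic action, and soft (Polyak) updates of the target networks. The names `noise` and `tau` are assumptions, not necessarily what `ddpg_agent.py` uses.

```python
import numpy as np
import torch

def act_with_noise(actor, state, noise, device="cpu"):
    """Deterministic action from the actor plus exploration noise (sketch)."""
    state_t = torch.from_numpy(state).float().unsqueeze(0).to(device)
    actor.eval()
    with torch.no_grad():
        action = actor(state_t).cpu().numpy().squeeze(0)
    actor.train()
    return np.clip(action + noise.sample(), -1.0, 1.0)

def soft_update(local_model, target_model, tau=1e-3):
    """Polyak averaging: target <- tau * local + (1 - tau) * target."""
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
```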
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # IntelliJ
107 | .idea/
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/Browsers/Compat.browser:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/Browsers/Compat.browser:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/Browsers/Compat.browser:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/OpenAI/Taxi-v2/agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from collections import defaultdict
3 |
4 | class Agent:
5 |
6 | def __init__(self, nA=6):
7 | """ Initialize agent.
8 |
9 | Params
10 | ======
11 | - nA: number of actions available to the agent
12 | """
13 | self.i_episode = 1
14 | self.alpha = .01
15 | self.gamma = 1.0
16 | self.nA = nA
17 | self.Q = defaultdict(lambda: np.zeros(self.nA))
18 |
19 | def epsilon_greedy_probs(self, Q_s, eps=None):
20 | """ obtains the action probabilities corresponding to epsilon-greedy policy """
21 | epsilon = 1.0 / self.i_episode
22 | if eps is not None:
23 | epsilon = eps
24 | policy_s = np.ones(self.nA) * epsilon / self.nA
25 | policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA)
26 | return policy_s
27 |
28 | def update_Q(self, Qsa, Qsa_next, reward, alpha, gamma):
29 | """ updates the action-value function estimate using the most recent time step """
30 | return Qsa + (alpha * (reward + (gamma * Qsa_next) - Qsa))
31 |
32 | def select_action(self, state):
33 | """ Given the state, select an action.
34 |
35 | Params
36 | ======
37 | - state: the current state of the environment
38 |
39 | Returns
40 | =======
41 | - action: an integer, compatible with the task's action space
42 | """
43 | # get epsilon-greedy action probabilities
44 | policy_s = self.epsilon_greedy_probs(self.Q[state])
45 |
46 | # pick next action A
47 | return np.random.choice(np.arange(self.nA), p=policy_s)
48 |
49 | def step(self, state, action, reward, next_state, done):
50 | """ Update the agent's knowledge, using the most recently sampled tuple.
51 |
52 | Params
53 | ======
54 | - state: the previous state of the environment
55 | - action: the agent's previous choice of action
56 | - reward: last reward received
57 | - next_state: the current state of the environment
58 | - done: whether the episode is complete (True or False)
59 | """
60 | # update Q
61 | self.Q[state][action] = self.update_Q(self.Q[state][action], np.max(self.Q[next_state]), reward, self.alpha,
62 | self.gamma)
63 |
64 | self.i_episode += 1
65 |
--------------------------------------------------------------------------------
/OpenAI/Taxi-v2/monitor.py:
--------------------------------------------------------------------------------
1 | from collections import deque
2 | import sys
3 | import math
4 | import numpy as np
5 |
6 | def interact(env, agent, num_episodes=20000, window=100):
7 | """ Monitor agent's performance.
8 |
9 | Params
10 | ======
11 | - env: instance of OpenAI Gym's Taxi-v2 environment
12 | - agent: instance of class Agent (see Agent.py for details)
13 | - num_episodes: number of episodes of agent-environment interaction
14 | - window: number of episodes to consider when calculating average rewards
15 |
16 | Returns
17 | =======
18 | - avg_rewards: deque containing average rewards
19 | - best_avg_reward: largest value in the avg_rewards deque
20 | """
21 | # initialize average rewards
22 | avg_rewards = deque(maxlen=num_episodes)
23 | # initialize best average reward
24 | best_avg_reward = -math.inf
25 | # initialize monitor for most recent rewards
26 | samp_rewards = deque(maxlen=window)
27 | # for each episode
28 | for i_episode in range(1, num_episodes+1):
29 | # begin the episode
30 | state = env.reset()
31 | # initialize the sampled reward
32 | samp_reward = 0
33 | while True:
34 | # agent selects an action
35 | action = agent.select_action(state)
36 | # agent performs the selected action
37 | next_state, reward, done, _ = env.step(action)
38 | # agent performs internal updates based on sampled experience
39 | agent.step(state, action, reward, next_state, done)
40 | # update the sampled reward
41 | samp_reward += reward
42 | # update the state (s <- s') to next time step
43 | state = next_state
44 | if done:
45 | # save final sampled reward
46 | samp_rewards.append(samp_reward)
47 | break
48 | if (i_episode >= 100):
49 | # get average reward from last 100 episodes
50 | avg_reward = np.mean(samp_rewards)
51 | # append to deque
52 | avg_rewards.append(avg_reward)
53 | # update best average reward
54 | if avg_reward > best_avg_reward:
55 | best_avg_reward = avg_reward
56 | # monitor progress
57 | print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
58 | sys.stdout.flush()
59 | # check if task is solved (according to OpenAI Gym)
60 | if best_avg_reward >= 9.7:
61 | print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
62 | break
63 | if i_episode == num_episodes: print('\n')
64 | return avg_rewards, best_avg_reward
--------------------------------------------------------------------------------
/OpenAI/Taxi-v2/README.md:
--------------------------------------------------------------------------------
1 | # Taxi Problem
2 |
3 | ### Getting Started
4 |
5 | Read the description of the environment in subsection 3.1 of [this paper](https://arxiv.org/pdf/cs/9905014.pdf). You can verify that the description in the paper matches the OpenAI Gym environment by peeking at the code [here](https://github.com/openai/gym/blob/master/gym/envs/toy_text/taxi.py).
6 |
7 |
8 | ### Instructions
9 |
10 | The repository contains three files:
11 | - `agent.py`: Develop your reinforcement learning agent here. This is the only file that you should modify.
12 | - `monitor.py`: The `interact` function tests how well your agent learns from interaction with the environment.
13 | - `main.py`: Run this file in the terminal to check the performance of your agent.
14 |
15 | Begin by running the following command in the terminal:
16 | ```
17 | python main.py
18 | ```
19 |
20 | When you run `main.py`, the agent that you specify in `agent.py` interacts with the environment for 20,000 episodes. The details of the interaction are specified in `monitor.py`, which returns two variables: `avg_rewards` and `best_avg_reward`.
21 | - `avg_rewards` is a deque where `avg_rewards[i]` is the average (undiscounted) return collected by the agent from episodes `i+1` to episode `i+100`, inclusive. So, for instance, `avg_rewards[0]` is the average return collected by the agent over the first 100 episodes.
22 | - `best_avg_reward` is the largest entry in `avg_rewards`. This is the final score that you should use when determining how well your agent performed in the task.
23 |
24 | Your assignment is to modify the `agent.py` file to improve the agent's performance.
25 | - Use the `__init__()` method to define any needed instance variables. Currently, we define the number of actions available to the agent (`nA`) and initialize the action values (`Q`) to an empty dictionary of arrays. Feel free to add more instance variables; for example, you may find it useful to define the value of epsilon if the agent uses an epsilon-greedy policy for selecting actions.
26 | - The `select_action()` method accepts the environment state as input and returns the agent's choice of action. The default code that we have provided randomly selects an action.
27 | - The `step()` method accepts a (`state`, `action`, `reward`, `next_state`) tuple as input, along with the `done` variable, which is `True` if the episode has ended. The default code (which you should certainly change!) increments the action value of the previous state-action pair by 1. You should change this method to use the sampled tuple of experience to update the agent's knowledge of the problem.
28 |
29 | Once you have modified the function, you need only run `python main.py` to test your new agent.
30 |
31 | OpenAI Gym [defines "solving"](https://gym.openai.com/envs/Taxi-v1/) this task as getting an average return of 9.7 over 100 consecutive trials.
32 |
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/settings.map:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/settings.map:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/settings.map:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/config:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/OpenAI/LunarLander-v2/model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | def hidden_init(layer):
8 | fan_in = layer.weight.data.size()[0]
9 | lim = 1. / np.sqrt(fan_in)
10 | return (-lim, lim)
11 |
12 | class Actor(nn.Module):
13 | """Actor (Policy) Model."""
14 |
15 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
16 | """Initialize parameters and build model.
17 | Params
18 | ======
19 | state_size (int): Dimension of each state
20 | action_size (int): Dimension of each action
21 | seed (int): Random seed
22 | fc1_units (int): Number of nodes in first hidden layer
23 | fc2_units (int): Number of nodes in second hidden layer
24 | """
25 | super(Actor, self).__init__()
26 | self.seed = torch.manual_seed(seed)
27 |
28 | self.fc1 = nn.Linear(state_size, fc1_units)
29 | self.fc2 = nn.Linear(fc1_units, fc2_units)
30 | self.fc3 = nn.Linear(fc2_units, action_size)
31 | self.reset_parameters()
32 |
33 | def reset_parameters(self):
34 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
35 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
36 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
37 |
38 | def forward(self, state):
39 | """Build an actor (policy) network that maps states -> actions."""
40 | x = state
41 | x = F.relu(self.fc1(x))
42 | x = F.relu(self.fc2(x))
43 | return torch.tanh(self.fc3(x))
44 |
45 | class Critic(nn.Module):
46 | """Critic (Value) Model."""
47 |
48 | def __init__(self, state_size, action_size, seed, fcs1_units=128, fc2_units=64):
49 | """Initialize parameters and build model.
50 | Params
51 | ======
52 | state_size (int): Dimension of each state
53 | action_size (int): Dimension of each action
54 | seed (int): Random seed
55 | fcs1_units (int): Number of nodes in the first hidden layer
56 | fc2_units (int): Number of nodes in the second hidden layer
57 | fc3_units (int): Number of nodes in the third hidden layer
58 | """
59 | super(Critic, self).__init__()
60 | self.seed = torch.manual_seed(seed)
61 | self.bn0 = nn.BatchNorm1d(state_size)
62 | self.fcs1 = nn.Linear(state_size, fcs1_units)
63 | self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
64 | self.fc3 = nn.Linear(fc2_units, 1)
65 | self.reset_parameters()
66 |
67 | def reset_parameters(self):
68 | self.fcs1.weight.data.uniform_(*hidden_init(self.fcs1))
69 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
70 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
71 |
72 | def forward(self, state, action):
73 | """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
74 | state = self.bn0(state)
75 | xs = F.relu(self.fcs1(state))
76 | x = torch.cat((xs, action), dim=1)
77 | x = F.relu(self.fc2(x))
78 | return self.fc3(x)
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/Model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | def hidden_init(layer):
8 | fan_in = layer.weight.data.size()[0]
9 | lim = 1. / np.sqrt(fan_in)
10 | return (-lim, lim)
11 |
12 |
13 | class Actor(nn.Module):
14 | """Actor (Policy) Model."""
15 |
16 | def __init__(self, state_size, action_size, seed, fc1_units=5, fc2_units=5):
17 | """Initialize parameters and build model.
18 | Params
19 | ======
20 | state_size (int): Dimension of each state
21 | action_size (int): Dimension of each action
22 | seed (int): Random seed
23 | fc1_units (int): Number of nodes in first hidden layer
24 | fc2_units (int): Number of nodes in second hidden layer
25 | """
26 | super(Actor, self).__init__()
27 | self.seed = torch.manual_seed(seed)
28 |
29 | self.fc1 = nn.Linear(state_size, fc1_units)
30 | self.ln1 = nn.LayerNorm(fc1_units)
31 | self.fc2 = nn.Linear(fc1_units, fc2_units)
32 | self.ln2 = nn.LayerNorm(fc2_units)
33 | self.fc3 = nn.Linear(fc2_units, action_size)
34 | self.reset_parameters()
35 |
36 | def reset_parameters(self):
37 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
38 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
39 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
40 |
41 | def forward(self, state):
42 | """Build an actor (policy) network that maps states -> actions."""
43 | x = state
44 | x = self.fc1(x)
45 | x = self.ln1(x)
46 | x = F.relu(x)
47 | x = self.fc2(x)
48 | x = self.ln2(x)
49 | x = F.relu(x)
50 | x = self.fc3(x)
51 | return torch.tanh(x)
52 |
53 |
54 | class Critic(nn.Module):
55 | """Critic (Value) Model."""
56 |
57 | def __init__(self, state_size, action_size, seed, fc1_units=20, fc2_units=10):
58 | """Initialize parameters and build model.
59 | Params
60 | ======
61 | state_size (int): Dimension of each state
62 | action_size (int): Dimension of each action
63 | seed (int): Random seed
64 | fcs1_units (int): Number of nodes in the first hidden layer
65 | fc2_units (int): Number of nodes in the second hidden layer
66 | fc3_units (int): Number of nodes in the third hidden layer
67 | """
68 | super(Critic, self).__init__()
69 | self.seed = torch.manual_seed(seed)
70 | self.fc1 = nn.Linear(state_size, fc1_units)
71 | self.bn1 = nn.BatchNorm1d(fc1_units)
72 | self.fc2 = nn.Linear(fc1_units+action_size, fc2_units)
73 | self.fc3 = nn.Linear(fc2_units, 1)
74 | self.reset_parameters()
75 |
76 | def reset_parameters(self):
77 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
78 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
79 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
80 |
81 | def forward(self, state, action):
82 | """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
83 | xs = self.fc1(state)
84 | xs = self.bn1(xs)
85 | xs = F.leaky_relu(xs)
86 | x = torch.cat((xs, action), dim=1)
87 | x = self.fc2(x)
88 | x = F.leaky_relu(x)
89 | return self.fc3(x)
90 |
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 |
7 | def hidden_init(layer):
8 | fan_in = layer.weight.data.size()[0]
9 | lim = 1. / np.sqrt(fan_in)
10 | return (-lim, lim)
11 |
12 |
13 | class Actor(nn.Module):
14 | """Actor (Policy) Model."""
15 |
16 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
17 | """Initialize parameters and build model.
18 | Params
19 | ======
20 | state_size (int): Dimension of each state
21 | action_size (int): Dimension of each action
22 | seed (int): Random seed
23 | fc1_units (int): Number of nodes in first hidden layer
24 | fc2_units (int): Number of nodes in second hidden layer
25 | """
26 | super(Actor, self).__init__()
27 | self.seed = torch.manual_seed(seed)
28 |
29 | self.fc1 = nn.Linear(state_size, fc1_units)
30 | self.bn1 = nn.BatchNorm1d(fc1_units)
31 | self.fc2 = nn.Linear(fc1_units, fc2_units)
32 | self.bn2 = nn.BatchNorm1d(fc2_units)
33 | self.fc3 = nn.Linear(fc2_units, action_size)
34 | self.bn3 = nn.BatchNorm1d(action_size)
35 | self.softmax = nn.Softmax(dim=1)
36 | self.reset_parameters()
37 |
38 | def reset_parameters(self):
39 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
40 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
41 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
42 |
43 | def forward(self, state):
44 | """Build an actor (policy) network that maps states -> actions."""
45 | x = self.fc1(state)
46 | x = F.relu(x)
47 | x = self.bn1(x)
48 | x = self.fc2(x)
49 | x = F.relu(x)
50 | x = self.bn2(x)
51 | x = self.fc3(x)
52 | x = self.bn3(x)
53 | #return torch.tanh(x)
54 | #return self.softmax(x)
55 |
56 | # return log-probabilities over actions
57 | return F.log_softmax(x, dim=1)
58 |
59 |
60 | class Critic(nn.Module):
61 | """Critic (Value) Model."""
62 |
63 | def __init__(self, state_size, action_size, seed, fc1_units=128, fc2_units=64):
64 | """Initialize parameters and build model.
65 | Params
66 | ======
67 | state_size (int): Dimension of each state
68 | action_size (int): Dimension of each action
69 | seed (int): Random seed
70 | fcs1_units (int): Number of nodes in the first hidden layer
71 | fc2_units (int): Number of nodes in the second hidden layer
72 | fc3_units (int): Number of nodes in the third hidden layer
73 | """
74 | super(Critic, self).__init__()
75 | self.seed = torch.manual_seed(seed)
76 |
77 | self.bn0 = nn.BatchNorm1d(state_size)
78 | self.fc1 = nn.Linear(state_size, fc1_units)
79 | self.bn1 = nn.BatchNorm1d(fc1_units)
80 | self.fc2 = nn.Linear(fc1_units+action_size, fc2_units)
81 | self.bn2 = nn.BatchNorm1d(fc2_units)
82 | self.fc3 = nn.Linear(fc2_units, 1)
83 | self.reset_parameters()
84 |
85 | def reset_parameters(self):
86 | self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
87 | self.fc2.weight.data.uniform_(*hidden_init(self.fc2))
88 | self.fc3.weight.data.uniform_(-3e-3, 3e-3)
89 |
90 | def forward(self, state, action):
91 | """Build a critic (value) network that maps (state, action) pairs -> Q-values."""
92 | state = self.bn0(state)
93 | xs = self.fc1(state)
94 | xs = self.bn1(xs)
95 | xs = F.leaky_relu(xs)
96 | x = torch.cat((xs, action), dim=1)
97 | x = self.fc2(x)
98 | x = self.bn2(x)
99 | x = F.leaky_relu(x)
100 | return self.fc3(x)
101 |
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random as rn
3 | from collections import deque
4 | class Memory(object):
5 | def __init__(self, max_size=2000):
6 | self.max_size = max_size
7 | self.buffer = deque(maxlen=max_size)
8 |
9 | def add(self, experience):
10 | if len(self.buffer) <= self.max_size:
11 | self.buffer.append(experience)
12 | else:
13 | self.buffer[0] = experience
14 |
15 | def sample(self, batch_size):
16 | return [], rn.sample(self.buffer, batch_size), []
17 |
18 | def batch_update(self, indices, td_errors):
19 | pass
20 |
21 | class PrioritizedReplayBuffer(object):
22 | PER_e = 0.01
23 | PER_a = 0.6
24 | PER_b = 0.4
25 |
26 | PER_b_increment_per_sampling = 0.001
27 |
28 | absolute_error_upper = 1.
29 |
30 | def __init__(self, capacity):
31 | self.tree = SumTree(capacity)
32 |
33 | def add(self, experience):
34 | max_priority = np.max(self.tree.tree[-self.tree.capacity:])
35 |
36 | if max_priority == 0:
37 | max_priority = self.absolute_error_upper
38 |
39 | self.tree.add(max_priority, experience)
40 |
41 | def sample(self, n):
42 | memory_b = []
43 | b_idx, b_ISWeights = np.empty((n,), dtype=np.int32), np.empty((n, 1), dtype=np.float32)
44 | priority_segment = self.tree.total_priority / n
45 | self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling])
46 | p_min = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.total_priority
47 | max_weight = (p_min * n) ** (-self.PER_b)
48 |
49 | for i in range(n):
50 | a, b = priority_segment * i, priority_segment * (i + 1)
51 | value = np.random.uniform(a, b)
52 |
53 | index, priority, data = self.tree.get_leaf(value)
54 |
55 | sampling_probabilities = priority / self.tree.total_priority
56 |
57 | b_ISWeights[i, 0] = np.power(n * sampling_probabilities, -self.PER_b)/ max_weight
58 |
59 | b_idx[i]= index
60 |
61 | experience = [data]
62 |
63 | memory_b.append(experience)
64 |
65 | return b_idx, memory_b, b_ISWeights
66 |
67 | def batch_update(self, tree_idx, abs_errors):
68 | abs_errors += self.PER_e
69 | clipped_errors = np.minimum(abs_errors, self.absolute_error_upper)
70 | ps = np.power(clipped_errors, self.PER_a)
71 |
72 | for ti, p in zip(tree_idx, ps):
73 | self.tree.update(ti, p)
74 |
75 | class SumTree(object):
76 | data_pointer = 0
77 |
78 | def __init__(self, capacity):
79 | self.capacity = capacity
80 | self.tree = np.zeros(2 * capacity - 1)
81 | self.data = np.zeros(capacity, dtype=object)
82 |
83 | def add(self, priority, data):
84 | tree_index = self.data_pointer + self.capacity - 1
85 | self.data[self.data_pointer] = data
86 | self.update (tree_index, priority)
87 | self.data_pointer += 1
88 |
89 | if self.data_pointer >= self.capacity:
90 | self.data_pointer = 0
91 |
92 | def update(self, tree_index, priority):
93 | change = priority - self.tree[tree_index]
94 | self.tree[tree_index] = priority
95 |
96 | while tree_index != 0:
97 | tree_index = (tree_index - 1) // 2
98 | self.tree[tree_index] += change
99 |
100 | def get_leaf(self, v):
101 | parent_index = 0
102 |
103 | while True:
104 | left_child_index = 2 * parent_index + 1
105 | right_child_index = left_child_index + 1
106 |
107 | if left_child_index >= len(self.tree):
108 | leaf_index = parent_index
109 | break
110 |
111 | else:
112 | if v <= self.tree[left_child_index]:
113 | parent_index = left_child_index
114 |
115 | else:
116 | v -= self.tree[left_child_index]
117 | parent_index = right_child_index
118 |
119 | data_index = leaf_index - self.capacity + 1
120 |
121 | return leaf_index, self.tree[leaf_index], self.data[data_index]
122 |
123 | @property
124 | def total_priority(self):
125 | return self.tree[0]
126 |
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/multiprocessing_env.py:
--------------------------------------------------------------------------------
1 | # This code is from the OpenAI Baselines vec_env utilities:
2 | #https://github.com/openai/baselines/tree/master/baselines/common/vec_env
3 |
4 | import numpy as np
5 | from multiprocessing import Process, Pipe
6 | import gym
7 | import pybullet
8 | import pybulletgym
9 |
10 | def worker(remote, parent_remote, env_fn_wrapper):
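    # Runs in a child process: builds its own environment from the pickled factory and
    # then serves ('step', 'reset', 'reset_task', 'close', 'get_spaces') commands sent
    # over the pipe until it is told to close. Episodes are auto-reset when done.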
11 | parent_remote.close()
12 | env = env_fn_wrapper.x()
13 | while True:
14 | cmd, data = remote.recv()
15 | if cmd == 'step':
16 | ob, reward, done, info = env.step(data)
17 | if done:
18 | ob = env.reset()
19 | remote.send((ob, reward, done, info))
20 | elif cmd == 'reset':
21 | ob = env.reset()
22 | remote.send(ob)
23 | elif cmd == 'reset_task':
24 | ob = env.reset_task()
25 | remote.send(ob)
26 | elif cmd == 'close':
27 | remote.close()
28 | break
29 | elif cmd == 'get_spaces':
30 | remote.send((env.observation_space, env.action_space))
31 | else:
32 | raise NotImplementedError
33 |
34 | class VecEnv(object):
35 | """
36 | An abstract asynchronous, vectorized environment.
37 | """
38 | def __init__(self, num_envs, observation_space, action_space):
39 | self.num_envs = num_envs
40 | self.observation_space = observation_space
41 | self.action_space = action_space
42 |
43 | def reset(self):
44 | """
45 | Reset all the environments and return an array of
46 | observations, or a tuple of observation arrays.
47 | If step_async is still doing work, that work will
48 | be cancelled and step_wait() should not be called
49 | until step_async() is invoked again.
50 | """
51 | pass
52 |
53 | def step_async(self, actions):
54 | """
55 | Tell all the environments to start taking a step
56 | with the given actions.
57 | Call step_wait() to get the results of the step.
58 | You should not call this if a step_async run is
59 | already pending.
60 | """
61 | pass
62 |
63 | def step_wait(self):
64 | """
65 | Wait for the step taken with step_async().
66 | Returns (obs, rews, dones, infos):
67 | - obs: an array of observations, or a tuple of
68 | arrays of observations.
69 | - rews: an array of rewards
70 | - dones: an array of "episode done" booleans
71 | - infos: a sequence of info objects
72 | """
73 | pass
74 |
75 | def close(self):
76 | """
77 | Clean up the environments' resources.
78 | """
79 | pass
80 |
81 | def step(self, actions):
82 | self.step_async(actions)
83 | return self.step_wait()
84 |
85 |
86 | class CloudpickleWrapper(object):
87 | """
88 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
89 | """
90 | def __init__(self, x):
91 | self.x = x
92 | def __getstate__(self):
93 | import cloudpickle
94 | return cloudpickle.dumps(self.x)
95 | def __setstate__(self, ob):
96 | import pickle
97 | self.x = pickle.loads(ob)
98 |
99 |
100 | class SubprocVecEnv(VecEnv):
101 | def __init__(self, env_fns, spaces=None):
102 | """
103 | envs: list of gym environments to run in subprocesses
104 | """
105 | self.waiting = False
106 | self.closed = False
107 | nenvs = len(env_fns)
108 | self.nenvs = nenvs
109 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
110 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
111 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
112 | for p in self.ps:
113 | p.daemon = True # if the main process crashes, we should not cause things to hang
114 | p.start()
115 | for remote in self.work_remotes:
116 | remote.close()
117 |
118 | self.remotes[0].send(('get_spaces', None))
119 | observation_space, action_space = self.remotes[0].recv()
120 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
121 |
122 | def step_async(self, actions):
123 | for remote, action in zip(self.remotes, actions):
124 | remote.send(('step', action))
125 | self.waiting = True
126 |
127 | def step_wait(self):
128 | results = [remote.recv() for remote in self.remotes]
129 | self.waiting = False
130 | obs, rews, dones, infos = zip(*results)
131 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
132 |
133 | def reset(self):
134 | for remote in self.remotes:
135 | remote.send(('reset', None))
136 | return np.stack([remote.recv() for remote in self.remotes])
137 |
138 | def reset_idx(self, idx):
139 | self.remotes[idx].send(('reset', None))
140 | return self.remotes[idx].recv()
141 |
142 | def reset_task(self):
143 | for remote in self.remotes:
144 | remote.send(('reset_task', None))
145 | return np.stack([remote.recv() for remote in self.remotes])
146 |
147 | def close(self):
148 | if self.closed:
149 | return
150 | if self.waiting:
151 | for remote in self.remotes:
152 | remote.recv()
153 | for remote in self.remotes:
154 | remote.send(('close', None))
155 | for p in self.ps:
156 | p.join()
157 | self.closed = True
158 |
159 | def __len__(self):
160 | return self.nenvs
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/agents/DDQN.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from memory import *
3 | from keras.layers import *
4 | from keras.models import *
5 | from keras.optimizers import *
6 | from keras import backend as K
7 | class DDQNAgent(object):
8 | def __init__(self,
9 | state_size,
10 | action_size,
11 | buffer_size=10000,
12 | batch_size=32,
13 | gamma=0.99,
14 | epsilon_start=1.0,
15 | epsilon_min=0.1,
16 | epsilon_steps_to_min=1000,
17 | tau=0.1,
18 | mode='QNetwork',
19 | use_PER=True,
20 | pre_trained=None):
21 |
22 | self.state_size = state_size
23 | self.action_size = action_size
24 |
25 |
26 | self.batch_size = batch_size
27 | self.gamma = gamma
28 | self.epsilon = epsilon_start
29 | self.epsilon_min = epsilon_min
30 | self.epsilon_step = (self.epsilon - self.epsilon_min) / epsilon_steps_to_min
31 | self.tau = tau
32 |
33 | self.model = self.build_model(mode, pre_trained)
34 | self.target_model = self.build_model(mode, pre_trained)
35 | self.hard_update_target_network()
36 |
37 | self.use_PER = use_PER
38 |
39 | if self.use_PER:
40 | self.replay_buffer = PrioritizedReplayBuffer(capacity=buffer_size)
41 | else:
42 | self.replay_buffer = Memory(max_size=buffer_size)
43 |
44 | def build_model(self, mode, pre_trained):
45 | model = Sequential()
46 | model.add(Dense(64, input_dim=self.state_size, activation='relu'))
47 | model.add(Dense(64, activation='relu'))
48 |
49 | if mode == "QNetwork":
50 | model.add(Dense(self.action_size, activation='linear'))
51 |
52 | if mode == "DuelingDQN":
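            # Dueling head: the first output unit is the state value V(s) and the
            # remaining action_size units are advantages A(s, a); the Lambda layer
            # recombines them as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).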
53 | model.add(Dense(self.action_size + 1, activation='linear'))
54 | model.add(Lambda(lambda i: K.expand_dims(i[:,0],-1) + i[:,1:] - K.mean(i[:,1:], keepdims=True),
55 | output_shape=(self.action_size,)))
56 |
57 | if pre_trained:
58 | model = load_model(pre_trained)
59 |
60 | model.compile(optimizer=Adam(lr=0.001), loss='mse')
61 | return model
62 |
63 | def hard_update_target_network(self):
64 | pars = self.model.get_weights()
65 | self.target_model.set_weights(pars)
66 |
67 | def soft_update_target_network(self):
68 | pars_behavior = self.model.get_weights()
69 | pars_target = self.target_model.get_weights()
70 |
71 | ctr = 0
72 | for par_behavior,par_target in zip(pars_behavior,pars_target):
73 | par_target = par_target*(1-self.tau) + par_behavior*self.tau
74 | pars_target[ctr] = par_target
75 | ctr += 1
76 |
77 | self.target_model.set_weights(pars_target)
78 |
79 | def remember(self, state, action, reward, next_state, done):
80 | self.replay_buffer.add((state, action, reward, next_state, done))
81 |
82 | def preprocess(self, state):
83 | return np.reshape(state, [1, self.state_size])
84 |
85 | def act(self, state):
86 | # Update exploration rate
87 | if self.epsilon > self.epsilon_min:
88 | self.epsilon -= self.epsilon_step
89 |
90 | # Choose Action
91 | if np.random.rand() <= self.epsilon:
92 | action = np.random.choice(self.action_size)
93 | else:
94 | Qs = self.model.predict(state)[0]
95 | action = np.argmax(Qs)
96 |
97 | return action
98 |
99 | def train(self):
100 | indices, mini_batch, importance = self.replay_buffer.sample(self.batch_size)
101 |
102 | states = []
103 | actions = []
104 | rewards = []
105 | next_states = []
106 | dones = []
107 |
108 | Q_wants = []
109 | td_errors = np.zeros(self.batch_size)
110 |
111 | for i in range(len(mini_batch)):
112 | if not self.use_PER:
113 | state, action, reward, next_state, done = mini_batch[i]
114 | else:
115 | state = mini_batch[i][0][0]
116 | action = mini_batch[i][0][1]
117 | reward = mini_batch[i][0][2]
118 | next_state = mini_batch[i][0][3]
119 | done = mini_batch[i][0][4]
120 |
121 | states.append(state)
122 | actions.append(action)
123 | rewards.append(reward)
124 | next_states.append(next_state)
125 | dones.append(done)
126 |
127 | states_tensor = np.reshape(states,(self.batch_size,len(states[0])))
128 | Q_wants_pred = self.model.predict(states_tensor)
129 |
130 | next_states_tensor = np.reshape(next_states,(self.batch_size,len(next_states[0])))
131 | Q_next_state_vecs = self.model.predict(next_states_tensor)
132 | Q_target_next_state_vecs = self.target_model.predict(next_states_tensor)
133 |
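        # Double DQN target: the online network picks the greedy next action,
        # while the target network supplies that action's Q-value, which reduces
        # the overestimation bias of vanilla DQN.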
134 | for i in range(len(mini_batch)):
135 | action = actions[i]
136 | reward = rewards[i]
137 | done = dones[i]
138 |
139 | Q_want = Q_wants_pred[i]
140 | Q_want_old = Q_want[action]
141 |
142 | if done:
143 | Q_want[action] = reward
144 | else:
145 | Q_next_state_vec = Q_next_state_vecs[i]
146 | action_max = np.argmax(Q_next_state_vec)
147 |
148 | Q_target_next_state_vec = Q_target_next_state_vecs[i]
149 | Q_target_next_state_max = Q_target_next_state_vec[action_max]
150 |
151 | Q_want[action] = reward + self.gamma*Q_target_next_state_max
152 |
153 |
154 | Q_wants.append(Q_want)
155 | td_errors[i] = abs(Q_want_old - Q_want[action])
156 |
157 | states = np.array(states)
158 | Q_wants = np.array(Q_wants)
159 | self.model.fit(states, Q_wants, verbose=False, epochs=1)
160 |
161 | # update replay buffer
162 | self.replay_buffer.batch_update(indices, np.array(td_errors))
163 |
164 | def save(self, file='model.h5'):
165 | print('Save model...')
166 | self.model.save(file)
167 |
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v2/Model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 |
6 | # Hyperparameters
7 | gamma = 0.99 # discount for future rewards
8 | batch_size = 100 # num of transitions sampled from replay buffer
9 | polyak = 0.995 # target policy update parameter (1-tau)
10 | policy_noise = 0.2 # target policy smoothing noise
11 | noise_clip = 0.5
12 | policy_delay = 2 # delayed policy updates parameter
13 |
14 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
15 |
16 | class Actor(nn.Module):
17 | def __init__(self, state_dim, action_dim, max_action):
18 | super(Actor, self).__init__()
19 |
20 | self.l1 = nn.Linear(state_dim, 400)
21 | self.l2 = nn.Linear(400, 300)
22 | self.l3 = nn.Linear(300, action_dim)
23 |
24 | self.max_action = max_action
25 |
26 | def forward(self, state):
27 | a = F.relu(self.l1(state))
28 | a = F.relu(self.l2(a))
29 | a = torch.tanh(self.l3(a)) * self.max_action
30 | return a
31 |
32 | class Critic(nn.Module):
33 | def __init__(self, state_dim, action_dim):
34 | super(Critic, self).__init__()
35 |
36 | self.l1 = nn.Linear(state_dim + action_dim, 400)
37 | self.l2 = nn.Linear(400, 300)
38 | self.l3 = nn.Linear(300, 1)
39 |
40 | def forward(self, state, action):
41 | state_action = torch.cat([state, action], 1)
42 |
43 | q = F.relu(self.l1(state_action))
44 | q = F.relu(self.l2(q))
45 | q = self.l3(q)
46 | return q
47 |
48 | class TD3:
49 | def __init__(self, state_dim, action_dim, max_action):
50 |
51 | self.actor = Actor(state_dim, action_dim, max_action).to(device)
52 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
53 | self.actor_target.load_state_dict(self.actor.state_dict())
54 | self.actor_optimizer = optim.Adam(self.actor.parameters())
55 |
56 | self.critic_1 = Critic(state_dim, action_dim).to(device)
57 | self.critic_1_target = Critic(state_dim, action_dim).to(device)
58 | self.critic_1_target.load_state_dict(self.critic_1.state_dict())
59 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters())
60 |
61 | self.critic_2 = Critic(state_dim, action_dim).to(device)
62 | self.critic_2_target = Critic(state_dim, action_dim).to(device)
63 | self.critic_2_target.load_state_dict(self.critic_2.state_dict())
64 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters())
65 |
66 | self.max_action = max_action
67 |
68 | def select_action(self, state):
69 | state = torch.FloatTensor(state.reshape(1, -1)).to(device)
70 | return self.actor(state).cpu().data.numpy().flatten()
71 |
72 | def update(self, replay_buffer, n_iter):
73 |
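        # TD3 update: clipped double-Q targets (min of the two target critics),
        # target-policy smoothing via clipped Gaussian noise on the target action,
        # and delayed actor/target updates with Polyak averaging every policy_delay steps.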
74 | for i in range(n_iter):
75 | # Sample a batch of transitions from replay buffer:
76 | state, action_, reward, next_state, done = replay_buffer.sample(batch_size)
77 | state = torch.FloatTensor(state).to(device)
78 | action = torch.FloatTensor(action_).to(device)
79 | reward = torch.FloatTensor(reward).reshape((batch_size,1)).to(device)
80 | next_state = torch.FloatTensor(next_state).to(device)
81 | done = torch.FloatTensor(done).reshape((batch_size,1)).to(device)
82 |
83 | # Select next action according to target policy:
84 | noise = torch.FloatTensor(action_).data.normal_(0, policy_noise).to(device)
85 | noise = noise.clamp(-noise_clip, noise_clip)
86 | next_action = (self.actor_target(next_state) + noise)
87 | next_action = next_action.clamp(-self.max_action, self.max_action)
88 |
89 | # Compute target Q-value:
90 | target_Q1 = self.critic_1_target(next_state, next_action)
91 | target_Q2 = self.critic_2_target(next_state, next_action)
92 | target_Q = torch.min(target_Q1, target_Q2)
93 | target_Q = reward + ((1-done) * gamma * target_Q).detach()
94 |
95 | # Optimize Critic 1:
96 | current_Q1 = self.critic_1(state, action)
97 | loss_Q1 = F.mse_loss(current_Q1, target_Q)
98 | self.critic_1_optimizer.zero_grad()
99 | loss_Q1.backward()
100 | self.critic_1_optimizer.step()
101 |
102 | # Optimize Critic 2:
103 | current_Q2 = self.critic_2(state, action)
104 | loss_Q2 = F.mse_loss(current_Q2, target_Q)
105 | self.critic_2_optimizer.zero_grad()
106 | loss_Q2.backward()
107 | self.critic_2_optimizer.step()
108 |
109 | # Delayed policy updates:
110 | if i % policy_delay == 0:
111 | # Compute actor loss:
112 | actor_loss = -self.critic_1(state, self.actor(state)).mean()
113 |
114 | # Optimize the actor
115 | self.actor_optimizer.zero_grad()
116 | actor_loss.backward()
117 | self.actor_optimizer.step()
118 |
119 | # Polyak averaging update:
120 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
121 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
122 |
123 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()):
124 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
125 |
126 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()):
127 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
128 |
129 |
130 | def save(self, directory, name):
131 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, name))
132 | torch.save(self.actor_target.state_dict(), '%s/%s_actor_target.pth' % (directory, name))
133 |
134 | torch.save(self.critic_1.state_dict(), '%s/%s_crtic_1.pth' % (directory, name))
135 | torch.save(self.critic_1_target.state_dict(), '%s/%s_critic_1_target.pth' % (directory, name))
136 |
137 | torch.save(self.critic_2.state_dict(), '%s/%s_crtic_2.pth' % (directory, name))
138 | torch.save(self.critic_2_target.state_dict(), '%s/%s_critic_2_target.pth' % (directory, name))
139 |
140 | def load(self, directory, name):
141 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage))
142 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
143 |
144 | self.critic_1.load_state_dict(torch.load('%s/%s_crtic_1.pth' % (directory, name), map_location=lambda storage, loc: storage))
145 | self.critic_1_target.load_state_dict(torch.load('%s/%s_critic_1_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
146 |
147 | self.critic_2.load_state_dict(torch.load('%s/%s_crtic_2.pth' % (directory, name), map_location=lambda storage, loc: storage))
148 | self.critic_2_target.load_state_dict(torch.load('%s/%s_critic_2_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
149 |
150 |
151 | def load_actor(self, directory, name):
152 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage))
153 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
154 |
--------------------------------------------------------------------------------
/OpenAI/BipedalWalker-v3/Agent.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import torch.optim as optim
5 | import numpy as np
6 |
7 | # Hyperparameters
8 | gamma = 0.99 # discount for future rewards
9 | batch_size = 100 # num of transitions sampled from replay buffer
10 | polyak = 0.995 # target policy update parameter (1-tau)
11 | policy_noise = 0.2 # target policy smoothing noise
12 | noise_clip = 0.5
13 | exploration_noise = 0.1
14 |
15 | policy_delay = 2 # delayed policy updates parameter
16 | LR_ACTOR = 0.001
17 | LR_CRITIC = 0.001
18 |
19 | WEIGHT_DECAY = 0.0
20 |
21 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
22 |
23 | class Actor(nn.Module):
24 | def __init__(self, state_dim, action_dim, max_action):
25 | super(Actor, self).__init__()
26 |
27 | self.l1 = nn.Linear(state_dim, 400)
28 | self.l2 = nn.Linear(400, 300)
29 | self.l3 = nn.Linear(300, action_dim)
30 |
31 | self.max_action = max_action
32 |
33 | def forward(self, state):
34 | a = F.relu(self.l1(state))
35 | a = F.relu(self.l2(a))
36 | a = torch.tanh(self.l3(a)) * self.max_action
37 | return a
38 |
39 | class Critic(nn.Module):
40 | def __init__(self, state_dim, action_dim):
41 | super(Critic, self).__init__()
42 |
43 | self.l1 = nn.Linear(state_dim + action_dim, 400)
44 | self.l2 = nn.Linear(400, 300)
45 | self.l3 = nn.Linear(300, 1)
46 |
47 | def forward(self, state, action):
48 | state_action = torch.cat([state, action], 1)
49 |
50 | q = F.relu(self.l1(state_action))
51 | q = F.relu(self.l2(q))
52 | q = self.l3(q)
53 | return q
54 |
55 | class TD3:
56 | def __init__(self, env):
57 | self.env = env
58 |
59 | state_dim = env.observation_space.shape[0]
60 | action_dim = env.action_space.shape[0]
61 | max_action = float(env.action_space.high[0])
62 |
63 | self.actor = Actor(state_dim, action_dim, max_action).to(device)
64 | self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
65 | self.actor_target.load_state_dict(self.actor.state_dict())
66 | self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)
67 |
68 | self.critic_1 = Critic(state_dim, action_dim).to(device)
69 | self.critic_1_target = Critic(state_dim, action_dim).to(device)
70 | self.critic_1_target.load_state_dict(self.critic_1.state_dict())
71 | self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
72 |
73 | self.critic_2 = Critic(state_dim, action_dim).to(device)
74 | self.critic_2_target = Critic(state_dim, action_dim).to(device)
75 | self.critic_2_target.load_state_dict(self.critic_2.state_dict())
76 | self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
77 |
78 | self.max_action = max_action
79 |
80 | def select_action(self, state):
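        # Deterministic action from the actor, perturbed with Gaussian exploration
        # noise and clipped to the environment's action bounds.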
81 | state = torch.FloatTensor(state.reshape(1, -1)).to(device)
82 | action = self.actor(state).cpu().data.numpy().flatten()
83 |
84 | if exploration_noise != 0:
85 | action = (action + np.random.normal(0, exploration_noise, size=self.env.action_space.shape[0]))
86 |
87 | return action.clip(self.env.action_space.low, self.env.action_space.high)
88 |
89 | def update(self, replay_buffer, n_iter):
90 | for i in range(n_iter):
91 | state, action, reward, next_state, done = replay_buffer.sample(batch_size)
92 |
93 |             state = torch.FloatTensor(state).to(device)
94 |             action = torch.FloatTensor(action).to(device)
95 |             reward = torch.FloatTensor(reward).reshape((batch_size,1)).to(device)
96 |             next_state = torch.FloatTensor(next_state).to(device)
97 |             done = torch.FloatTensor(done).reshape((batch_size,1)).to(device)
98 |
99 | # Select next action according to target policy:
100 | noise = torch.empty_like(action).data.normal_(0, policy_noise).to(device)
101 | noise = noise.clamp(-noise_clip, noise_clip)
102 | next_action = (self.actor_target(next_state) + noise)
103 | next_action = next_action.clamp(-self.max_action, self.max_action)
104 |
105 | # Compute target Q-value:
106 | target_Q1 = self.critic_1_target(next_state, next_action)
107 | target_Q2 = self.critic_2_target(next_state, next_action)
108 | target_Q = torch.min(target_Q1, target_Q2)
109 | target_Q = reward + ((1-done) * gamma * target_Q).detach()
110 |
111 | # Optimize Critic 1:
112 | current_Q1 = self.critic_1(state, action)
113 | loss_Q1 = F.mse_loss(current_Q1, target_Q)
114 |
115 | self.critic_1_optimizer.zero_grad()
116 | loss_Q1.backward()
117 | self.critic_1_optimizer.step()
118 |
119 | # Optimize Critic 2:
120 | current_Q2 = self.critic_2(state, action)
121 | loss_Q2 = F.mse_loss(current_Q2, target_Q)
122 |
123 | self.critic_2_optimizer.zero_grad()
124 | loss_Q2.backward()
125 | self.critic_2_optimizer.step()
126 |
127 | # Delayed policy updates:
128 | if i % policy_delay == 0:
129 | # Compute actor loss:
130 | actor_loss = -self.critic_1(state, self.actor(state)).mean()
131 |
132 | # Optimize the actor
133 | self.actor_optimizer.zero_grad()
134 | actor_loss.backward()
135 | self.actor_optimizer.step()
136 |
137 | # Polyak averaging update:
138 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
139 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
140 |
141 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()):
142 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
143 |
144 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()):
145 | target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
146 |
147 |
148 | def save(self, directory, name):
149 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, name))
150 | torch.save(self.actor_target.state_dict(), '%s/%s_actor_target.pth' % (directory, name))
151 |
152 | torch.save(self.critic_1.state_dict(), '%s/%s_crtic_1.pth' % (directory, name))
153 | torch.save(self.critic_1_target.state_dict(), '%s/%s_critic_1_target.pth' % (directory, name))
154 |
155 | torch.save(self.critic_2.state_dict(), '%s/%s_crtic_2.pth' % (directory, name))
156 | torch.save(self.critic_2_target.state_dict(), '%s/%s_critic_2_target.pth' % (directory, name))
157 |
158 | def load(self, directory, name):
159 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage))
160 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
161 |
162 | self.critic_1.load_state_dict(torch.load('%s/%s_crtic_1.pth' % (directory, name), map_location=lambda storage, loc: storage))
163 | self.critic_1_target.load_state_dict(torch.load('%s/%s_critic_1_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
164 |
165 | self.critic_2.load_state_dict(torch.load('%s/%s_crtic_2.pth' % (directory, name), map_location=lambda storage, loc: storage))
166 | self.critic_2_target.load_state_dict(torch.load('%s/%s_critic_2_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
167 |
168 |
169 | def load_actor(self, directory, name):
170 | self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, name), map_location=lambda storage, loc: storage))
171 | self.actor_target.load_state_dict(torch.load('%s/%s_actor_target.pth' % (directory, name), map_location=lambda storage, loc: storage))
172 |
--------------------------------------------------------------------------------
/OpenAI/MountainCarContinuous-v0/Agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import copy
4 | from collections import namedtuple, deque
5 |
6 | from Model import Actor, Critic
7 | from Noise import OUNoise
8 |
9 | import torch
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 |
13 | BUFFER_SIZE = int(1e6) # replay buffer size
14 | BATCH_SIZE = 64 # minibatch size
15 | GAMMA = 0.99 # discount factor
16 | TAU = 1e-3 # for soft update of target parameters
17 | LR_ACTOR = 1e-2 # learning rate of the actor
18 | LR_CRITIC = 5e-3 # learning rate of the critic
19 | WEIGHT_DECAY = 0 # L2 weight decay
20 | EPSILON_MAX = 1.0
21 | EPSILON_MIN = 0.1
22 | EPSILON_DECAY = 1e-6
23 | LEARN_START = 20000
24 | UPDATE_EVERY = 1
25 | UPDATES_PER_STEP = 1
26 |
27 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
28 |
29 | class Agent():
30 | """Interacts with and learns from the environment."""
31 |
32 | def __init__(self, state_size, action_size, random_seed):
33 | """Initialize an Agent object.
34 |
35 | Params
36 | ======
37 | state_size (int): dimension of each state
38 | action_size (int): dimension of each action
39 | random_seed (int): random seed
40 | """
41 | self.state_size = state_size
42 | self.action_size = action_size
43 | self.seed = random.seed(random_seed)
44 | self.epsilon = EPSILON_MAX
45 |
46 | # Actor Network (w/ Target Network)
47 | self.actor_local = Actor(state_size, action_size, random_seed).to(device)
48 | self.actor_target = Actor(state_size, action_size, random_seed).to(device)
49 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
50 |
51 | # Critic Network (w/ Target Network)
52 | self.critic_local = Critic(state_size, action_size, random_seed).to(device)
53 | self.critic_target = Critic(state_size, action_size, random_seed).to(device)
54 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
55 |
56 | # Noise process
57 | self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)
58 |
59 | # Replay memory
60 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
61 |
62 | # Make sure target is with the same weight as the source
63 | self.hard_update(self.actor_target, self.actor_local)
64 | self.hard_update(self.critic_target, self.critic_local)
65 |
66 | self.t_step = 0
67 |
68 | def step(self, state, action, reward, next_state, done, timestep):
69 | """Save experience in replay memory, and use random sample from buffer to learn."""
70 | # Save experience / reward
71 | self.memory.add(state, action, reward, next_state, done)
72 |
73 | if len(self.memory) > LEARN_START:
74 | # Learn every UPDATE_EVERY time steps.
75 | self.t_step = (self.t_step + 1) % UPDATE_EVERY
76 | if self.t_step == 0:
77 | # Learn, if enough samples are available in memory
78 | if len(self.memory) > BATCH_SIZE:
79 | for _ in range(UPDATES_PER_STEP):
80 | experiences = self.memory.sample()
81 | self.learn(experiences, GAMMA)
82 |
83 | def act(self, state, add_noise=True):
84 | """Returns actions for given state as per current policy."""
85 |
86 | state = torch.from_numpy(state).float().to(device)
87 |
88 | self.actor_local.eval()
89 | with torch.no_grad():
90 | action = self.actor_local(state).cpu().data.numpy()
91 |
92 | self.actor_local.train()
93 |
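        # Exploration: Ornstein-Uhlenbeck noise scaled by epsilon, which is annealed
        # towards EPSILON_MIN in learn(), so exploration decays as training progresses.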
94 | if add_noise:
95 | action += self.epsilon * self.noise.sample()
96 |
97 | return np.clip(action, -1, 1)
98 |
99 | def reset(self):
100 | self.noise.reset()
101 |
102 | def learn(self, experiences, gamma):
103 | """Update policy and value parameters using given batch of experience tuples.
104 |         Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
105 | where:
106 | actor_target(state) -> action
107 | critic_target(state, action) -> Q-value
108 |
109 | Params
110 | ======
111 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
112 | gamma (float): discount factor
113 | """
114 | states, actions, rewards, next_states, dones = experiences
115 |
116 | # ---------------------------- update critic ---------------------------- #
117 | # Get predicted next-state actions and Q values from target models
118 | actions_next = self.actor_target(next_states)
119 | Q_targets_next = self.critic_target(next_states, actions_next)
120 |
121 | # Compute Q targets for current states (y_i)
122 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
123 |
124 | # Compute critic loss
125 | Q_expected = self.critic_local(states, actions)
126 | critic_loss = F.mse_loss(Q_expected, Q_targets)
127 |
128 | # Minimize the loss
129 | self.critic_optimizer.zero_grad()
130 | critic_loss.backward()
131 | #torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
132 | self.critic_optimizer.step()
133 |
134 | # ---------------------------- update actor ---------------------------- #
135 | # Compute actor loss
136 | actions_pred = self.actor_local(states)
137 | actor_loss = -self.critic_local(states, actions_pred).mean()
138 |
139 | # Minimize the loss
140 | self.actor_optimizer.zero_grad()
141 | actor_loss.backward()
142 | self.actor_optimizer.step()
143 |
144 | # ----------------------- update target networks ----------------------- #
145 | self.soft_update(self.critic_local, self.critic_target, TAU)
146 | self.soft_update(self.actor_local, self.actor_target, TAU)
147 |
148 | # ---------------------------- update noise ---------------------------- #
149 | if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
150 | self.epsilon -= EPSILON_DECAY
151 | else:
152 | self.epsilon = EPSILON_MIN
153 |
154 | self.noise.reset()
155 |
156 | def soft_update(self, local_model, target_model, tau):
157 | """Soft update model parameters.
158 |         θ_target = τ*θ_local + (1 - τ)*θ_target
159 |
160 | Params
161 | ======
162 | local_model: PyTorch model (weights will be copied from)
163 | target_model: PyTorch model (weights will be copied to)
164 | tau (float): interpolation parameter
165 | """
166 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
167 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
168 |
169 | def hard_update(self, target, source):
170 | for target_param, param in zip(target.parameters(), source.parameters()):
171 | target_param.data.copy_(param.data)
172 |
173 | class ReplayBuffer:
174 | """Fixed-size buffer to store experience tuples."""
175 |
176 | def __init__(self, action_size, buffer_size, batch_size, seed):
177 | """Initialize a ReplayBuffer object.
178 | Params
179 | ======
180 | buffer_size (int): maximum size of buffer
181 | batch_size (int): size of each training batch
182 | """
183 | self.action_size = action_size
184 | self.buffer_size = buffer_size
185 | self.batch_size = batch_size
186 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
187 | self.seed = random.seed(seed)
188 |
189 | self.reset()
190 |
191 | def add(self, state, action, reward, next_state, done):
192 | """Add a new experience to memory."""
193 | e = self.experience(state, action, reward, next_state, done)
194 | self.memory.append(e)
195 |
196 | def reset(self):
197 | self.memory = deque(maxlen=self.buffer_size)
198 |
199 | def sample(self):
200 | """Randomly sample a batch of experiences from memory."""
201 | experiences = random.sample(self.memory, k=self.batch_size)
202 |
203 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
204 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
205 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
206 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
207 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
208 |
209 | return states, actions, rewards, next_states, dones
210 |
211 | def __len__(self):
212 | """Return the current size of internal memory."""
213 | return len(self.memory)
--------------------------------------------------------------------------------
/OpenAI/LunarLander-v2/ddpg_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import copy
4 | from collections import namedtuple, deque
5 |
6 | from model import Actor, Critic
7 |
8 | import torch
9 | import torch.nn.functional as F
10 | import torch.optim as optim
11 |
12 | BUFFER_SIZE = int(1e6) # replay buffer size
13 | BATCH_SIZE = 64 # minibatch size
14 | GAMMA = 0.99 # discount factor
15 | TAU = 1e-3 # for soft update of target parameters
16 | LR_ACTOR = 1e-4 # learning rate of the actor
17 | LR_CRITIC = 1e-3 # learning rate of the critic
18 | WEIGHT_DECAY = 0.0001 # L2 weight decay
19 | EPSILON = 1.0
20 | EPSILON_MIN = 0.1
21 | EPSILON_DECAY = 1e-6
22 |
23 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
24 |
25 | class Agent():
26 | """Interacts with and learns from the environment."""
27 |
28 | def __init__(self, state_size, action_size, random_seed):
29 | """Initialize an Agent object.
30 |
31 | Params
32 | ======
33 | state_size (int): dimension of each state
34 | action_size (int): dimension of each action
35 | random_seed (int): random seed
36 | """
37 | self.state_size = state_size
38 | self.action_size = action_size
39 | self.seed = random.seed(random_seed)
40 | self.epsilon = EPSILON
41 |
42 | # Actor Network (w/ Target Network)
43 | self.actor_local = Actor(state_size, action_size, random_seed).to(device)
44 | self.actor_target = Actor(state_size, action_size, random_seed).to(device)
45 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
46 |
47 | # Critic Network (w/ Target Network)
48 | self.critic_local = Critic(state_size, action_size, random_seed).to(device)
49 | self.critic_target = Critic(state_size, action_size, random_seed).to(device)
50 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
51 |
52 | # Noise process
53 | self.noise = OUNoise(action_size, random_seed)
54 |
55 | # Replay memory
56 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
57 |
58 | # Make sure target is with the same weight as the source
59 | self.hard_update(self.actor_target, self.actor_local)
60 | self.hard_update(self.critic_target, self.critic_local)
61 |
62 | def step(self, state, action, reward, next_state, done, timestep):
63 | """Save experience in replay memory, and use random sample from buffer to learn."""
64 | # Save experience / reward
65 | self.memory.add(state, action, reward, next_state, done)
66 |
67 | # Learn, if enough samples are available in memory
68 | if len(self.memory) > BATCH_SIZE and timestep % 20 == 0:
69 | for _ in range(10):
70 | experiences = self.memory.sample()
71 | self.learn(experiences, GAMMA)
72 |
73 | def act(self, state, add_noise=True):
74 | """Returns actions for given state as per current policy."""
75 |
76 | state = torch.from_numpy(state).float().to(device)
77 |
78 | self.actor_local.eval()
79 | with torch.no_grad():
80 | action = self.actor_local(state).cpu().data.numpy()
81 | self.actor_local.train()
82 |
83 | if add_noise:
84 | action += self.epsilon * self.noise.sample()
85 |
86 | return action
87 |
88 | def reset(self):
89 | self.noise.reset()
90 |
91 | def learn(self, experiences, gamma):
92 | """Update policy and value parameters using given batch of experience tuples.
93 |         Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
94 | where:
95 | actor_target(state) -> action
96 | critic_target(state, action) -> Q-value
97 |
98 | Params
99 | ======
100 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
101 | gamma (float): discount factor
102 | """
103 | states, actions, rewards, next_states, dones = experiences
104 |
105 | # ---------------------------- update critic ---------------------------- #
106 | # Get predicted next-state actions and Q values from target models
107 | actions_next = self.actor_target(next_states)
108 | Q_targets_next = self.critic_target(next_states, actions_next)
109 |
110 | # Compute Q targets for current states (y_i)
111 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
112 |
113 | # Compute critic loss
114 | Q_expected = self.critic_local(states, actions)
115 | critic_loss = F.mse_loss(Q_expected, Q_targets)
116 |
117 | # Minimize the loss
118 | self.critic_optimizer.zero_grad()
119 | critic_loss.backward()
120 | torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
121 | self.critic_optimizer.step()
122 |
123 | # ---------------------------- update actor ---------------------------- #
124 | # Compute actor loss
125 | actions_pred = self.actor_local(states)
126 | actor_loss = -self.critic_local(states, actions_pred).mean()
127 |
128 | # Minimize the loss
129 | self.actor_optimizer.zero_grad()
130 | actor_loss.backward()
131 | self.actor_optimizer.step()
132 |
133 | # ----------------------- update target networks ----------------------- #
134 | self.soft_update(self.critic_local, self.critic_target, TAU)
135 | self.soft_update(self.actor_local, self.actor_target, TAU)
136 |
137 | # ---------------------------- update noise ---------------------------- #
138 | if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
139 | self.epsilon -= EPSILON_DECAY
140 |
141 | self.noise.reset()
142 |
143 | def soft_update(self, local_model, target_model, tau):
144 | """Soft update model parameters.
145 |         θ_target = τ*θ_local + (1 - τ)*θ_target
146 |
147 | Params
148 | ======
149 | local_model: PyTorch model (weights will be copied from)
150 | target_model: PyTorch model (weights will be copied to)
151 | tau (float): interpolation parameter
152 | """
153 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
154 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
155 |
156 | def hard_update(self, target, source):
157 | for target_param, param in zip(target.parameters(), source.parameters()):
158 | target_param.data.copy_(param.data)
159 |
160 | class OUNoise:
161 | """Ornstein-Uhlenbeck process."""
162 |
163 | def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.3):
164 | """Initialize parameters and noise process."""
165 | self.mu = mu * np.ones(size)
166 | self.theta = theta
167 | self.sigma = sigma
168 | self.seed = random.seed(seed)
169 | self.reset()
170 |
171 | def reset(self):
172 | """Reset the internal state (= noise) to mean (mu)."""
173 | self.state = copy.copy(self.mu)
174 |
175 | def sample(self):
176 | """Update internal state and return it as a noise sample."""
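        # OU update: dx = theta * (mu - x) + sigma * noise. Note that random.random()
        # draws uniform noise on [0, 1), which biases the process slightly positive;
        # a standard normal draw is the more common choice for this term.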
177 | x = self.state
178 | dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for i in range(len(x))])
179 | self.state = x + dx
180 | return self.state
181 |
182 | class ReplayBuffer:
183 | """Fixed-size buffer to store experience tuples."""
184 |
185 | def __init__(self, action_size, buffer_size, batch_size, seed):
186 | """Initialize a ReplayBuffer object.
187 | Params
188 | ======
189 | buffer_size (int): maximum size of buffer
190 | batch_size (int): size of each training batch
191 | """
192 | self.action_size = action_size
193 | self.memory = deque(maxlen=buffer_size) # internal memory (deque)
194 | self.batch_size = batch_size
195 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
196 | self.seed = random.seed(seed)
197 |
198 | def add(self, state, action, reward, next_state, done):
199 | """Add a new experience to memory."""
200 | e = self.experience(state, action, reward, next_state, done)
201 | self.memory.append(e)
202 |
203 | def sample(self):
204 | """Randomly sample a batch of experiences from memory."""
205 | experiences = random.sample(self.memory, k=self.batch_size)
206 |
207 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
208 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
209 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
210 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
211 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
212 |
213 | return (states, actions, rewards, next_states, dones)
214 |
215 | def __len__(self):
216 | """Return the current size of internal memory."""
217 | return len(self.memory)
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import random
3 | import copy
4 | from collections import namedtuple, deque
5 |
6 | from Model import Actor, Critic
7 | from Noise import OUNoise
8 |
9 | import torch
10 | import torch.nn.functional as F
11 | import torch.optim as optim
12 |
13 | BUFFER_SIZE = int(1e6) # replay buffer size
14 | BATCH_SIZE = 1024 # minibatch size
15 | GAMMA = 0.99 # discount factor
16 | TAU = 1e-3 # for soft update of target parameters
17 | LR_ACTOR = 1e-4 # learning rate of the actor
18 | LR_CRITIC = 1e-3 # learning rate of the critic
19 | WEIGHT_DECAY = 0 # L2 weight decay
20 | EPSILON_MAX = 1.0
21 | EPSILON_MIN = 0.1
22 | EPSILON_DECAY = 0.995
23 | LEARN_START = 0
24 | UPDATE_EVERY = 1
25 | UPDATES_PER_STEP = 1
26 |
27 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
28 |
29 | class Agent():
30 | """Interacts with and learns from the environment."""
31 |
32 | def __init__(self, state_size, action_size, num_agents, random_seed):
33 | """Initialize an Agent object.
34 |
35 | Params
36 | ======
37 | state_size (int): dimension of each state
38 | action_size (int): dimension of each action
39 | num_agents (int): number of agents
40 | random_seed (int): random seed
41 | """
42 | self.state_size = state_size
43 | self.action_size = action_size
44 | self.num_agents = num_agents
45 | self.seed = random.seed(random_seed)
46 | self.epsilon = EPSILON_MAX
47 |
48 | # Actor Network (w/ Target Network)
49 | self.actor_local = Actor(state_size, action_size, random_seed).to(device)
50 | self.actor_target = Actor(state_size, action_size, random_seed).to(device)
51 | self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
52 |
53 | # Critic Network (w/ Target Network)
54 | self.critic_local = Critic(state_size, action_size, random_seed).to(device)
55 | self.critic_target = Critic(state_size, action_size, random_seed).to(device)
56 | self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
57 |
58 |         # Noise process: one Ornstein-Uhlenbeck process per agent, so each
59 |         # agent's exploration noise is sampled independently rather than
60 |         # shared across agents.
61 |         self.noise = [OUNoise(action_size, random_seed) for i in range(self.num_agents)]
62 |
63 |
64 | # Replay memory
65 | self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
66 |
67 | # Make sure target is with the same weight as the source
68 | self.hard_update(self.actor_target, self.actor_local)
69 | self.hard_update(self.critic_target, self.critic_local)
70 |
71 | self.t_step = 0
72 |
73 | def step(self, state, action, reward, next_state, done):
74 | """Save experience in replay memory, and use random sample from buffer to learn."""
75 | # Save experience / reward
76 | self.memory.add(state, action, reward, next_state, done, self.num_agents)
77 |
78 | if len(self.memory) > LEARN_START:
79 | # Learn every UPDATE_EVERY time steps.
80 | self.t_step = (self.t_step + 1) % UPDATE_EVERY
81 | if self.t_step == 0:
82 | # Learn, if enough samples are available in memory
83 | if len(self.memory) > BATCH_SIZE:
84 | for _ in range(UPDATES_PER_STEP):
85 | experiences = self.memory.sample()
86 | self.learn(experiences, GAMMA)
87 |
88 | def act(self, state, add_noise=True):
89 | """Returns actions for given state as per current policy."""
90 | #state = torch.from_numpy(state).float().unsqueeze(0).to(device)
91 | state = torch.from_numpy(state).float().to(device)
92 |
93 | self.actor_local.eval()
94 | with torch.no_grad():
95 | action = self.actor_local(state).cpu().data.numpy()
96 |
97 | self.actor_local.train()
98 |
99 |         if add_noise:
100 |             # Perturb each agent's action values in place with its own OU noise,
101 |             # scaled by the (annealed) epsilon.
102 |             for i in range(self.num_agents):
103 |                 action[i] += self.epsilon * self.noise[i].sample()
104 |
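        # The Soccer brains take a discrete action index per agent, so the actor's
        # continuous output vector is treated as per-action scores and the argmax
        # is returned for each agent.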
105 | actions = []
106 | for i in range(len(action)):
107 | actions.append(np.argmax(action[i]))
108 |
109 | #print(action)
110 | return actions
111 |
112 | def reset(self):
113 | for i in range(self.num_agents):
114 | self.noise[i].reset()
115 |
116 | def learn(self, experiences, gamma):
117 | """Update policy and value parameters using given batch of experience tuples.
118 |         Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
119 | where:
120 | actor_target(state) -> action
121 | critic_target(state, action) -> Q-value
122 |
123 | Params
124 | ======
125 | experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
126 | gamma (float): discount factor
127 | """
128 | states, actions, rewards, next_states, dones = experiences
129 |
130 | # ---------------------------- update critic ---------------------------- #
131 | # Get predicted next-state actions and Q values from target models
132 | actions_next = self.actor_target(next_states)
133 | Q_targets_next = self.critic_target(next_states, actions_next)
134 |
135 | # Compute Q targets for current states (y_i)
136 | Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
137 |
138 | # Compute critic loss
139 | Q_expected = self.critic_local(states, actions)
140 | critic_loss = F.mse_loss(Q_expected, Q_targets)
141 |
142 | # Minimize the loss
143 | self.critic_optimizer.zero_grad()
144 | critic_loss.backward()
145 | torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
146 | self.critic_optimizer.step()
147 |
148 | # ---------------------------- update actor ---------------------------- #
149 | # Compute actor loss
150 | actions_pred = self.actor_local(states)
151 | actor_loss = -self.critic_local(states, actions_pred).mean()
152 |
153 | # Minimize the loss
154 | self.actor_optimizer.zero_grad()
155 | actor_loss.backward()
156 | self.actor_optimizer.step()
157 |
158 | # ----------------------- update target networks ----------------------- #
159 | self.soft_update(self.critic_local, self.critic_target, TAU)
160 | self.soft_update(self.actor_local, self.actor_target, TAU)
161 |
162 | # ---------------------------- update noise ---------------------------- #
163 |         # EPSILON_DECAY (0.995) is a multiplicative factor here, so decay
164 |         # epsilon geometrically and floor it at EPSILON_MIN.
165 |         self.epsilon = max(EPSILON_MIN, self.epsilon * EPSILON_DECAY)
166 |
167 |
168 | def soft_update(self, local_model, target_model, tau):
169 | """Soft update model parameters.
170 |         θ_target = τ*θ_local + (1 - τ)*θ_target
171 |
172 | Params
173 | ======
174 | local_model: PyTorch model (weights will be copied from)
175 | target_model: PyTorch model (weights will be copied to)
176 | tau (float): interpolation parameter
177 | """
178 | for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
179 | target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
180 |
181 | def hard_update(self, target, source):
182 | for target_param, param in zip(target.parameters(), source.parameters()):
183 | target_param.data.copy_(param.data)
184 |
185 | class ReplayBuffer:
186 | """Fixed-size buffer to store experience tuples."""
187 |
188 | def __init__(self, action_size, buffer_size, batch_size, seed):
189 | """Initialize a ReplayBuffer object.
190 | Params
191 | ======
192 | buffer_size (int): maximum size of buffer
193 | batch_size (int): size of each training batch
194 | """
195 | self.action_size = action_size
196 | self.buffer_size = buffer_size
197 | self.batch_size = batch_size
198 | self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
199 | self.seed = random.seed(seed)
200 |
201 | self.reset()
202 |
203 | def add(self, state, action, reward, next_state, done, num_agents):
204 | """Add a new experience to memory."""
205 | for i in range(num_agents):
206 | e = self.experience(state[i], action[i], reward[i], next_state[i], done[i])
207 | self.memory.append(e)
208 |
209 | def reset(self):
210 | self.memory = deque(maxlen=self.buffer_size)
211 |
212 | def sample(self):
213 | """Randomly sample a batch of experiences from memory."""
214 | experiences = random.sample(self.memory, k=self.batch_size)
215 |
216 | states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
217 | actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
218 | rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
219 | next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
220 | dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
221 |
222 | return states, actions, rewards, next_states, dones
223 |
224 | def __len__(self):
225 | """Return the current size of internal memory."""
226 | return len(self.memory)
--------------------------------------------------------------------------------
/OpenAI/Taxi-v3/Taxi-v3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "commercial-proportion",
6 | "metadata": {},
7 | "source": [
8 | "#### Import dependencies"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "favorite-cathedral",
15 | "metadata": {
16 | "scrolled": true
17 | },
18 | "outputs": [],
19 | "source": [
20 | "import gym\n",
21 | "import random\n",
22 | "import numpy as np\n",
23 | "import time\n",
24 | "from IPython import display\n",
25 | "import matplotlib.pyplot as plt\n",
26 | "from collections import defaultdict\n",
27 | "import pylab as pl\n",
28 | "\n",
29 | "%matplotlib inline"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "reduced-prime",
35 | "metadata": {},
36 | "source": [
37 | "### Load Environment"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "id": "three-flood",
44 | "metadata": {
45 | "scrolled": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "env = gym.make(\"Taxi-v3\")"
50 | ]
51 | },
52 | {
53 | "cell_type": "markdown",
54 | "id": "stupid-thailand",
55 | "metadata": {},
56 | "source": [
57 | "### Inspect Environment"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "id": "integral-sharing",
64 | "metadata": {
65 | "scrolled": true
66 | },
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "Action size 6\n",
73 | "State size 500\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "# There are 6 discrete deterministic actions:\n",
79 | "# - 0: move south\n",
80 | "# - 1: move north\n",
81 | "# - 2: move east\n",
82 | "# - 3: move west\n",
83 | "# - 4: pickup passenger\n",
84 | "# - 5: drop off passenger\n",
85 | "\n",
86 | "action_size = env.action_space.n\n",
87 | "print(\"Action size \", action_size)\n",
88 | "\n",
89 | "# There are 500 discrete states since there are 25 taxi positions\n",
90 | "# 5 possible locations of the passenger (including the case when the passenger is in the taxi)\n",
91 | "# and 4 destination locations.\n",
92 | "# Start-Position is random\n",
93 | "state_size = env.observation_space.n\n",
94 | "print(\"State size \", state_size)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "id": "rocky-seventh",
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "+---------+\n",
108 | "|\u001b[35mR\u001b[0m: | : :G|\n",
109 | "| : | : : |\n",
110 | "|\u001b[43m \u001b[0m: : : : |\n",
111 | "| | : | : |\n",
112 | "|Y| : |\u001b[34;1mB\u001b[0m: |\n",
113 | "+---------+\n",
114 | "\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "env.reset()\n",
120 | "env.render()"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "id": "measured-invalid",
126 | "metadata": {},
127 | "source": [
128 | "### Agent"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 5,
134 | "id": "alternate-greek",
135 | "metadata": {
136 | "scrolled": true
137 | },
138 | "outputs": [],
139 | "source": [
140 | "class Agent():\n",
141 | " def __init__(self, n_actions, n_states, gamma=0.9):\n",
142 | " self.n_actions = n_actions\n",
143 | " \n",
144 | " self.gamma = gamma\n",
145 | " self.Q = np.zeros((n_states, n_actions))\n",
146 | " \n",
147 | " def decay_schedule(self, init_value, min_value, decay_ratio, max_steps, log_start=-2, log_base=10):\n",
148 | " decay_steps = int(max_steps * decay_ratio)\n",
149 | " rem_steps = max_steps - decay_steps\n",
150 | " values = np.logspace(log_start, 0, decay_steps, base=log_base, endpoint=True)[::-1]\n",
151 | " values = (values - values.min()) / (values.max() - values.min())\n",
152 | " values = (init_value - min_value) * values + min_value\n",
153 | " values = np.pad(values, (0, rem_steps), 'edge')\n",
154 | " return values \n",
155 | " \n",
156 | " def act(self, state, eps=0):\n",
157 | " if random.uniform(0, 1) < eps:\n",
158 | " return random.choice(np.arange(self.n_actions)) \n",
159 | " else:\n",
160 | " return np.argmax(self.Q[state])\n",
161 | " \n",
162 | " def learn(self, state, action, reward, next_state, done, alpha, algo='qlearn'): \n",
163 | " if algo == 'qlearn': \n",
164 | " # Q-Learning\n",
165 | " td_target = reward + self.gamma * np.max(self.Q[next_state, :]) * (not done)\n",
166 | " \n",
167 | " else: \n",
168 | " # SARSA\n",
169 | " td_target = reward + self.gamma * self.Q[next_state, self.act(next_state)] * (not done)\n",
170 | " \n",
171 | " td_error = td_target - self.Q[state, action] \n",
172 | " \n",
173 | " self.Q[state, action] = self.Q[state, action] + alpha * td_error"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "id": "english-label",
179 | "metadata": {},
180 | "source": [
181 | "### Q - Learning"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "id": "brilliant-scenario",
188 | "metadata": {
189 | "scrolled": true
190 | },
191 | "outputs": [],
192 | "source": [
193 | "def learning(n_actions, n_states, episodes=50000, max_steps=500, print_every=5000):\n",
194 | " agent = Agent(n_actions, n_states)\n",
195 | " \n",
196 | " alphas = agent.decay_schedule(0.9, 0.01, 0.2, episodes)\n",
197 | " epsilons = agent.decay_schedule(1.0, 0.01, 0.5, episodes)\n",
198 | " \n",
199 | " for n_episode in range(episodes):\n",
200 | " state = env.reset() \n",
201 | " \n",
202 | " for n_step in range(max_steps):\n",
203 | " action = agent.act(state, epsilons[n_episode])\n",
204 | " next_state, reward, done, info = env.step(action) \n",
205 | " \n",
206 | " agent.learn(state, action, reward, next_state, done, alphas[n_episode])\n",
207 | " \n",
208 | " state = next_state\n",
209 | " \n",
210 | " if done: \n",
211 | " break\n",
212 | " \n",
213 | " if n_episode % print_every == 1:\n",
214 | " print('Episode: {0} done after {1} Steps.'.format(n_episode+1, n_step))\n",
215 | " \n",
216 | " print('Done.')\n",
217 | " env.close()\n",
218 | " \n",
219 | " return agent"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "id": "historic-charger",
225 | "metadata": {},
226 | "source": [
227 | "#### Training"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 7,
233 | "id": "reasonable-dimension",
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stdout",
238 | "output_type": "stream",
239 | "text": [
240 | "Episode: 2 done after 199 Steps.\n",
241 | "Episode: 5002 done after 21 Steps.\n",
242 | "Episode: 10002 done after 12 Steps.\n",
243 | "Episode: 15002 done after 17 Steps.\n",
244 | "Episode: 20002 done after 15 Steps.\n",
245 | "Episode: 25002 done after 15 Steps.\n",
246 | "Episode: 30002 done after 11 Steps.\n",
247 | "Episode: 35002 done after 9 Steps.\n",
248 | "Episode: 40002 done after 11 Steps.\n",
249 | "Episode: 45002 done after 13 Steps.\n",
250 | "Done.\n"
251 | ]
252 | }
253 | ],
254 | "source": [
255 | "agent = learning(action_size, state_size)"
256 | ]
257 | },
258 | {
259 | "cell_type": "markdown",
260 | "id": "thirty-truck",
261 | "metadata": {},
262 | "source": [
263 | "### Replay trained Agent"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 8,
269 | "id": "closed-sport",
270 | "metadata": {
271 | "scrolled": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "def replay(agent, max_steps=20): \n",
276 | " n_steps = 0\n",
277 | "\n",
278 | " state, done = env.reset(), False\n",
279 | " rewards = 0\n",
280 | "\n",
281 | " while not done and n_steps < max_steps:\n",
282 | " action = agent.act(state)\n",
283 | " next_state, reward, done, info = env.step(action) \n",
284 | " \n",
285 | " state = next_state\n",
286 | " rewards += reward\n",
287 | " \n",
288 | " display.clear_output(wait=True)\n",
289 | " env.render()\n",
290 | " time.sleep(.5)\n",
291 | "\n",
292 | " n_steps+=1\n",
293 | "\n",
294 | " print('Solved after {0} Steps.'.format(n_steps))"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 9,
300 | "id": "african-output",
301 | "metadata": {
302 | "scrolled": false
303 | },
304 | "outputs": [
305 | {
306 | "name": "stdout",
307 | "output_type": "stream",
308 | "text": [
309 | "+---------+\n",
310 | "|R: | : :G|\n",
311 | "| : | : : |\n",
312 | "| : : : : |\n",
313 | "| | : | : |\n",
314 | "|\u001b[35m\u001b[34;1m\u001b[43mY\u001b[0m\u001b[0m\u001b[0m| : |B: |\n",
315 | "+---------+\n",
316 | " (Dropoff)\n",
317 | "Solved after 12 Steps.\n"
318 | ]
319 | }
320 | ],
321 | "source": [
322 | "for _ in range(5):\n",
323 | " replay(agent)\n",
324 | " time.sleep(1)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "id": "42e3925d",
331 | "metadata": {},
332 | "outputs": [],
333 | "source": []
334 | }
335 | ],
336 | "metadata": {
337 | "kernelspec": {
338 | "display_name": "Python 3",
339 | "language": "python",
340 | "name": "python3"
341 | },
342 | "language_info": {
343 | "codemirror_mode": {
344 | "name": "ipython",
345 | "version": 3
346 | },
347 | "file_extension": ".py",
348 | "mimetype": "text/x-python",
349 | "name": "python",
350 | "nbconvert_exporter": "python",
351 | "pygments_lexer": "ipython3",
352 | "version": "3.8.8"
353 | }
354 | },
355 | "nbformat": 4,
356 | "nbformat_minor": 5
357 | }
358 |
--------------------------------------------------------------------------------
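A quick numeric check of the tabular update used in the Taxi notebook above. This is a standalone sketch with made-up transition values (state 42, action 1, reward -1.0), not taken from an actual run; it applies one Q-Learning TD update by hand so the td_target / td_error arithmetic in Agent.learn() is easy to follow.

import numpy as np

# Taxi-v3 has 500 discrete states and 6 discrete actions.
n_states, n_actions = 500, 6
gamma, alpha = 0.9, 0.9        # same gamma as the notebook; alpha = first value of the decay schedule

Q = np.zeros((n_states, n_actions))
state, action, reward, next_state, done = 42, 1, -1.0, 142, False   # invented transition

td_target = reward + gamma * np.max(Q[next_state]) * (not done)     # -1.0 + 0.9 * 0.0 = -1.0
td_error = td_target - Q[state, action]                             # -1.0 - 0.0 = -1.0
Q[state, action] += alpha * td_error                                 # Q[42, 1] becomes -0.9

print(Q[state, action])  # -0.9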
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/2.0/web.config:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/OpenAI/CartPole-v0/CartPole-v0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "Using TensorFlow backend.\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "import gym\n",
18 | "import sys\n",
19 | "import numpy as np\n",
20 | "import random as rn\n",
21 | "import cv2\n",
22 | "import time\n",
23 | "import functools\n",
24 | "import datetime\n",
25 | "import tensorflow as tf\n",
26 | "from agents.DDQN import *\n",
27 | "from IPython import display\n",
28 | "import matplotlib\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "from matplotlib import gridspec\n",
31 | "%matplotlib inline\n",
32 | "\n",
33 | "np.set_printoptions(threshold=sys.maxsize)"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "#### Set seed"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "SEED = 789325\n",
50 | "\n",
51 | "rn.seed(SEED)\n",
52 | "np.random.seed(SEED)\n",
53 | "tf.random.set_seed(SEED)"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "#### Environment"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "def build_environment(envName=\"CartPole-v0\", seed=None):\n",
70 | " env = gym.make(envName) \n",
71 | " if seed is not None:\n",
72 | " env.seed(seed) \n",
73 | " \n",
74 | " return env"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "#### Show Environment information"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 4,
87 | "metadata": {
88 | "scrolled": true
89 | },
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | "Actions: 2\n",
96 | "Size of state: 4\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "env = build_environment(seed=SEED)\n",
102 | "\n",
103 | "# number of available actions\n",
104 | "action_size = env.action_space.n\n",
105 | "print('Actions: ', action_size)\n",
106 | "if hasattr(env.env, 'get_action_meanings'):\n",
107 | " print(env.env.get_action_meanings())\n",
108 | "\n",
109 | "# examine the state space \n",
110 | "states = env.observation_space.shape\n",
111 | "state_size = states[0]\n",
112 | "print('Size of state:', state_size)\n",
113 | "\n",
114 | "env.close()"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "# Training"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 5,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "def build_agent(pre_trained=None):\n",
131 | " return DDQNAgent(state_size,\n",
132 | " action_size,\n",
133 | " buffer_size=2000,\n",
134 | " epsilon_start=0.5,\n",
135 | " epsilon_steps_to_min=3500,\n",
136 | " mode=\"DuelingDQN\",\n",
137 | " use_PER=True,\n",
138 | " pre_trained=pre_trained)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 6,
144 | "metadata": {},
145 | "outputs": [],
146 | "source": [
147 | "logdir = \"logs/\" + time.strftime(\"%Y%m%d_%H%M%S\")\n",
148 | "writer = tf.summary.create_file_writer(logdir)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 7,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "SAVE_EVERY_EPISODES = 100\n",
158 | "LEARNING_START_AFTER_STEPS = 500\n",
159 | "EPISODES = 80\n",
160 | "SCORE_TO_SOLVE = 195.0\n",
161 | "\n",
162 | "UPDATE_MODE = 'soft'\n",
163 | "UPDATE_TARGET_FREQUENCY = 10"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 8,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "Training started: 2019-12-29 13:29:18.235840\n",
176 | "Episode 1: Step 15 reward 15.0: \n",
177 | "Save model...\n",
178 | "Episode 5: Step 80 reward 30.0: \n",
179 | "Save model...\n",
180 | "Episode 6: Step 123 reward 43.0: \n",
181 | "Save model...\n",
182 | "Episode 7: Step 180 reward 57.0: \n",
183 | "Save model...\n"
184 | ]
185 | },
186 | {
187 | "name": "stderr",
188 | "output_type": "stream",
189 | "text": [
190 | "D:\\Deep Learning\\Reinforcement-Learning\\OpenAI\\CartPole-v0\\memory.py:47: RuntimeWarning: divide by zero encountered in double_scalars\n",
191 | " max_weight = (p_min * n) ** (-self.PER_b)\n",
192 | "D:\\Deep Learning\\Reinforcement-Learning\\OpenAI\\CartPole-v0\\memory.py:47: RuntimeWarning: divide by zero encountered in double_scalars\n",
193 | " max_weight = (p_min * n) ** (-self.PER_b)\n"
194 | ]
195 | },
196 | {
197 | "name": "stdout",
198 | "output_type": "stream",
199 | "text": [
200 | "Episode 46: Step 860 reward 58.0: \n",
201 | "Save model...\n",
202 | "Episode 47: Step 947 reward 87.0: \n",
203 | "Save model...\n",
204 | "Episode 61: Step 1757 reward 92.0: \n",
205 | "Save model...\n",
206 | "Episode 65: Step 2039 reward 105.0: \n",
207 | "Save model...\n",
208 | "Episode 67: Step 2269 reward 133.0: \n",
209 | "Save model...\n",
210 | "Episode 68: Step 2469 reward 200.0: \n",
211 | "Save model...\n",
212 | "Save model...\n",
213 | "Training finished\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "def train():\n",
219 | " env = build_environment(seed=SEED)\n",
220 | " agent = build_agent()\n",
221 | " \n",
222 | " max_reward = -9999999 \n",
223 | " game_rewards_deque = deque(maxlen=100) \n",
224 | " frame_count = 0\n",
225 | " \n",
226 | " print(\"Training started: \" + str(datetime.datetime.now()))\n",
227 | " \n",
228 | " frame_count = 0\n",
229 | " \n",
230 | " for i_episode in range(1, EPISODES+1):\n",
231 | " state = env.reset()\n",
232 | " \n",
233 | " game_reward = 0\n",
234 | " steps = 0\n",
235 | " \n",
236 | " while True:\n",
237 | " frame_count += 1\n",
238 | " steps += 1\n",
239 | " \n",
240 | " state = agent.preprocess(state) \n",
241 | " action = agent.act(state) \n",
242 | " \n",
243 | " next_state, reward, done, info = env.step(action) \n",
244 | " game_reward += reward\n",
245 | " \n",
246 | " agent.remember(state[0], action, reward, next_state, done)\n",
247 | " \n",
248 | " state = next_state\n",
249 | " \n",
250 | " if frame_count % 10000 == 0:\n",
251 | " print(\"Step count: {}\".format(frame_count))\n",
252 | " \n",
253 | " if done:\n",
254 | " break \n",
255 | " \n",
256 | " if frame_count > LEARNING_START_AFTER_STEPS: \n",
257 | " agent.train()\n",
258 | " if UPDATE_MODE == \"soft\":\n",
259 | " agent.soft_update_target_network()\n",
260 | " \n",
261 | " \n",
262 | " if UPDATE_MODE == \"hard\" and frame_count % UPDATE_TARGET_FREQUENCY == 0:\n",
263 | " agent.hard_update_target_network()\n",
264 | " \n",
265 | " # Log episode reward\n",
266 | " with writer.as_default():\n",
267 | " tf.summary.scalar(\"epsilon\", agent.epsilon, step=i_episode)\n",
268 | " tf.summary.scalar(\"game_reward\", game_reward, step=i_episode) \n",
269 | " \n",
270 | " if i_episode % SAVE_EVERY_EPISODES == 0:\n",
271 | " print(\"Save after {} episodes.\".format(i_episode))\n",
272 | " agent.save() \n",
273 | " \n",
274 | " game_rewards_deque.append(game_reward)\n",
275 | " \n",
276 | " if game_reward > max_reward:\n",
277 | " print(\"Episode {}: Step {} reward {}: \".format(i_episode, frame_count, game_reward))\n",
278 | " max_reward = game_reward\n",
279 | " agent.save() \n",
280 | " \n",
281 | " if np.mean(game_rewards_deque) >= SCORE_TO_SOLVE:\n",
282 | " agent.save()\n",
283 | " print(\"Solved in Episode {} Step {} reward {}: \".format(i_episode, frame_count, game_reward))\n",
284 | " break \n",
285 | " \n",
286 | " env.close()\n",
287 | " agent.save()\n",
288 | " \n",
289 | "train()\n",
290 | "print(\"Training finished\")"
291 | ]
292 | },
293 | {
294 | "cell_type": "markdown",
295 | "metadata": {},
296 | "source": [
297 | "# Show Result"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 9,
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "name": "stdout",
307 | "output_type": "stream",
308 | "text": [
309 | "Episode finished with score: 161.0\n"
310 | ]
311 | },
312 | {
313 | "data": {
314 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD8CAYAAABXe05zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAARYUlEQVR4nO3df6zddX3H8edrpVYjJMK4kNofo3M1GZhZ3E1nwrIwcdKxH8U/WEoy0z9Iyh+QaGaygSZD/mjiFn/sn2FWB7HZ1K6JEjrCNmunMSaO0mLBllK5SoVrm7bojLA/6lre++N+Ow7ltvf03nu8/ZzzfCQn5/v9fL/fc95vQl98+ZzP6UlVIUlqx68sdAGSpAtjcEtSYwxuSWqMwS1JjTG4JakxBrckNWZgwZ1kXZJDSSaS3DOo95GkUZNBrONOsgj4PvAHwCTwBHB7VT0z728mSSNmUHfca4GJqvphVf0C2AasH9B7SdJIuWRAr7sMeLFnfxL4nXOdfOWVV9Y111wzoFIkqT2HDx/mpZdeynTHBhXc073Z6+ZkkmwCNgGsXLmSPXv2DKgUSWrP+Pj4OY8NaqpkEljRs78cONJ7QlVtqarxqhofGxsbUBmSNHwGFdxPAKuTrEryJmADsGNA7yVJI2UgUyVVdSrJ3cB/AIuAh6rqwCDeS5JGzaDmuKmqx4DHBvX6kjSq/OakJDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGzOmny5IcBl4GTgOnqmo8yRXAvwDXAIeBP6uq/55bmZKkM+bjjvv3q2pNVY13+/cAu6pqNbCr25ckzZNBTJWsB7Z221uBWwfwHpI0suYa3AV8LcneJJu6saur6ihA93zVHN9DktRjTnPcwA1VdSTJVcDOJM/2e2EX9JsAVq5cOccyJGl0zOmOu6qOdM/HgYeBtcCxJEsBuufj57h2S1WNV9X42NjYXMqQpJEy6+BO8tYkl53ZBj4A7Ad2ABu70zYCj8y1SEnSa+YyVXI18HCSM6/zpar69yRPANuT3AG8ANw29zIlSWfMOrir6ofAu6cZ/wlw01yKkiSdm9+clKTGGNyS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhozY3AneSjJ8ST7e8auSLIzyXPd8+U9x+5NMpHkUJKbB1W4JI2qfu64vwCsO2vsHmBXVa0GdnX7JLkW2ABc113zQJJF81atJGnm4K6qbwE/PWt4PbC1294K3Nozvq2qTlbV88AEsHaeapUkMfs57qur6ihA93xVN74MeLHnvMlu7A2SbEqyJ8meEydOzLIMSRo98/3hZKYZq+lOrKotVTVeVeNjY2PzXIYkDa/ZBvexJEsBuufj3fgksKLnvOXAkdmXJ0k622yDewewsdveCDzSM74hyZIkq4DVwO65lShJ6nXJTCck+TJwI3BlkkngPuCTwPYkdwAvALcBVNWBJNuBZ4BTwF1VdXpAtUvSSJoxuKvq9nMcuukc528GNs+lKEnSufnNSUlqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjZkxuJM8lOR4kv09Y59I8uMk+7rHLT3H7k0ykeRQkpsHVbgkjap+7ri/AKybZvyzVbWmezwGkORaYANwXXfNA0kWzVexkqQ+gruqvgX8tM/XWw9sq6qTVfU8MAGsnUN9kqSzzGWO++4kT3dTKZd3Y8uAF3vOmezG3iDJpiR7kuw5ceLEHMqQpNEy2+D+HPAOYA1wFPh0N55pzq3pXqCqtlTVeFWNj42NzbIMSRo9swruqjpWVaer6lXg87w2HTIJrOg5dTlwZG4lSpJ6zSq4kyzt2f0gcGbFyQ5gQ5IlSVYBq4HdcytRktTrkplOSPJl4EbgyiSTwH3AjUnWMDUNchi4E6CqDiTZDjwDnALuqqrTgyldkkbTjMFdVbdPM/zgec7fDGyeS1GSpHPzm5OS1BiDW5IaY3BLUmMMbklqjMEtSY0xuCWpMQa3JDVmxnXcUiv2brnzdfu/vekfFqgSabC849bQOjvIpWFhcEtSYwxuSWqMwa2hMN20iHPcGlYGtyQ1xuBW8/wQUqPG4JakxhjcktQYg1uSGmNwayi5okTDbMbgTrIiyTeSHExyIMmHu/ErkuxM8lz3fHnPNfcmmUhyKMnNg2xAkkZNP3fcp4CPVtVvAu8F7kpyLXAPsKuqVgO7un26YxuA64B1wANJFg2ieMkVJRpFMwZ3VR2tqie77ZeBg8AyYD2wtTttK3Brt70e2FZVJ6vqeWACWDvfhUvSqLqgOe4k1wDXA48DV1fVUZgKd+Cq7rRlwIs9l012Y2e/1qYke5LsOXHixIVXLkkjqu/gTnIp8BXgI1X18/OdOs1YvWGgaktVjVfV+NjYWL9lSDPyg0kNu76CO8lipkL7i1X11W74WJKl3fGlwPFufBJY0XP5cuDI/JQrSepnVUmAB4GDVfWZnkM7gI3d9kbgkZ7xDUmWJFkFrAZ2z1/J0hQ/mNSo6ucXcG4APgR8L8m+buxjwCeB7UnuAF4AbgOoqgNJtgPPMLUi5a6qOj3vlUvSiJoxuKvq20w/bw1w0zmu2QxsnkNdkqRz8JuTktQYg1tDxRUlGgUGtyQ1xuBWk1xRolFmcEtSYwxuSWqMwa2h4QeTGhUGtyQ1xuBWc/xgUqPO4JakxhjcktQYg1tDwQ8mNUoMbklqjMEtSY0xuNWU6VaUOE2iUWNwS1JjDG41w/Xb0hSDW5Ia08+PBa9I8o0kB5McSPLhbvwTSX6cZF/3uKXnmnuTTCQ5lOTmQTYgSaOmnx8LPgV8tKqeTHIZsDfJzu7YZ6vqU70nJ7kW2ABcB7wd+HqSd/qDwRoEP5jUKJrxjruqjlbVk932y8BBYNl5LlkPbKuqk1X1PDABrJ2PYiVJFzjHneQa4Hrg8W7o7iRPJ3koyeXd2DLgxZ7LJjl/0Esz8oNJ6TV9B3eSS4GvAB+pqp8DnwPeAawBjgKfPnPqNJfXNK+3KcmeJHtOnDhxwYVL0qjqK7iTLGYqtL9YVV8FqKpjVXW6ql4FPs9r0yGTwIqey5cDR85+zaraUlXjVTU+NjY2lx4kaaT0s6okwIPAwar6TM/40p7TPgjs77Z3ABuSLEmyClgN7J6/kiVptPWzquQG4EPA95Ls68Y+BtyeZA1T0yCHgTsBqupAku3AM0ytSLnLFSUaBFeUaFTNGNxV9W2mn7d+7DzXbAY2z6EuSdI5+M1JXfRcUSK9nsEtSY0xuCWpMQa3muQHkxplBrckNcbg1kXNDyalNzK4JakxBrckNcbgVnP8YFKjzuCWpMY
Y3JLUGINbFy1XlEjTM7glqTEGt35pklzQYy6vIw0zg1tNGb9zy0KXIC24fn5IQVoQ/3pk0/9v/8nbDWzpDO+4dVG67749r9vvDXFp1BncaobTJNKUfn4s+M1Jdid5KsmBJPd341ck2Znkue758p5r7k0ykeRQkpsH2YAkjZp+7rhPAu+rqncDa4B1Sd4L3APsqqrVwK5unyTXAhuA64B1wANJFg2ieA2vs+e0neOWXtPPjwUX8Eq3u7h7FLAeuLEb3wp8E/irbnxbVZ0Enk8yAawFvjOfhWu4TU2LvBbW9y9cKdJFp69VJd0d817gN4C/r6rHk1xdVUcBqupokqu605cB/9Vz+WQ3dk579+517a3mlf8+aZj1FdxVdRpYk+RtwMNJ3nWe06f7E1NvOCnZBGwCWLlyJT/60Y/6KUUN+2WG6dT/KErtGh8fP+exC1pVUlU/Y2pKZB1wLMlSgO75eHfaJLCi57LlwJFpXmtLVY1X1fjY2NiFlCFJI62fVSVj3Z02Sd4CvB94FtgBbOxO2wg80m3vADYkWZJkFbAa2D3fhUvSqOpnqmQpsLWb5/4VYHtVPZrkO8D2JHcALwC3AVTVgSTbgWeAU8Bd3VSLJGke9LOq5Gng+mnGfwLcdI5rNgOb51ydJOkN/OakJDXG4JakxhjcktQY/1pX/dK4tlqaH95xS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTG9PNjwW9OsjvJU0kOJLm/G/9Ekh8n2dc9bum55t4kE0kOJbl5kA1I0qjp5+/jPgm8r6peSbIY+HaSf+uOfbaqPtV7cpJrgQ3AdcDbga8neac/GCxJ82PGO+6a8kq3u7h7nO9vxF8PbKuqk1X1PDABrJ1zpZIkoM857iSLkuwDjgM7q+rx7tDdSZ5O8lCSy7uxZcCLPZdPdmOSpHnQV3BX1emqWgMsB9YmeRfwOeAdwBrgKPDp7vRM9xJnDyTZlGRPkj0nTpyYVfGSNIouaFVJVf0M+CawrqqOdYH+KvB5XpsOmQRW9Fy2HDgyzWttqarxqhofGxubVfGSNIr6WVUyluRt3fZbgPcDzyZZ2nPaB4H93fYOYEOSJUlWAauB3fNbtiSNrn5WlSwFtiZZxFTQb6+qR5P8U5I1TE2DHAbuBKiqA0m2A88Ap4C7XFEiSfNnxuCuqqeB66cZ/9B5rtkMbJ5baZKk6fjNSUlqjMEtSY0xuCWpMQa3JDXG4JakxhjcktQYg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMYY3JLUGINbkhpjcEtSYwxuSWqMwS1JjTG4JakxBrckNcbglqTGGNyS1JhU1ULXQJITwP8ALy10LQNwJfbVmmHtzb7a8mtVNTbdgYsiuAGS7Kmq8YWuY77ZV3uGtTf7Gh5OlUhSYwxuSWrMxRTcWxa6gAGxr/YMa2/2NSQumjluSVJ/LqY7bklSHxY8uJOsS3IoyUSSexa6nguV5KEkx5Ps7xm7IsnOJM91z5f3HLu36/VQkpsXpuqZJVmR5BtJDiY5kOTD3XjTvSV5c5LdSZ7q+rq/G2+6rzOSLEry3SSPdvvD0tfhJN9Lsi/Jnm5sKHqblapasAewCPgB8OvAm4CngGsXsqZZ9PB7wHuA/T1jfwvc023fA/xNt31t1+MSYFXX+6KF7uEcfS0F3tNtXwZ8v6u/6d6AAJd224uBx4H3tt5XT39/AXwJeHRY/l3s6j0MXHnW2FD0NpvHQt9xrwUmquqHVfULYBuwfoFruiBV9S3gp2cNrwe2dttbgVt7xrdV1cmqeh6YYOqfwUWnqo5W1ZPd9svAQWAZjfdWU17pdhd3j6LxvgCSLAf+CPjHnuHm+zqPYe7tvBY6uJcBL/bsT3Zjrbu6qo7CVAACV3XjTfab5BrgeqbuTpvvrZtO2AccB3ZW1VD0Bfwd8JfAqz1jw9AXTP3H9WtJ9ibZ1I0NS28X7JIFfv9MMzbMy1ya6zfJpcBXgI9U1c+T6VqYOnWasYuyt6o6DaxJ8jbg4STvOs/pTfSV5I+B41W1N8mN/VwyzdhF11ePG6rqSJKrgJ1Jnj3Pua31dsEW+o57EljRs78cOLJAtcynY0mWAnTPx7vxpvpNspip0P5iVX21Gx6K3gCq6mfAN4F1tN/XDcCfJjnM1JTj+5L8M+33BUBVHemejwMPMzX1MRS9zcZCB/cTwOokq5K8CdgA7FjgmubDDmBjt70ReKRnfEOSJUlWAauB3QtQ34wydWv9IHCwqj7Tc6jp3pKMdXfaJHkL8H7gWRrvq6rurarlVXUNU3+O/rOq/pzG+wJI8tYkl53ZBj4A7GcIepu1hf50FLiFqRULPwA+vtD1zKL+LwNHgf9l6r/0dwC/CuwCnuuer+g5/+Ndr4eAP1zo+s/T1+8y9b+XTwP7usctrfcG/Bbw3a6v/cBfd+NN93VWjzfy2qqS5vtiatXZU93jwJmcGIbeZvvwm5OS1JiFniqRJF0gg1uSGmNwS1JjDG5JaozBLUmNMbglqTEGtyQ1xuCWpMb8H8EOG9Pp82HgAAAAAElFTkSuQmCC\n",
315 | "text/plain": [
316 | ""
317 | ]
318 | },
319 | "metadata": {
320 | "needs_background": "light"
321 | },
322 | "output_type": "display_data"
323 | }
324 | ],
325 | "source": [
326 | "env = build_environment(seed=SEED)\n",
327 | "agent = build_agent(pre_trained='model.h5')\n",
328 | "\n",
329 | "state = env.reset()\n",
330 | "final_reward = 0\n",
331 | "\n",
332 | "img = plt.imshow(env.render(mode='rgb_array'))\n",
333 | "while True:\n",
334 | " img.set_data(env.render(mode='rgb_array'))\n",
335 | " display.display(plt.gcf())\n",
336 | " display.clear_output(wait=True)\n",
337 | "\n",
338 | " state = np.reshape(state, [1, state_size])\n",
339 | " action = agent.act(state) \n",
340 | " next_state, reward, done, info = env.step(action)\n",
341 | " final_reward += reward \n",
342 | " \n",
343 | " state = next_state\n",
344 | " \n",
345 | " if done:\n",
346 | " print(\"Episode finished with score: {}\".format(final_reward))\n",
347 | " break\n",
348 | "env.close() "
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": null,
354 | "metadata": {},
355 | "outputs": [],
356 | "source": []
357 | }
358 | ],
359 | "metadata": {
360 | "kernelspec": {
361 | "display_name": "Python 3",
362 | "language": "python",
363 | "name": "python3"
364 | },
365 | "language_info": {
366 | "codemirror_mode": {
367 | "name": "ipython",
368 | "version": 3
369 | },
370 | "file_extension": ".py",
371 | "mimetype": "text/x-python",
372 | "name": "python",
373 | "nbconvert_exporter": "python",
374 | "pygments_lexer": "ipython3",
375 | "version": "3.6.9"
376 | }
377 | },
378 | "nbformat": 4,
379 | "nbformat_minor": 2
380 | }
--------------------------------------------------------------------------------
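CartPole-v0.ipynb above toggles between agent.soft_update_target_network() and agent.hard_update_target_network() via UPDATE_MODE, but both methods live in agents/DDQN.py, which is not shown in this excerpt. As a rough, generic sketch only (not the repository's actual implementation; the attribute names model / target_model and the factor tau are assumptions), the two update rules for Keras models usually look like this:

TAU = 0.001  # illustrative Polyak factor

def soft_update_target_network(model, target_model, tau=TAU):
    # Polyak averaging, applied every training step:
    #   target <- tau * online + (1 - tau) * target
    mixed = [tau * w + (1.0 - tau) * tw
             for w, tw in zip(model.get_weights(), target_model.get_weights())]
    target_model.set_weights(mixed)

def hard_update_target_network(model, target_model):
    # Full copy of the online network, applied only every UPDATE_TARGET_FREQUENCY steps.
    target_model.set_weights(model.get_weights())

Soft updates keep the target network trailing the online network smoothly, which is why the notebook calls the soft variant on every training step, while the hard variant only syncs every UPDATE_TARGET_FREQUENCY steps.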
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.0/web.config:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Unity-ML/Soccer/Soccer_Windows_x86_64/Soccer_Data/MonoBleedingEdge/etc/mono/4.5/web.config:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/OpenAI/HumanoidPyBulletEnv-v0/HumanoidPyBulletEnv-v0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## HumanoidPyBulletEnv-v0\n",
8 | "\n",
9 | "In this notebook, you will implement a PPO agent with OpenAI Gym's HumanoidPyBulletEnv-v0 environment."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "import math\n",
19 | "import random\n",
20 | "import sys\n",
21 | "import pathlib\n",
22 | "\n",
23 | "import gym\n",
24 | "import pybullet\n",
25 | "import pybulletgym\n",
26 | "import numpy as np\n",
27 | "\n",
28 | "import torch\n",
29 | "import torch.nn as nn\n",
30 | "import torch.optim as optim\n",
31 | "import torch.nn.functional as F\n",
32 | "from torch.distributions import Normal"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "from IPython.display import clear_output\n",
42 | "import matplotlib.pyplot as plt\n",
43 | "%matplotlib inline"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "Use CUDA
"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "use_cuda = torch.cuda.is_available()\n",
60 | "device = torch.device(\"cuda\" if use_cuda else \"cpu\")"
61 | ]
62 | },
63 | {
64 | "cell_type": "markdown",
65 | "metadata": {},
66 | "source": [
67 | "Create Environments
"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "from multiprocessing_env import SubprocVecEnv\n",
77 | "import time\n",
78 | "\n",
79 | "LOAD_CHECKPOINT = True\n",
80 | "DO_TRAINING = False\n",
81 | "\n",
82 | "num_envs = 8\n",
83 | "env_name = \"HumanoidPyBulletEnv-v0\"\n",
84 | "\n",
85 | "hidden_size = 64\n",
86 | "\n",
87 | "\n",
88 | "policy_optimizer_lr = 0.00005\n",
89 | "policy_stopping_kl = 0.02\n",
90 | "\n",
91 | "value_optimizer_lr = 0.00015\n",
92 | "value_stopping_mse = 25\n",
93 | "\n",
94 | "entropy_loss_weight = 0.01\n",
95 | "\n",
96 | "num_steps = 1024\n",
97 | "mini_batch_size = 64\n",
98 | "ppo_epochs = 15\n",
99 | "threshold_reward = 6000\n",
100 | "\n",
101 | "\n",
102 | "ACTOR_CHECKPOINT_PATH = pathlib.Path(\"./pretrained/\" + \"actor_\" + env_name + \"_checkpoint.pt\")\n",
103 | "ACTOR_FINAL_PATH = pathlib.Path(\"./pretrained/\" + \"actor_\" + env_name + \"_final.pt\")\n",
104 | "\n",
105 | "CRITIC_CHECKPOINT_PATH = pathlib.Path(\"./pretrained/\" + \"critic_\" + env_name + \"_checkpoint.pt\")\n",
106 | "CRITIC_FINAL_PATH = pathlib.Path(\"./pretrained/\" + \"critic_\" + env_name + \"_final.pt\")\n",
107 | "\n",
108 | "def make_env():\n",
109 | " def _thunk():\n",
110 | " env = gym.make(env_name)\n",
111 | " return env\n",
112 | "\n",
113 | " return _thunk\n",
114 | "\n",
115 | "envs = [make_env() for i in range(num_envs)]\n",
116 | "envs = SubprocVecEnv(envs)\n",
117 | "\n",
118 | "env = gym.make(env_name)"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "metadata": {},
124 | "source": [
125 | "Neural Network
"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": null,
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "class BaseModule(nn.Module):\n",
135 | " def __init__(self):\n",
136 | " super(BaseModule, self).__init__()\n",
137 | "\n",
138 | " def _build_network(self, num_inputs, num_outputs, hidden_size):\n",
139 | " \n",
140 | " if isinstance(hidden_size, int):\n",
141 | " return nn.Sequential(\n",
142 | " nn.Linear(num_inputs, hidden_size),\n",
143 | " nn.ReLU(),\n",
144 | " nn.Linear(hidden_size, num_outputs)\n",
145 | " )\n",
146 | " \n",
147 | " else:\n",
148 | " return nn.Sequential(\n",
149 | " nn.Linear(num_inputs, hidden_size[0]),\n",
150 | " nn.ReLU(),\n",
151 | " *self._build_hidden(hidden_size),\n",
152 | " nn.Linear(hidden_size[-1], num_outputs)\n",
153 | " ) \n",
154 | " \n",
155 | " def _build_hidden(self, hidden_size):\n",
156 | " hidden_layers = []\n",
157 | " for i in range(len(hidden_size)-1): \n",
158 | " hidden_layers.append(nn.Linear(hidden_size[i], hidden_size[i+1]))\n",
159 | " hidden_layers.append(nn.ReLU())\n",
160 | " return hidden_layers \n",
161 | " \n",
162 | "class Actor(BaseModule):\n",
163 | " def __init__(self, num_inputs, num_outputs, hidden_size, std=0.0):\n",
164 | " super(Actor, self).__init__()\n",
165 | " self.model = self._build_network(num_inputs, num_outputs, hidden_size) \n",
166 | " self.log_std = nn.Parameter(torch.ones(1, num_outputs) * std, requires_grad=True)\n",
167 | " \n",
168 | " def forward(self, x):\n",
169 | " mu = self.model(x) \n",
170 | " std = self.log_std.exp().expand_as(mu)\n",
171 | " dist = Normal(mu, std)\n",
172 | " return dist\n",
173 | " \n",
174 | "class Critic(BaseModule):\n",
175 | " def __init__(self, num_inputs, hidden_size):\n",
176 | " super(Critic, self).__init__()\n",
177 | " self.model = self._build_network(num_inputs, 1, hidden_size)\n",
178 | " \n",
179 | " def forward(self, x):\n",
180 | " return self.model(x)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "def plot(frame_idx, rewards):\n",
190 | " clear_output(True)\n",
191 | " plt.figure(figsize=(20,5))\n",
192 | " plt.subplot(131)\n",
193 | " plt.title('frame %s. reward: %s' % (frame_idx, rewards[-1]))\n",
194 | " \n",
195 | " mean = []\n",
196 | " for x in range(len(rewards)):\n",
197 | "        mean.append(np.array(rewards[:x + 1]).mean())  # include the current reward; rewards[:0] would be an empty slice\n",
198 | " \n",
199 | " plt.plot(rewards, label=\"Reward\")\n",
200 | " plt.plot(mean, label=\"mean\")\n",
201 | " plt.legend()\n",
202 | " plt.show()\n",
203 | " \n",
204 | "def test_env(vis=False):\n",
205 | " state = env.reset()\n",
206 | " if vis: env.render()\n",
207 | " done = False\n",
208 | " total_reward = 0\n",
209 | " while not done:\n",
210 | " state = torch.FloatTensor(state).unsqueeze(0).to(device)\n",
211 | " dist = policy_model(state)\n",
212 | " next_state, reward, done, _ = env.step(dist.sample().cpu().numpy()[0])\n",
213 | " state = next_state\n",
214 | " if vis: env.render()\n",
215 | " total_reward += reward\n",
216 | " return total_reward"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "metadata": {},
223 | "outputs": [],
224 | "source": [
225 | "def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.97):\n",
226 | " values = values + [next_value]\n",
227 | " gae = 0\n",
228 | " returns = []\n",
229 | " for step in reversed(range(len(rewards))):\n",
230 | " delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]\n",
231 | " gae = delta + gamma * tau * masks[step] * gae\n",
232 | " returns.insert(0, gae + values[step])\n",
233 | " return returns"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {},
240 | "outputs": [],
241 | "source": [
242 | "def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):\n",
243 | " batch_size = states.size(0)\n",
244 | " for _ in range(batch_size // mini_batch_size):\n",
245 | " rand_ids = np.random.randint(0, batch_size, mini_batch_size)\n",
246 | " yield states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :], returns[rand_ids, :], advantage[rand_ids, :]\n",
247 | "\n",
248 | "\n",
249 | "def ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantages, \n",
250 | " clip_param=0.1, value_loss_coef=0.5, entropy_coef=0.01, max_grad_norm=0.5):\n",
251 | " # Policy\n",
252 | " for _ in range(ppo_epochs):\n",
253 | " for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):\n",
254 | " dist = policy_model(state)\n",
255 | " \n",
256 | " entropy = dist.entropy().mean()\n",
257 | " new_log_probs = dist.log_prob(action)\n",
258 | "\n",
259 | " ratio = (new_log_probs - old_log_probs).exp()\n",
260 | " surr1 = ratio * advantage\n",
261 | " surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage\n",
262 | "\n",
263 | " policy_loss = -torch.min(surr1, surr2).mean()\n",
264 | " entropy_loss = -entropy.mean() * entropy_loss_weight\n",
265 | " \n",
266 | "            policy_optimizer.zero_grad()\n",
267 | "            (policy_loss + entropy_loss).backward()\n",
268 | "            \n",
269 | "            # clip gradients only after backward() has populated them\n",
270 | "            if max_grad_norm:\n",
271 | "                torch.nn.utils.clip_grad_norm_(policy_model.model.parameters(), max_grad_norm)\n",
272 | "            policy_optimizer.step()\n",
273 | " \n",
274 | " with torch.no_grad():\n",
275 | " dist = policy_model(state) \n",
276 | " logpas_pred_all = dist.log_prob(action) \n",
277 | " kl = (new_log_probs - logpas_pred_all).mean()\n",
278 | " if kl.item() > policy_stopping_kl:\n",
279 | " break\n",
280 | " \n",
281 | " # Value \n",
282 | " for _ in range(ppo_epochs):\n",
283 | " for state, action, old_log_probs, return_, advantage in ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantages):\n",
284 | " value = value_model.model(state)\n",
285 | " \n",
286 | " value_loss = 0.5 * (return_ - value).pow(2).mean()\n",
287 | " \n",
288 | "            value_optimizer.zero_grad()\n",
289 | "            value_loss.backward()\n",
290 | "            \n",
291 | "            # clip gradients only after backward() has populated them\n",
292 | "            if max_grad_norm:\n",
293 | "                torch.nn.utils.clip_grad_norm_(value_model.model.parameters(), max_grad_norm)\n",
294 | "            value_optimizer.step()\n",
295 | " \n",
296 | " with torch.no_grad():\n",
297 | " values_pred_all = value_model.model(state)\n",
298 | " mse = 0.5 * (value - values_pred_all).pow(2).mean()\n",
299 | " if mse.item() > value_stopping_mse:\n",
300 | " break"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "def loadCheckpoint(filename, model):\n",
310 | " checkpoint = torch.load(filename)\n",
311 | " model.load_state_dict(checkpoint['model_state_dict'])\n",
312 | " model.to(device)\n",
313 | " model.eval()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "def saveCheckpoint(filename, epoch, model, optimizer):\n",
323 | " checkpoint = {\n",
324 | " 'epoch': epoch,\n",
325 | " 'model_state_dict': model.state_dict(),\n",
326 | " 'optimizer_state_dict': optimizer.state_dict(),\n",
327 | " }\n",
328 | "\n",
329 | " torch.save(checkpoint, filename)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": [
338 | "num_inputs = envs.observation_space.shape[0]\n",
339 | "num_outputs = envs.action_space.shape[0]\n",
340 | "\n",
341 | "policy_model = Actor(num_inputs, num_outputs, hidden_size).to(device)\n",
342 | "policy_optimizer = optim.Adam(policy_model.parameters(), lr=policy_optimizer_lr) \n",
343 | "\n",
344 | "value_model = Critic(num_inputs, hidden_size).to(device)\n",
345 | "value_optimizer = optim.Adam(value_model.parameters(), lr=value_optimizer_lr)\n",
346 | "\n",
347 | "if LOAD_CHECKPOINT: \n",
348 | " loadCheckpoint(ACTOR_CHECKPOINT_PATH, policy_model)\n",
349 | " loadCheckpoint(CRITIC_CHECKPOINT_PATH, value_model) \n",
350 | " \n",
351 | "print(policy_model)\n",
352 | "print(value_model)"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": null,
358 | "metadata": {},
359 | "outputs": [],
360 | "source": [
361 | "def train():\n",
362 | " frame_idx = 0\n",
363 | " train_epoch = 0\n",
364 | "\n",
365 | " test_rewards = []\n",
366 | " best_reward = None\n",
367 | "\n",
368 | " state = envs.reset()\n",
369 | " early_stop = False\n",
370 | "\n",
371 | " while not early_stop:\n",
372 | " state = envs.reset()\n",
373 | "\n",
374 | " log_probs = []\n",
375 | " values = []\n",
376 | " states = []\n",
377 | " actions = []\n",
378 | " rewards = []\n",
379 | " masks = []\n",
380 | "\n",
381 | " for _ in range(num_steps):\n",
382 | " state = torch.FloatTensor(state).to(device)\n",
383 | " dist = policy_model(state)\n",
384 | " value = value_model(state)\n",
385 | "\n",
386 | " action = dist.sample()\n",
387 | " next_state, reward, done, _ = envs.step(action.cpu().numpy())\n",
388 | "\n",
389 | " log_prob = dist.log_prob(action)\n",
390 | "\n",
391 | " log_probs.append(log_prob)\n",
392 | " values.append(value)\n",
393 | " rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(device))\n",
394 | " masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(device))\n",
395 | "\n",
396 | " states.append(state)\n",
397 | " actions.append(action)\n",
398 | "\n",
399 | " state = next_state\n",
400 | " frame_idx += 1\n",
401 | "\n",
402 | " next_state = torch.FloatTensor(next_state).to(device)\n",
403 | " next_value = value_model(next_state)\n",
404 | " returns = compute_gae(next_value, rewards, masks, values)\n",
405 | "\n",
406 | " returns = torch.cat(returns).detach()\n",
407 | " log_probs = torch.cat(log_probs).detach()\n",
408 | " values = torch.cat(values).detach()\n",
409 | " states = torch.cat(states)\n",
410 | " actions = torch.cat(actions)\n",
411 | " advantage = returns - values\n",
412 | "\n",
413 | " ppo_update(ppo_epochs, mini_batch_size, states, actions, log_probs, returns, advantage)\n",
414 | " train_epoch += 1 \n",
415 | "\n",
416 | " if train_epoch % 10 == 0:\n",
417 | " test_reward = np.mean([test_env() for _ in range(10)]) \n",
418 | " test_rewards.append(test_reward)\n",
419 | " plot(train_epoch, test_rewards)\n",
420 | "\n",
421 | " if best_reward is None or best_reward < test_reward: \n",
422 | " if best_reward is not None: \n",
423 | " saveCheckpoint(ACTOR_FINAL_PATH, train_epoch, policy_model, policy_optimizer)\n",
424 | " saveCheckpoint(CRITIC_FINAL_PATH, train_epoch, value_model, value_optimizer)\n",
425 | "\n",
426 | " best_reward = test_reward\n",
427 | "\n",
428 | " if test_reward > threshold_reward: \n",
429 | " early_stop = True \n",
430 | "\n",
431 | " if train_epoch % 100 == 0:\n",
432 | " saveCheckpoint(ACTOR_CHECKPOINT_PATH, train_epoch, policy_model, policy_optimizer)\n",
433 | " saveCheckpoint(CRITIC_CHECKPOINT_PATH, train_epoch, value_model, value_optimizer)\n",
434 | " \n",
435 | "if DO_TRAINING:\n",
436 | " train()"
437 | ]
438 | },
439 | {
440 | "cell_type": "markdown",
441 | "metadata": {},
442 | "source": [
443 | "### Replay"
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": null,
449 | "metadata": {},
450 | "outputs": [],
451 | "source": [
452 | "env = gym.make(env_name)\n",
453 | "env.render(mode=\"human\")\n",
454 | "\n",
455 | "for i_episode in range(5):\n",
456 | " \n",
457 | " state = env.reset()\n",
458 | " done = False\n",
459 | " total_reward = 0\n",
460 | " \n",
461 | " frame_idx = 0\n",
462 | " \n",
463 | " distance = 3\n",
464 | " yaw = 0\n",
465 | " \n",
466 | " humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)\n",
467 | " pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) \n",
468 | " \n",
469 | " while not done:\n",
470 | " frame_idx += 1\n",
471 | " \n",
472 | " state = torch.FloatTensor(state).unsqueeze(0).to(device)\n",
473 | " dist = policy_model(state)\n",
474 | " action = dist.sample().cpu().numpy()[0]\n",
475 | " next_state, reward, done, _ = env.step(action)\n",
476 | " \n",
477 | " state = next_state\n",
478 | " total_reward += reward\n",
479 | " \n",
480 | " time.sleep(1/30)\n",
481 | " \n",
482 | " if frame_idx % 150 == 0:\n",
483 | " humanPos, humanOrn = pybullet.getBasePositionAndOrientation(1)\n",
484 | " pybullet.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) \n",
485 | " \n",
486 | " print(\"episode:\", i_episode, \"reward:\", total_reward, \"frames\", frame_idx)\n",
487 | "\n",
488 | "env.close()"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {},
495 | "outputs": [],
496 | "source": []
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": null,
501 | "metadata": {},
502 | "outputs": [],
503 | "source": []
504 | }
505 | ],
506 | "metadata": {
507 | "kernelspec": {
508 | "display_name": "Python 3",
509 | "language": "python",
510 | "name": "python3"
511 | },
512 | "language_info": {
513 | "codemirror_mode": {
514 | "name": "ipython",
515 | "version": 3
516 | },
517 | "file_extension": ".py",
518 | "mimetype": "text/x-python",
519 | "name": "python",
520 | "nbconvert_exporter": "python",
521 | "pygments_lexer": "ipython3",
522 | "version": "3.8.8"
523 | }
524 | },
525 | "nbformat": 4,
526 | "nbformat_minor": 2
527 | }
528 |
--------------------------------------------------------------------------------
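For reference, the compute_gae helper in HumanoidPyBulletEnv-v0.ipynb above is the standard GAE(gamma, lambda) recursion, with the tau argument playing the role of lambda. Below is a minimal standalone check on a hand-made three-step rollout, using plain Python floats instead of tensors; the rollout numbers are invented for illustration.

def compute_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.97):
    # Same recursion as the notebook, written for plain floats:
    #   delta_t  = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
    #   A_t      = delta_t + gamma * tau * mask_t * A_{t+1}
    #   return_t = A_t + V(s_t)
    values = values + [next_value]
    gae = 0.0
    returns = []
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * tau * masks[step] * gae
        returns.insert(0, gae + values[step])
    return returns

# Three-step rollout that terminates on the last step (mask = 0):
print(compute_gae(next_value=0.2,
                  rewards=[1.0, 1.0, 1.0],
                  masks=[1.0, 1.0, 0.0],
                  values=[0.5, 0.4, 0.3]))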