├── rlib ├── __init__.py ├── A2C │ ├── __init__.py │ ├── A2C.py │ ├── ActorCritic.py │ └── A2C_lstm.py ├── DDQN │ ├── __init__.py │ └── SyncDQN.py ├── RND │ ├── __init__.py │ └── RND.py ├── utils │ ├── __init__.py │ ├── __pycache__ │ │ └── SyncMultiEnvTrainer.cpython-35.pyc.140156829641872 │ ├── schedulers.py │ ├── random_agent.py │ ├── utils.py │ ├── play.py │ ├── ReplayMemory.py │ ├── VecEnv.py │ ├── wrappers.py │ └── SyncMultiEnvTrainer.py ├── Curiosity │ ├── __init__.py │ ├── logs │ │ └── Curiosity_LSTM │ │ │ └── FreewayDeterministic-v4 │ │ │ └── 19-08-04_16-50-38 │ │ │ └── train │ │ │ └── events.out.tfevents.1564933838.jhubuntu │ └── CuriosityA2C.py ├── networks │ ├── __init__.py │ └── networks.py ├── .vscode │ └── settings.json ├── A3C │ └── A3C.py ├── PPO │ └── PPO.py ├── VIN │ └── VIN.py ├── DAAC │ └── DAAC.py └── Unreal │ └── UnrealA2C2.py ├── setup.py ├── LICENSE ├── README.md └── .gitignore /rlib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/A2C/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/DDQN/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/RND/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/Curiosity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/networks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/utils/__pycache__/SyncMultiEnvTrainer.cpython-35.pyc.140156829641872: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rlib/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python3.5" 3 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='rlib', version='2.0', packages=find_packages()) 4 | -------------------------------------------------------------------------------- /rlib/Curiosity/logs/Curiosity_LSTM/FreewayDeterministic-v4/19-08-04_16-50-38/train/events.out.tfevents.1564933838.jhubuntu: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jhare96/reinforcement-learning/HEAD/rlib/Curiosity/logs/Curiosity_LSTM/FreewayDeterministic-v4/19-08-04_16-50-38/train/events.out.tfevents.1564933838.jhubuntu -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2021 Joshua Hare 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /rlib/utils/schedulers.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def polynomial_sheduler(optimiser, lr_final, decay_steps, power=1): 4 | lr_init = optimiser.defaults["lr"] 5 | assert lr_init > lr_final, f"lr_final ({lr_final}) must be be smaller than initial lr ({lr_init})" 6 | 7 | def polylambda(current_step: int): 8 | if current_step > decay_steps: 9 | return lr_final / lr_init # as LambdaLR multiplies by lr_init 10 | else: 11 | decay = (lr_init - lr_final) * (1 - current_step / decay_steps) ** power + lr_final 12 | return decay / lr_init # as LambdaLR multiplies by lr_init 13 | 14 | return torch.optim.lr_scheduler.LambdaLR(optimiser, polylambda, last_epoch=-1) -------------------------------------------------------------------------------- /rlib/utils/random_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import os, time 6 | import threading 7 | sns.set() 8 | 9 | ep_rewards = [] 10 | 11 | def run_episodes(env, number_episodes, max_steps): 12 | for episode in range(number_episodes): 13 | obs = env.reset() 14 | ep_score = 0 15 | for t in range(max_steps): 16 | action = env.action_space.sample() 17 | obs, r, done, info = env.step(action) 18 | ep_score += r 19 | 20 | if done: 21 | ep_rewards.append(ep_score) 22 | break 23 | def main(): 24 | 25 | env_id = 'MountainCar-v0' 26 | envs = [gym.make(env_id) for i in range(64)] 27 | num_eps = int(1e6) // 64 28 | max_steps = 1000 29 | 30 | ep_rewards = [] 31 | threads = [threading.Thread(target=run_episodes, args=(envs[i], num_eps, max_steps)) for i in range(len(envs))] 32 | 33 | for thread in threads: 34 | thread.start() 35 | 36 | for thread in threads: 37 | thread.join() 38 | 39 | 40 | ep_rewards = np.array(ep_rewards) 41 | avg_reward_line = np.ones_like(ep_rewards) * np.mean(ep_rewards) 42 | filename = 'experiments/random/' + env_id + '/' 43 | if not os.path.exists(filename): 44 | os.makedirs(filename) 45 | np.save(filename + str(num_eps * len(envs)) + 'random.npy', ep_rewards) 46 | plt.plot(ep_rewards) 47 | plt.plot(avg_reward_line, '--', color='0.5') 48 | plt.show() 49 | 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # reinforcement-learning 2 | 3 | A small Pytorch based reinforcement learning library 4 | as used for my MSc dissertation project ['Dealing with sparse rewards in reinforcement learning'](https://arxiv.org/abs/1910.09281). 
5 | 6 | This repository has working implementations of the following reinforcement agents: 7 | 1. Advantage Actor Critic [(A2C)](https://openai.com/blog/baselines-acktr-a2c/) 8 | 2. Synchronous n-step Double Deep Q Network (Sync-DDQN) 9 | 3. Proximal Policy Optimisation [(PPO)](https://arxiv.org/abs/1707.06347) 10 | 4. Random Network Distillation [(RND)](https://arxiv.org/abs/1810.12894) 11 | 5. UNREAL-A2C2, A2C-CNN version of the [(UNREAL agent)](https://deepmind.com/blog/article/reinforcement-learning-unsupervised-auxiliary-tasks) 12 | 6. Random Network Distillation with Auxiliary Learning (RANDAL), novel solution combining UNREAL and RND agents 13 | 14 | 15 | # Install repository: 16 | ```bash 17 | git clone https://github.com/jhare96/reinforcement-learning.git 18 | pip install -e reinforcement-learning 19 | ``` 20 | # To cite RANDAL agents in publications: 21 | follow the link to the ArXiv publication https://arxiv.org/abs/1910.09281 22 | 23 | # To cite this repository in publications: 24 | 25 | @misc{Hare_rlib, 26 | author = {Joshua Hare}, 27 | title = {reinforcement learning library, rlib}, 28 | year = {2019}, 29 | publisher = {GitHub}, 30 | journal = {GitHub repository}, 31 | howpublished = {\url{https://github.com/jhare96/reinforcement-learning}}, 32 | } 33 | 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python ### 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | 141 | ### Text ### 142 | *.doc 143 | *.docx 144 | *.msg 145 | *.pages 146 | *.rtf 147 | *.txt 148 | *.wpd 149 | *.wps 150 | 151 | ### VisualStudioCode ### 152 | .vscode/* 153 | !.vscode/settings.json 154 | !.vscode/tasks.json 155 | !.vscode/launch.json 156 | !.vscode/extensions.json 157 | *.code-workspace 158 | 159 | # Local History for Visual Studio Code 160 | .history/ 161 | 162 | ### VisualStudioCode Patch ### 163 | # Ignore all local history of files 164 | .history 165 | .ionide 166 | 167 | 168 | *.mo 169 | *.egg-info 170 | *.egg 171 | *.EGG 172 | *.EGG-INFO 173 | bin 174 | build 175 | develop-eggs 176 | downloads 177 | eggs 178 | fake-eggs 179 | parts 180 | dist 181 | .installed.cfg 182 | .mr.developer.cfg 183 | .hg 184 | .bzr 185 | .svn 186 | *.pyc 187 | *.pyo 188 | *.tmp* 189 | .vscode 190 | -------------------------------------------------------------------------------- /rlib/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | def fastsample(policy:np.ndarray, k=1): 5 | return torch.multinomial(torch.from_numpy(policy), num_samples=k, replacement=True).view(-1).numpy() 6 | 7 | def log_uniform(low=1e-10, high=1, size=()): 8 | return np.exp(np.random.uniform(low=np.log(low), high=np.log(high), size=size)) 9 | 10 | def stack_many(*args, stack=np.stack): 11 | return tuple([stack(arg) for arg in args]) 12 | 13 | def normalise(x, mean, std): 14 | return (x-mean)/std 15 | 16 | def fold_batch(x): 17 | rows, cols = x.shape[0], x.shape[1] 18 | y = x.reshape(rows*cols,*x.shape[2:]) 19 | return y 20 | 21 | def unfold_batch(x, length, batch_size): 22 | return x.reshape(length, batch_size, *x.shape[1:]) 23 | 24 | def fold_many(*args): 25 | return tuple([fold_batch(arg) for arg in args]) 26 | 27 | def one_hot(x, num_classes): 28 | return np.eye(num_classes)[x] 29 | 30 | def totorch(x, device='cuda'): 31 | x = torch.from_numpy(x).float().to(device) 32 | return x 33 | 34 | def tonumpy(x): 35 | return x.detach().cpu().numpy() 36 | 37 | def tonumpy_many(*args): 38 | return tuple([tonumpy(arg) for arg in args]) 39 | 40 | def totorch_many(*args, device='cuda'): 41 | return tuple([totorch(arg, device) for arg in args]) 42 | 43 | class Welfords_algorithm(object): 44 | #https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm 45 | def __init__(self, mean=0, epsilon=1e-4): 46 | self.mean = mean 47 | self.n = epsilon 48 | self.M2 = 1 49 | 50 | def update(self, x): 51 | return self.update_from_mean(x.mean(axis=0)) 52 | 53 | def update_from_mean(self, x): 54 | self.n +=1 55 | prev_mean = self.mean 56 | new_mean = prev_mean + ((x - prev_mean) / self.n) 57 | self.M2 += (x - new_mean) * (x - prev_mean) 58 | self.var = self.M2 / self.n 59 | self.mean = new_mean 60 | return self.mean, 
np.sqrt(self.var) 61 | 62 | #https://github.com/openai/baselines/blob/master/baselines/common/running_mean_std.py 63 | class RunningMeanStd(object): 64 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 65 | def __init__(self, epsilon=1e-4, shape=(), dtype=np.float32): 66 | self.mean = np.zeros(shape, dtype=dtype) 67 | self.var = np.ones(shape, dtype=dtype) 68 | self.count = epsilon 69 | 70 | def update(self, x): 71 | batch_mean = np.mean(x, axis=0) 72 | batch_var = np.var(x, axis=0) 73 | batch_count = x.shape[0] 74 | return self.update_from_moments(batch_mean, batch_var, batch_count) 75 | 76 | def update_from_moments(self, batch_mean, batch_var, batch_count): 77 | delta = batch_mean - self.mean 78 | tot_count = self.count + batch_count 79 | 80 | new_mean = self.mean + delta * batch_count / tot_count 81 | m_a = self.var * (self.count) 82 | m_b = batch_var * (batch_count) 83 | M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 84 | new_var = M2 / (self.count + batch_count) 85 | 86 | new_count = batch_count + self.count 87 | 88 | self.mean = new_mean 89 | self.var = new_var 90 | self.count = new_count 91 | 92 | return self.mean, np.sqrt(self.var) 93 | 94 | 95 | 96 | def nstep_return(rewards, last_values, dones, gamma=0.99, clip=False): 97 | if clip: 98 | rewards = np.clip(rewards, -1, 1) 99 | 100 | T = len(rewards) 101 | 102 | # Calculate R for advantage A = R - V 103 | R = np.zeros_like(rewards) 104 | R[-1] = last_values * (1-dones[-1]) 105 | 106 | for i in reversed(range(T-1)): 107 | # restart score if done as BatchEnv automatically resets after end of episode 108 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 109 | 110 | return R 111 | 112 | def lambda_return(rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 113 | if clip: 114 | rewards = np.clip(rewards, -1, 1) 115 | T = len(rewards) 116 | # Calculate eligibility trace R^lambda 117 | R = np.zeros_like(rewards) 118 | R[-1] = last_values * (1-dones[-1]) 119 | for t in reversed(range(T-1)): 120 | # restart score if done as BatchEnv automatically resets after end of episode 121 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 122 | 123 | return R 124 | 125 | def GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 126 | if clip: 127 | rewards = np.clip(rewards, -1, 1) 128 | # Generalised Advantage Estimation 129 | Adv = np.zeros_like(rewards) 130 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 131 | T = len(rewards) 132 | for t in reversed(range(T-1)): 133 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 134 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 135 | 136 | return Adv -------------------------------------------------------------------------------- /rlib/utils/play.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import pygame 3 | import matplotlib.pyplot as plt 4 | #from gym.utils import play 5 | 6 | from collections import deque 7 | from pygame.locals import VIDEORESIZE 8 | 9 | 10 | def display_arr(screen, arr, video_size, transpose): 11 | arr_min, arr_max = arr.min(), arr.max() 12 | arr = 255.0 * (arr - arr_min) / (arr_max - arr_min) 13 | pyg_img = pygame.surfarray.make_surface(arr.swapaxes(0, 1) if transpose else arr) 14 | pyg_img = pygame.transform.scale(pyg_img, video_size) 15 | screen.blit(pyg_img, (0,0)) 16 | 17 | def play(env, 
transpose=True, fps=30, zoom=None, callback=None, keys_to_action=None): 18 | """Allows one to play the game using the keyboard. 19 | To simply play the game use: 20 | play(gym.make("Pong-v4")) 21 | The above code also works if env is wrapped, so it is particularly useful for 22 | verifying that the frame-level preprocessing does not render the game 23 | unplayable. 24 | If you wish to plot real-time statistics as you play, you can use 25 | gym.utils.play.PlayPlot. Here's a sample code for plotting the reward 26 | for the last 5 seconds of gameplay. 27 | def callback(obs_t, obs_tp1, action, rew, done, info): 28 | return [rew,] 29 | plotter = PlayPlot(callback, 30 * 5, ["reward"]) 30 | env = gym.make("Pong-v4") 31 | play(env, callback=plotter.callback) 32 | Arguments 33 | --------- 34 | env: gym.Env 35 | Environment to use for playing. 36 | transpose: bool 37 | If True the output of observation is transposed. 38 | Defaults to True. 39 | fps: int 40 | Maximum number of steps of the environment to execute every second. 41 | Defaults to 30. 42 | zoom: float 43 | Scale the rendered frames by this factor. 44 | callback: lambda or None 45 | If a callback is provided it will be executed after 46 | every step. It takes the following input: 47 | obs_t: observation before performing action 48 | obs_tp1: observation after performing action 49 | action: action that was executed 50 | rew: reward that was received 51 | done: whether the environment is done or not 52 | info: debug info 53 | keys_to_action: dict: tuple(int) -> int or None 54 | Mapping from keys pressed to action performed. 55 | For example, if pressing 'w' and space at the same time is supposed 56 | to trigger action number 2 then the keys_to_action dict would look like this: 57 | { 58 | # ... 59 | tuple(sorted((ord('w'), ord(' ')))): 2 60 | # ... 61 | } 62 | If None, the default keys_to_action mapping for that env is used, if provided.
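    A complete call for an env without a built-in mapping (an illustrative
    sketch only; 'MountainCar-v0' and the chosen keys are assumptions, not
    part of this module):
        keys_to_action = {
            (): 1,           # no key pressed -> no push
            (ord('a'),): 0,  # 'a'            -> push left
            (ord('d'),): 2,  # 'd'            -> push right
        }
        play(gym.make('MountainCar-v0'), keys_to_action=keys_to_action, zoom=2)
    Each key must be a tuple of key codes in ascending (sorted) order, because
    the loop below looks actions up with tuple(sorted(pressed_keys)).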
63 | """ 64 | env.reset() 65 | rendered=env.render( mode='rgb_array') 66 | 67 | if keys_to_action is None: 68 | if hasattr(env, 'get_keys_to_action'): 69 | keys_to_action = env.get_keys_to_action() 70 | elif hasattr(env.unwrapped, 'get_keys_to_action'): 71 | keys_to_action = env.unwrapped.get_keys_to_action() 72 | else: 73 | assert False, env.spec.id + " does not have explicit key to action mapping, " + \ 74 | "please specify one manually" 75 | relevant_keys = set(sum(map(list, keys_to_action.keys()),[])) 76 | 77 | video_size=[rendered.shape[1],rendered.shape[0]] 78 | if zoom is not None: 79 | video_size = int(video_size[0] * zoom), int(video_size[1] * zoom) 80 | 81 | pressed_keys = [] 82 | running = True 83 | env_done = True 84 | 85 | screen = pygame.display.set_mode(video_size) 86 | clock = pygame.time.Clock() 87 | 88 | 89 | while running: 90 | if env_done: 91 | env_done = False 92 | obs = env.reset() 93 | else: 94 | action = keys_to_action.get(tuple(sorted(pressed_keys)), 0) 95 | prev_obs = obs 96 | obs, rew, env_done, info = env.step(action) 97 | if callback is not None: 98 | callback(prev_obs, obs, action, rew, env_done, info) 99 | if obs is not None: 100 | rendered=env.render( mode='rgb_array') 101 | display_arr(screen, rendered, transpose=transpose, video_size=video_size) 102 | 103 | # process pygame events 104 | for event in pygame.event.get(): 105 | # test events, set key states 106 | if event.type == pygame.KEYDOWN: 107 | if event.key in relevant_keys: 108 | pressed_keys.append(event.key) 109 | elif event.key == 27: 110 | running = False 111 | elif event.type == pygame.KEYUP: 112 | if event.key in relevant_keys: 113 | pressed_keys.remove(event.key) 114 | elif event.type == pygame.QUIT: 115 | running = False 116 | elif event.type == VIDEORESIZE: 117 | video_size = event.size 118 | screen = pygame.display.set_mode(video_size) 119 | print(video_size) 120 | 121 | pygame.display.flip() 122 | clock.tick(fps) 123 | pygame.quit() 124 | 125 | class PlayPlot(object): 126 | def __init__(self, callback, horizon_timesteps, plot_names): 127 | self.data_callback = callback 128 | self.horizon_timesteps = horizon_timesteps 129 | self.plot_names = plot_names 130 | 131 | assert plt is not None, "matplotlib backend failed, plotting will not work" 132 | 133 | num_plots = len(self.plot_names) 134 | self.fig, self.ax = plt.subplots(num_plots) 135 | if num_plots == 1: 136 | self.ax = [self.ax] 137 | for axis, name in zip(self.ax, plot_names): 138 | axis.set_title(name) 139 | self.t = 0 140 | self.cur_plot = [None for _ in range(num_plots)] 141 | self.data = [deque(maxlen=horizon_timesteps) for _ in range(num_plots)] 142 | 143 | def callback(self, obs_t, obs_tp1, action, rew, done, info): 144 | points = self.data_callback(obs_t, obs_tp1, action, rew, done, info) 145 | for point, data_series in zip(points, self.data): 146 | data_series.append(point) 147 | self.t += 1 148 | 149 | xmin, xmax = max(0, self.t - self.horizon_timesteps), self.t 150 | 151 | for i, plot in enumerate(self.cur_plot): 152 | if plot is not None: 153 | plot.remove() 154 | self.cur_plot[i] = self.ax[i].scatter(range(xmin, xmax), list(self.data[i]), c='blue') 155 | self.ax[i].set_xlim(xmin, xmax) 156 | plt.pause(0.000001) 157 | 158 | if __name__ == '__main__': 159 | env = gym.make('MountainCar-v0') 160 | def callback(obs_t, obs_tp1, action, rew, done, info): 161 | return [rew,] 162 | plotter = PlayPlot(callback, 30 * 5, ["reward"]) 163 | play(env, callback=plotter.callback) 
-------------------------------------------------------------------------------- /rlib/A2C/A2C.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import gym 3 | import numpy as np 4 | import time, datetime 5 | 6 | from rlib.networks.networks import* 7 | from rlib.utils.VecEnv import* 8 | from rlib.utils.wrappers import* 9 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 10 | from rlib.utils.utils import stack_many, totorch, fastsample 11 | from rlib.A2C.ActorCritic import ActorCritic 12 | 13 | class A2C(SyncMultiEnvTrainer): 14 | def __init__(self, envs, model, val_envs, train_mode='nstep', return_type='nstep', log_dir='logs/A2C', model_dir='models/A2C', total_steps=10000, nsteps=5, gamma=0.99, lambda_=0.95, 15 | validate_freq=1e6, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 16 | 17 | super().__init__(envs, model, val_envs, log_dir=log_dir, model_dir=model_dir, train_mode=train_mode, return_type=return_type, total_steps=total_steps, nsteps=nsteps, 18 | gamma=gamma, lambda_=lambda_, validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, 19 | num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 20 | 21 | hyperparas = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps , 'grad_clip':model.grad_clip, 'nsteps':nsteps, 'num_workers':self.num_envs, 22 | 'total_steps':total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff , 'return type':self.return_type, 'gamma':self.gamma, 'lambda':self.lambda_} 23 | 24 | if log_scalars: 25 | filename = log_dir + '/' + 'hyperparameters.txt' 26 | self.save_hyperparameters(filename , **hyperparas) 27 | 28 | def get_action(self, state): 29 | policy, value = self.model.evaluate(state) 30 | action = int(fastsample(policy)) 31 | return action 32 | 33 | def rollout(self,): 34 | rollout = [] 35 | for t in range(self.nsteps): 36 | policies, values = self.model.evaluate(self.states) 37 | actions = fastsample(policies) 38 | next_states, rewards, dones, infos = self.env.step(actions) 39 | rollout.append((self.states, actions, rewards, values, dones)) 40 | self.states = next_states 41 | 42 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 43 | _, last_values = self.model.evaluate(next_states) 44 | return states, actions, rewards, dones, values, last_values 45 | 46 | def _train_onestep(self): 47 | states = self.env.reset() 48 | y = np.zeros((self.num_envs)) 49 | num_steps = self.total_steps // self.num_envs 50 | for t in range(1,num_steps+1): 51 | policies, values = self.model.evaluate(self.states) 52 | actions = fastsample(policies) 53 | next_states, rewards, dones, infos = self.env.step(actions) 54 | y = rewards + self.gamma * self.model.get_value(next_states) * (1-dones) 55 | 56 | l = self.model.backprop(states, y, actions) 57 | states = next_states 58 | 59 | if self.render_freq > 0 and t % ((self.validate_freq // self.num_envs) * self.render_freq) == 0: 60 | render = True 61 | else: 62 | render = False 63 | 64 | if self.validate_freq > 0 and t % (self.validate_freq // self.num_envs) == 0: 65 | self.validation_summary(t,l,start,render) 66 | start = time.time() 67 | 68 | if self.save_freq > 0 and t % (self.save_freq // self.num_envs) == 0: 69 | self.s += 1 70 | self.save(self.s) 71 | print('saved model') 72 | 73 | 74 | 75 | def main(env_id): 76 | num_envs = 32 77 | nsteps = 20 78 | 79 | current_time 
= datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 80 | 81 | train_log_dir = 'logs/A2C/' + env_id +'/GAE/' + current_time 82 | model_dir = "models/A2C/" + env_id + '/GAE/' + current_time 83 | 84 | 85 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 86 | if any(env_id in s for s in classic_list): 87 | print('Classic Control') 88 | val_envs = [gym.make(env_id) for i in range(10)] 89 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 90 | 91 | elif 'ApplePicker' in env_id: 92 | print('ApplePicker') 93 | make_args = {'num_objects':100, 'default_reward':-0.1} 94 | val_envs = [gym.make(env_id, **make_args) for i in range(10)] 95 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, make_args=make_args) 96 | print(val_envs[0]) 97 | print(envs.envs[0]) 98 | 99 | else: 100 | print('Atari') 101 | env = gym.make(env_id) 102 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 103 | reset = True 104 | print('fire on reset') 105 | else: 106 | reset = False 107 | print('only stack frames') 108 | env.close() 109 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 110 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 111 | 112 | action_size = val_envs[0].action_space.n 113 | input_size = val_envs[0].reset().shape 114 | print('input shape', input_size) 115 | print('action space', action_size) 116 | 117 | 118 | 119 | model = ActorCritic(UniverseCNN, 120 | input_size, 121 | action_size, 122 | lr=1e-3, 123 | lr_final=1e-4, 124 | entropy_coeff=0.01, 125 | decay_steps=50e6//(num_envs*nsteps), 126 | grad_clip=0.5) 127 | 128 | 129 | a2c = A2C(envs=envs, 130 | model=model, 131 | model_dir=model_dir, 132 | log_dir=train_log_dir, 133 | val_envs=val_envs, 134 | train_mode='nstep', 135 | return_type='GAE', 136 | total_steps=50e6, 137 | nsteps=nsteps, 138 | validate_freq=1e5, 139 | save_freq=0, 140 | render_freq=0, 141 | num_val_episodes=50, 142 | log_scalars=False) 143 | 144 | a2c.train() 145 | 146 | del a2c 147 | 148 | # a2c = A2C.load(A2C, model, envs, val_envs, model_dir + time + '/1.trainer') 149 | # a2c.train() 150 | 151 | 152 | if __name__ == "__main__": 153 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 154 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 155 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 156 | #env_id_list = ['ApplePicker-v0'] 157 | for env_id in env_id_list: 158 | main(env_id) 159 | -------------------------------------------------------------------------------- /rlib/A2C/ActorCritic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from rlib.networks.networks import MaskedLSTMCell, MaskedRNN, MaskedLSTMBlock 5 | from rlib.utils.schedulers import polynomial_sheduler 6 | from rlib.utils.utils import totorch, tonumpy, totorch_many, tonumpy_many 7 | 8 | class ActorCritic(torch.nn.Module): 9 | def __init__(self, model, input_size, action_size, entropy_coeff=0.01, value_coeff=0.5, lr=1e-3, lr_final=1e-6, 10 | decay_steps=6e5, grad_clip=0.5, build_optimiser=True, optim=torch.optim.RMSprop, optim_args={}, device='cuda', **model_args): 11 | super(ActorCritic, self).__init__() 12 | self.lr = lr 13 | self.lr_final = lr_final 14 | self.entropy_coeff = 
entropy_coeff 15 | self.value_coeff = value_coeff 16 | self.decay_steps = decay_steps 17 | self.grad_clip = grad_clip 18 | self.action_size = action_size 19 | self.device = device 20 | 21 | self.model = model(input_size, **model_args).to(self.device) 22 | self.dense_size = self.model.dense_size 23 | self.policy_distrib = torch.nn.Linear(self.dense_size, action_size).to(self.device) # Actor 24 | self.V = torch.nn.Linear(self.dense_size, 1).to(self.device) # Critic 25 | 26 | if build_optimiser: 27 | self.optimiser = optim(self.parameters(), lr, **optim_args) 28 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 29 | 30 | def loss(self, policy, R, V, actions_onehot): 31 | Advantage = R - V 32 | value_loss = 0.5 * torch.mean(torch.square(Advantage)) 33 | 34 | log_policy = torch.log(torch.clip(policy, 1e-6, 0.999999)) 35 | log_policy_actions = torch.sum(log_policy * actions_onehot, dim=1) 36 | policy_loss = torch.mean(-log_policy_actions * Advantage.detach()) 37 | 38 | entropy = torch.mean(torch.sum(policy * -log_policy, dim=1)) 39 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 40 | return loss 41 | 42 | def forward(self, state): 43 | enc_state = self.model(state) 44 | policy = F.softmax(self.policy_distrib(enc_state), dim=-1) 45 | value = self.V(enc_state).view(-1) 46 | return policy, value 47 | 48 | def evaluate(self, state:np.ndarray): 49 | state = totorch(state, self.device) 50 | with torch.no_grad(): 51 | policy, value = self.forward(state) 52 | return tonumpy(policy), tonumpy(value) 53 | 54 | def backprop(self, state, R, action): 55 | state, R, action = totorch_many(state, R, action, device=self.device) 56 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 57 | policy, value = self.forward(state) 58 | loss = self.loss(policy, R, value, action_onehot) 59 | loss.backward() 60 | if self.grad_clip is not None: 61 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 62 | self.optimiser.step() 63 | self.optimiser.zero_grad() 64 | self.scheduler.step() 65 | return loss.detach().cpu().numpy() 66 | 67 | 68 | 69 | class ActorCritic_LSTM(torch.nn.Module): 70 | def __init__(self, model, input_size, action_size, cell_size, entropy_coeff=0.01, value_coeff=0.5, 71 | lr=1e-3, lr_final=1e-6, decay_steps=6e5, grad_clip=0.5, build_optimiser=True, optim=torch.optim.RMSprop, optim_args={}, device='cuda', **model_args): 72 | super(ActorCritic_LSTM, self).__init__() 73 | self.lr = lr 74 | self.lr_final = lr_final 75 | self.input_size = input_size 76 | self.entropy_coeff = entropy_coeff 77 | self.value_coeff = value_coeff 78 | self.decay_steps = decay_steps 79 | self.grad_clip = grad_clip 80 | self.cell_size = cell_size 81 | self.action_size = action_size 82 | self.device = device 83 | 84 | 85 | self.model = model(input_size, **model_args).to(self.device) 86 | self.dense_size = self.model.dense_size 87 | #self.lstm = MaskedRNN(MaskedLSTMCell(cell_size, self.dense_size), time_major=True) 88 | self.lstm = MaskedLSTMBlock(self.dense_size, cell_size, time_major=True).to(self.device) 89 | 90 | self.policy_distrib = torch.nn.Linear(cell_size, action_size, device=self.device) # Actor 91 | self.V = torch.nn.Linear(cell_size, 1, device=self.device) # Critic 92 | 93 | 94 | if build_optimiser: 95 | self.optimiser = optim(self.parameters(), lr, **optim_args) 96 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 97 | 98 | def loss(self, policy, R, V, actions_onehot): 99 | Advantage 
= R - V 100 | value_loss = 0.5 * torch.mean(torch.square(Advantage)) 101 | 102 | log_policy = torch.log(torch.clip(policy, 1e-6, 0.999999)) 103 | log_policy_actions = torch.sum(log_policy * actions_onehot, dim=1) 104 | policy_loss = torch.mean(-log_policy_actions * Advantage.detach()) 105 | 106 | entropy = torch.mean(torch.sum(policy * -log_policy, dim=1)) 107 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 108 | return loss 109 | 110 | def forward(self, state, hidden=None, done=None): 111 | T, num_envs = state.shape[:2] 112 | folded_state = state.view(-1, *self.input_size) 113 | enc_state = self.model(folded_state) 114 | folded_enc_state = enc_state.view(T, num_envs, self.dense_size) 115 | lstm_outputs, hidden = self.lstm(folded_enc_state, hidden, done) 116 | policy = F.softmax(self.policy_distrib(lstm_outputs), dim=-1).view(-1, self.action_size) 117 | value = self.V(lstm_outputs).view(-1) 118 | return policy, value, hidden 119 | 120 | def evaluate(self, state:np.ndarray, hidden:np.ndarray=None, done=None): 121 | state = totorch(state, self.device) 122 | hidden = totorch_many(*hidden, device=self.device) if hidden is not None else None 123 | with torch.no_grad(): 124 | policy, value, hidden = self.forward(state, hidden, done) 125 | return tonumpy(policy), tonumpy(value), tonumpy_many(*hidden) 126 | 127 | def backprop(self, state, R, action, hidden, done): 128 | state, R, action, done = totorch_many(state, R, action, done, device=self.device) 129 | hidden = totorch_many(*hidden, device=self.device) 130 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 131 | policy, value, hidden = self.forward(state, hidden, done) 132 | loss = self.loss(policy, R, value, action_onehot) 133 | loss.backward() 134 | if self.grad_clip is not None: 135 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 136 | self.optimiser.step() 137 | self.optimiser.zero_grad() 138 | self.scheduler.step() 139 | return loss.detach().cpu().numpy() 140 | 141 | def get_initial_hidden(self, batch_size): 142 | return np.zeros((1, batch_size, self.cell_size)), np.zeros((1, batch_size, self.cell_size)) 143 | 144 | def mask_hidden(self, hidden, dones): 145 | mask = (1-dones).reshape(-1, 1) 146 | return (hidden[0]*mask, hidden[1]*mask) -------------------------------------------------------------------------------- /rlib/A3C/A3C.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import torch 3 | import torch.multiprocessing as mp 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import math 7 | import time 8 | 9 | from rlib.A2C.ActorCritic import ActorCritic 10 | from rlib.networks.networks import NatureCNN 11 | from rlib.utils.wrappers import AtariEnv 12 | from rlib.utils.utils import stack_many, tonumpy, totorch, lambda_return 13 | 14 | 15 | def train(global_model, model, env, nsteps, num_episodes, ID): 16 | opt = torch.optim.RMSprop(global_model.parameters(), lr=1e-3) 17 | episode = 0 18 | episode_steps = 0 19 | episode_score = 0 20 | T = 0 21 | state = env.reset() 22 | start = time.time() 23 | while episode < num_episodes: 24 | rollout = [] 25 | for t in range(nsteps): 26 | with torch.no_grad(): 27 | policy, value = model(totorch(state[None], device='cpu')) 28 | policy, value = tonumpy(policy), tonumpy(value) 29 | action = np.random.choice(policy.shape[1], p=policy[0]) 30 | next_state, reward, done, info = env.step(action) 31 | episode_score += reward 32 | rollout.append((state, action, 
reward, value, done)) 33 | state = next_state 34 | 35 | T += 1 36 | episode_steps += 1 37 | 38 | if done or t == nsteps-1: 39 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 40 | with torch.no_grad(): 41 | _, last_values = model.forward(totorch(next_state[None], device='cpu')) 42 | last_values = last_values.cpu().numpy() 43 | 44 | 45 | R = lambda_return(rewards, values, last_values, dones, gamma=0.9, lambda_=0.95, clip=False) 46 | 47 | loss = update_params(model, global_model, opt, states, actions, R) 48 | 49 | #self.T += t 50 | 51 | if done: 52 | episode += 1 53 | state = env.reset() 54 | if episode % 1 == 0: 55 | time_taken = time.time() - start 56 | print(f'worker {ID}, total worker steps {T:,} local episode {episode}, episode score {episode_score} episode steps {episode_steps}, time taken {time_taken:,.1f}s, fps {episode_steps/time_taken:.2f}') 57 | episode_steps = 0 58 | episode_score = 0 59 | start = time.time() 60 | break 61 | 62 | 63 | def update_params(lm, gm, gopt, states, actions, R): 64 | states, R, actions = totorch(states, 'cpu'), totorch(R, 'cpu'), totorch(actions, 'cpu') 65 | actions_onehot = F.one_hot(actions.long(), num_classes=lm.action_size) 66 | policies, values = lm.forward(states) 67 | loss = lm.loss(policies, R, values, actions_onehot) 68 | 69 | loss.backward() 70 | 71 | if lm.grad_clip is not None: 72 | torch.nn.utils.clip_grad_norm_(lm.parameters(), lm.grad_clip) 73 | 74 | for local_param, global_param in zip(lm.parameters(), gm.parameters()): 75 | global_param._grad = local_param.grad 76 | 77 | gopt.step() 78 | gopt.zero_grad() 79 | #self.scheduler.step() 80 | 81 | lm.load_state_dict(gm.state_dict()) 82 | return loss.detach().cpu().numpy() 83 | 84 | 85 | 86 | # class SharedAdam(torch.optim.Adam): 87 | # def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8, 88 | # weight_decay=0): 89 | # super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) 90 | # # State initialization 91 | # for group in self.param_groups: 92 | # for p in group['params']: 93 | # state = self.state[p] 94 | # state['step'] = 0 95 | # state['exp_avg'] = torch.zeros_like(p.data) 96 | # state['exp_avg_sq'] = torch.zeros_like(p.data) 97 | 98 | # # share in memory 99 | # state['exp_avg'].share_memory_() 100 | # state['exp_avg_sq'].share_memory_() 101 | 102 | class SharedAdam(torch.optim.Adam): 103 | """Implements Adam algorithm with shared states. 104 | """ 105 | 106 | def __init__(self, 107 | params, 108 | lr=1e-3, 109 | betas=(0.9, 0.999), 110 | eps=1e-8, 111 | weight_decay=0): 112 | super(SharedAdam, self).__init__(params, lr, betas, eps, weight_decay) 113 | 114 | for group in self.param_groups: 115 | for p in group['params']: 116 | state = self.state[p] 117 | state['step'] = torch.zeros(1) 118 | state['exp_avg'] = p.data.new().resize_as_(p.data).zero_() 119 | state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_() 120 | 121 | def share_memory(self): 122 | for group in self.param_groups: 123 | for p in group['params']: 124 | state = self.state[p] 125 | state['step'].share_memory_() 126 | state['exp_avg'].share_memory_() 127 | state['exp_avg_sq'].share_memory_() 128 | 129 | def step(self, closure=None): 130 | """Performs a single optimization step. 131 | Arguments: 132 | closure (callable, optional): A closure that reevaluates the model 133 | and returns the loss. 
134 | """ 135 | loss = None 136 | if closure is not None: 137 | loss = closure() 138 | 139 | for group in self.param_groups: 140 | for p in group['params']: 141 | if p.grad is None: 142 | continue 143 | grad = p.grad.data 144 | state = self.state[p] 145 | 146 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 147 | beta1, beta2 = group['betas'] 148 | 149 | state['step'] += 1 150 | 151 | if group['weight_decay'] != 0: 152 | grad = grad.add(group['weight_decay'], p.data) 153 | 154 | # Decay the first and second moment running average coefficient 155 | exp_avg.mul_(beta1).add_(1 - beta1, grad) 156 | exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) 157 | 158 | denom = exp_avg_sq.sqrt().add_(group['eps']) 159 | 160 | bias_correction1 = 1 - beta1 ** state['step'].item() 161 | bias_correction2 = 1 - beta2 ** state['step'].item() 162 | step_size = group['lr'] * math.sqrt( 163 | bias_correction2) / bias_correction1 164 | 165 | p.data.addcdiv_(-step_size, exp_avg, denom) 166 | 167 | return loss 168 | 169 | 170 | 171 | 172 | if __name__ == '__main__': 173 | env_id = 'SpaceInvadersDeterministic-v4' 174 | env = AtariEnv(gym.make(env_id), reset=True) 175 | input_size = env.reset().shape 176 | action_size = env.action_space.n 177 | 178 | print('action_size', action_size) 179 | 180 | global_model = ActorCritic(NatureCNN, input_size, action_size, build_optimiser=False) 181 | global_model.share_memory() 182 | 183 | #opt = SharedAdam(global_model.parameters(), lr=1e-3) 184 | #opt.share_memory() 185 | 186 | #actor = ActorCritic(NatureCNN, input_size, action_size) 187 | env_args = dict(k=4, rescale=84, episodic=True, reset=True, clip_reward=True, Noop=True, time_limit=None, channels_first=True) 188 | model_args = dict(model=NatureCNN, input_size=input_size, action_size=action_size, build_optimiser=False) 189 | 190 | processes = [] 191 | for rank in range(8): 192 | p = mp.Process(target=train, args=(global_model, ActorCritic(**model_args), AtariEnv(gym.make(env_id), **env_args), 20, 1000, rank)) 193 | p.start() 194 | processes.append(p) 195 | time.sleep(0.5) 196 | for p in processes: 197 | p.join() -------------------------------------------------------------------------------- /rlib/utils/ReplayMemory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import time, copy 4 | import scipy.misc 5 | from collections import deque 6 | #from DoubleDQN import ReplayMemory as RM 7 | 8 | class FrameBuffer(object): 9 | def __init__(self, size, width, height, stack, Atari = True): 10 | self._idx = 0 11 | self._replay_length = size 12 | self._stack_size = stack 13 | self._Atari = Atari 14 | self._frames = np.empty((size,width,height), dtype=np.uint8) 15 | self._blank_frame = np.zeros((width,height)) 16 | self._stacked_frames = deque([self._blank_frame for i in range(self._stack_size)], maxlen=self._stack_size) 17 | 18 | def preprocess_frame(self,frame): 19 | frame = scipy.misc.imresize(frame, [110,84,3])[110-84:,0:84,:] 20 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 21 | return frame 22 | 23 | def addFrame(self,frame): 24 | self._frames[self._idx] = self.preprocess_frame(frame) 25 | self._idx = (self._idx +1) % self._replay_length 26 | 27 | def stack_frames(self,frame,reset=False): 28 | self.addFrame(frame) 29 | if reset: 30 | for _ in range(self._stack_size): 31 | self._stacked_frames.append(self._frames[self._idx-1]) 32 | else: 33 | self._stacked_frames.append(self._frames[self._idx-1]) 
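        # note: the reset branch above fills the deque with stack_size copies of
        # the frame just written by addFrame (self._frames[self._idx-1]), so the
        # first stacked observation of an episode has a full, repeated history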
34 | 35 | return copy.copy(self._stacked_frames) 36 | 37 | 38 | class NumpyReplayMemory(object): 39 | def __init__(self, replaysize, shape): 40 | self._idx = 0 41 | self._full_flag = False 42 | self._replay_length = replaysize 43 | self._states = np.zeros((replaysize,*shape), dtype=np.uint8) 44 | self._actions = np.zeros((replaysize), dtype=np.int) 45 | self._rewards = np.zeros((replaysize), dtype=np.int) 46 | self._next_states = np.zeros((replaysize,*shape), dtype=np.uint8) 47 | self._dones = np.zeros((replaysize), dtype=np.int) 48 | #self._stacked_frames = deque([np.zeros((width,height), dtype=np.uint8) for i in range(stack)], maxlen=stack) 49 | 50 | def addMemory(self,state,action,reward,next_state,done): 51 | self._states[self._idx] = state 52 | self._actions[self._idx] = action 53 | self._rewards[self._idx] = reward 54 | self._next_states[self._idx] = next_state 55 | self._dones[self._idx] = done 56 | if self._idx + 1 >= self._replay_length: 57 | self._idx = 0 58 | self._full_flag = True 59 | else: 60 | self._idx += 1 61 | 62 | def __len__(self): 63 | if self._full_flag == False: 64 | return self._idx 65 | else: 66 | return self._replay_length 67 | 68 | 69 | def sample(self,batch_size): 70 | if self._full_flag == False: 71 | idxs = np.random.choice(self._idx, size=batch_size, replace=False) 72 | else: 73 | idxs = np.random.choice(self._replay_length, size=batch_size, replace=False) 74 | 75 | states = self._states[idxs] 76 | actions = self._actions[idxs] 77 | rewards = self._rewards[idxs] 78 | next_states = self._next_states[idxs] 79 | dones = self._dones[idxs] 80 | 81 | return states, actions, rewards, next_states, dones, idxs 82 | 83 | class replayMemory(object): 84 | def __init__(self,replay_length,pixels=True): 85 | self._replay_length = replay_length 86 | self._pixels = pixels 87 | self._memory = [] 88 | self._idx = 0 89 | 90 | def addMemory(self,state,action,reward,next_state,final_state): 91 | if len(self._memory) < self._replay_length: 92 | self._memory.append((state,action,reward,next_state,final_state)) 93 | else: 94 | self._memory[self._idx] = (state,action,reward,next_state,final_state) 95 | self._idx = (self._idx +1) % self._replay_length 96 | 97 | def getlen(self): 98 | return len(self._memory) 99 | 100 | def resetMemory(self): 101 | self._memory = [] 102 | self._idx = 0 103 | 104 | def sample(self, batch_size): 105 | idxs = np.random.choice(np.arange(len(self._memory)), size=batch_size, replace=False) 106 | sample = [self._memory[i] for i in idxs ] 107 | 108 | if self._pixels: #stack images to get k previous states 109 | states = np.stack([np.stack(sample[i][0],axis=2) for i in range(len(sample))],axis=0) 110 | next_states = np.stack([np.stack(sample[i][3],axis=2) for i in range(len(sample))],axis=0) 111 | else: 112 | states = np.stack([sample[i][0]for i in range(len(sample))],axis=0) 113 | next_states = np.stack([sample[i][3] for i in range(len(sample))],axis=0) 114 | 115 | actions = np.array([sample[i][1]for i in range(len(sample))]) 116 | rewards = np.array([sample[i][2]for i in range(len(sample))]) 117 | final_state = np.array([sample[i][4]for i in range(len(sample))]) 118 | 119 | return (states,actions,rewards,next_states,final_state) 120 | 121 | 122 | 123 | def stack_frames(frame,stacked_frames,reset=False): 124 | # Preprocess frame 125 | frame = preprocess_frame(frame) 126 | 127 | if reset: 128 | # Clear our stacked_frames 129 | stacked_frames = deque([np.zeros((84,84), dtype=np.uint8) for i in range(4)], maxlen=4) 130 | 131 | # Because we're in a new episode, 
copy the same frame 4x 132 | for i in range(4): 133 | stacked_frames.append(frame) 134 | 135 | # Stack the frames 136 | stacked_state = np.stack(stacked_frames, axis=2) 137 | 138 | else: 139 | # Append frame to deque, automatically removes the oldest frame 140 | stacked_frames.append(frame) 141 | 142 | # Build the stacked state (first dimension specifies different frames) 143 | stacked_state = np.stack(stacked_frames, axis=2) 144 | 145 | return stacked_state, stacked_frames 146 | 147 | def preprocess_frame(frame): 148 | frame = scipy.misc.imresize(frame, [110,84,3])[110-84:,0:84,:] 149 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 150 | return frame 151 | 152 | def main(): 153 | env = gym.make('SpaceInvaders-v0') 154 | replay =NumpyReplayMemory(100000,84,84,4) 155 | #framebuffer = FrameBuffer(100000,84,84,4) 156 | 157 | obs = env.reset() 158 | #state = framebuffer.stack_frames(obs,reset=True) 159 | state = replay.stack_frames(obs,reset=True) 160 | print("state shape", state.shape) 161 | avg_time = 0 162 | for t in range(int(1e7)): 163 | start = time.time() 164 | action = env.action_space.sample() 165 | obs, reward, done, info = env.step(action) 166 | #next_state = framebuffer.stack_frames(obs,reset=False) 167 | next_state = replay.stack_frames(obs,reset=False) 168 | print("next state shape", next_state.shape) 169 | replay.addMemory(state,action,reward,next_state,done) 170 | state = next_state 171 | if done: 172 | obs = env.reset() 173 | #state = framebuffer.stack_frames(obs,reset=True) 174 | state = replay.stack_frames(obs,reset=True) 175 | 176 | 177 | if t > 32 : 178 | 179 | batch_states, batch_actions, batch_rewards, batch_next_states, batch_final_states = replay.sample(32) 180 | end = time.time() 181 | #if t % 100 == 0: 182 | #print("next_state ", t) 183 | #print("batch_actions shape", batch_actions.shape) 184 | #print("next_state shape", len(batch_next_states), ",", batch_states[0].shape) 185 | #for i in range(4): 186 | #scipy.misc.imshow(batch_next_states[-1,:,:,i]) 187 | 188 | avg_time += (end-start) 189 | if t % 10000 == 0: 190 | print("time taken for 10000 steps", avg_time) 191 | avg_time = 0 192 | 193 | if __name__ == "__main__": 194 | main() -------------------------------------------------------------------------------- /rlib/utils/VecEnv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import multiprocessing as mp 4 | from itertools import chain 5 | 6 | # Code was inspired from or modified from OpenAI baselines https://github.com/openai/baselines/tree/master/baselines/common 7 | 8 | class Env(object): 9 | def __init__(self, env, worker_id=0): #, Wrappers=None, **wrapper_args): 10 | #self.env_id = env_id 11 | #env = gym.make(env_id) 12 | self.parent, self.child = mp.Pipe() 13 | self.worker = Worker(worker_id, env, self.child) 14 | self.worker.daemon = True 15 | self.worker.start() 16 | self.open = True 17 | 18 | def __del__(self): 19 | self.close() 20 | self.parent.close() 21 | self.child.close() 22 | 23 | def __getattr__(self, name): 24 | attribute = self._send_step('getattr', name) 25 | return attribute() 26 | 27 | def _send_step(self,cmd,action): 28 | self.parent.send((cmd,action)) 29 | return self._recieve 30 | 31 | def _recieve(self,): 32 | return self.parent.recv() 33 | 34 | def step(self,action, blocking=True): 35 | #if self.open: 36 | results = self._send_step('step', action) 37 | # if blocking: 38 | # return results() 39 | # else: 40 | return results 
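    # Non-blocking usage (an illustrative sketch mirroring the non-blocking path
    # of BatchEnv.step further down, not a new API): step() only sends the command
    # down the pipe and returns the bound _recieve method, so the caller decides
    # when to block on the result, e.g.
    #   pending = [env.step(a, blocking=False) for env, a in zip(envs, actions)]
    #   obs, rewards, dones, infos = zip(*[recv() for recv in pending])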
41 | 42 | def reset(self): 43 | #if self.open: 44 | results = self._send_step('reset', None) 45 | return results() 46 | 47 | def close(self): 48 | if self.open: 49 | self.open = False 50 | results = self._send_step('close', None) 51 | self.worker.join() 52 | 53 | def render(self): 54 | #if self.open: 55 | self._send_step('render', None) 56 | 57 | class Worker(mp.Process): 58 | def __init__(self, worker_id, env, connection): 59 | import gym 60 | np.random.seed() 61 | mp.Process.__init__(self) 62 | self.env = env #gym.make(env_id) 63 | self.worker_id = worker_id 64 | self.connection = connection 65 | 66 | def _step(self): 67 | try: 68 | while True: 69 | cmd, a = self.connection.recv() 70 | if cmd == 'step': 71 | obs, r, done, info = self.env.step(a) 72 | # auto_reset moved to env wrappers 73 | self.connection.send((obs,r,done,info)) 74 | elif cmd == 'render': 75 | self.env.render() 76 | #self.connection.send((1)) 77 | elif cmd == 'reset': 78 | obs = self.env.reset() 79 | self.connection.send(obs) 80 | elif cmd == 'getattr': 81 | self.connection.send(getattr(self.env, a)) 82 | elif cmd == 'close': 83 | self.env.close() 84 | #self.connection.send((1)) 85 | break 86 | except KeyboardInterrupt: 87 | print("closing worker", self.worker_id) 88 | finally: 89 | self.env.close() 90 | #self.connection.close() 91 | 92 | def run(self,): 93 | self._step() 94 | 95 | 96 | 97 | class BatchEnv(object): 98 | def __init__(self, env_constructor, env_id, num_envs, blocking=False, make_args={}, **env_args): 99 | #self.envs = [Env(env_constructor(gym.make(env_id),**env_args),worker_id=i) for i in range(num_envs)] 100 | self.envs = [] 101 | for i in range(num_envs): 102 | env = gym.make(env_id, **make_args) 103 | self.envs.append(Env(env_constructor(env, **env_args))) 104 | #self.envs = [env_constructor(env_id=env_id,**env_args, worker_id=i) for i in range(num_envs)] 105 | self.blocking = blocking 106 | 107 | def __len__(self): 108 | return len(self.envs) 109 | 110 | def __getattr__(self, name): 111 | return getattr(self.envs[0], name) 112 | 113 | def step(self,actions): 114 | if self.blocking: # wait for each process to return results before starting the next 115 | results = [env.step(action,True) for env, action in zip(self.envs,actions)] 116 | else: 117 | results = [env.step(action,False) for env, action in zip(self.envs,actions)] # apply steps async 118 | results = [result() for result in results] # collect results 119 | 120 | obs, rewards, done, info = zip(*results) 121 | return np.stack(obs), np.stack(rewards), np.stack(done), info 122 | 123 | def reset(self): 124 | obs = [env.reset() for env in self.envs] 125 | return np.stack(obs) 126 | 127 | def close(self): 128 | for env in self.envs: 129 | env.close() 130 | 131 | 132 | 133 | def chunks(l, n): 134 | for i in range(0, len(l), n): 135 | yield l[i:i+n] 136 | 137 | class ChunkEnv(object): 138 | def __init__(self, env_id, num_workers, num_chunks): 139 | self.num_workers = num_workers 140 | self.num_chunks = num_chunks 141 | self.env_id = env_id 142 | 143 | self.workers = [] 144 | self.parents = [] 145 | for i in range(num_workers): 146 | parent, child = mp.Pipe() 147 | worker = ChunkWorker(env_id,num_chunks,child) 148 | self.parents.append(parent) 149 | self.workers.append(worker) 150 | 151 | try: 152 | for worker in self.workers: 153 | worker.start() 154 | 155 | except KeyboardInterrupt: 156 | self.close() 157 | exit() 158 | #for w in self.workers: 159 | #w.env.close() 160 | #w.terminate() 161 | #exit() 162 | 163 | 164 | def _send_step(self,cmd,actions): 165 
| for parent, action_chunk in zip(self.parents,chunks(actions, self.num_chunks)): 166 | parent.send((cmd,action_chunk)) 167 | return self._recieve 168 | 169 | def _recieve(self,): 170 | return [parent.recv() for parent in self.parents] 171 | 172 | def step(self,actions,blocking=True): 173 | results = self._send_step('step', actions) 174 | if blocking: 175 | results = list(chain.from_iterable(results())) 176 | obs, rewards, dones, infos = zip(*results) 177 | return np.stack(obs), np.stack(rewards), np.stack(dones), infos 178 | else: 179 | return results 180 | 181 | def reset(self): 182 | results = self._send_step('reset',np.zeros((self.num_chunks*self.num_workers))) 183 | results = list(chain.from_iterable(results())) 184 | return np.stack(results) 185 | 186 | def close(self): 187 | results = self._send_step('close',np.zeros((self.num_chunks*self.num_workers))) 188 | for worker in self.workers: 189 | worker.join() 190 | 191 | class ChunkWorker(mp.Process): 192 | def __init__(self, env_id, num_chunks, connection, render=False): 193 | mp.Process.__init__(self) 194 | self.envs = [gym.make(env_id) for i in range(num_chunks)] 195 | self.connection = connection 196 | self.render = render 197 | 198 | def run(self): 199 | while True: 200 | cmd, actions = self.connection.recv() 201 | if cmd == 'step': 202 | results = [] 203 | for a, env in zip(actions,self.envs): 204 | obs, r, done, info = env.step(a) 205 | # auto_reset moved to env wrappers 206 | if self.render: 207 | self.env.render() 208 | results.append((obs,r,done,info)) 209 | self.connection.send(results) 210 | elif cmd == 'reset': 211 | results = [] 212 | for a, env in zip(actions,self.envs): 213 | obs = env.reset() 214 | results.append(obs) 215 | self.connection.send(results) 216 | elif cmd == 'close': 217 | for env in self.envs: 218 | env.close() 219 | self.connection.send((1)) 220 | break 221 | 222 | 223 | class DummyBatchEnv(object): 224 | def __init__(self, env_constructor, env_id, num_envs, make_args={}, **env_args): 225 | self.envs = [env_constructor(gym.make(env_id, **make_args),**env_args) for i in range(num_envs)] 226 | 227 | def __len__(self): 228 | return len(self.envs) 229 | 230 | def __getattr__(self, name): 231 | return getattr(self.envs[0], name) 232 | 233 | def step(self,actions): 234 | results = [env.step(action) for env, action in zip(self.envs,actions)] 235 | obs, rewards, done, info = zip(*results) 236 | return np.stack(obs).copy(), np.stack(rewards).copy(), np.stack(done).copy(), info 237 | 238 | def reset(self): 239 | obs = [env.reset() for env in self.envs] 240 | return np.stack(obs).copy() 241 | 242 | def close(self): 243 | for env in self.envs: 244 | env.close() -------------------------------------------------------------------------------- /rlib/A2C/A2C_lstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import gym 4 | import os, time, datetime 5 | import threading 6 | from rlib.A2C.ActorCritic import ActorCritic_LSTM 7 | from rlib.networks.networks import* 8 | from rlib.utils.utils import fold_batch, stack_many, totorch, fastsample 9 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 10 | from rlib.utils.VecEnv import* 11 | from rlib.utils.wrappers import* 12 | 13 | 14 | 15 | class A2CLSTM_Trainer(SyncMultiEnvTrainer): 16 | def __init__(self, envs, model, val_envs, train_mode='nstep', return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=20, 17 | validate_freq=1e6, save_freq=0, 
render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 18 | 19 | super().__init__(envs, model, val_envs, log_dir=log_dir, model_dir=model_dir, train_mode=train_mode, return_type=return_type, 20 | total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, save_freq=save_freq, 21 | render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 22 | 23 | 24 | self.prev_hidden = self.model.get_initial_hidden(self.num_envs) 25 | 26 | hyper_params = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps , 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 27 | 'total_steps':self.total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff, 'gamma':self.gamma, 'lambda':self.lambda_} 28 | 29 | if self.log_scalars: 30 | filename = log_dir + '/hyperparameters.txt' 31 | self.save_hyperparameters(filename, **hyper_params) 32 | 33 | def _train_nstep(self): 34 | batch_size = (self.num_envs * self.nsteps) 35 | start = time.time() 36 | num_updates = self.total_steps // batch_size 37 | s = 0 38 | # main loop 39 | for t in range(1,num_updates+1): 40 | states, actions, rewards, first_hidden, dones, values, last_values = self.rollout() 41 | 42 | if self.return_type == 'nstep': 43 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 44 | elif self.return_type == 'GAE': 45 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 46 | elif self.return_type == 'lambda': 47 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 48 | 49 | # stack all states, actions and Rs across all workers into a single batch 50 | actions, R = fold_batch(actions), fold_batch(R) 51 | l = self.model.backprop(states, R, actions, first_hidden, dones) 52 | 53 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 54 | render = True 55 | else: 56 | render = False 57 | 58 | if self.validate_freq > 0 and t % (self.validate_freq //batch_size) == 0: 59 | self.validation_summary(t,l,start,render) 60 | start = time.time() 61 | 62 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 63 | s += 1 64 | self.saver.save(self.sess, str(self.model_dir + str(s) + ".ckpt") ) 65 | print('saved model') 66 | 67 | 68 | def _validate_async(self, env, num_ep, max_steps, render=False): 69 | for episode in range(num_ep): 70 | state = env.reset() 71 | episode_score = [] 72 | hidden = self.model.get_initial_hidden(1) 73 | for t in range(max_steps): 74 | policy, value, hidden = self.model.evaluate(state[None, None], hidden) 75 | #print('policy', policy, 'value', value) 76 | action = int(fastsample(policy)) 77 | next_state, reward, done, info = env.step(action) 78 | state = next_state 79 | 80 | episode_score.append(reward) 81 | 82 | if render: 83 | with self.lock: 84 | env.render() 85 | 86 | if done or t == max_steps -1: 87 | tot_reward = np.sum(episode_score) 88 | with self.lock: 89 | self.validate_rewards.append(tot_reward) 90 | 91 | break 92 | if render: 93 | with self.lock: 94 | env.close() 95 | 96 | def validate_sync(self, render): 97 | episode_scores = [] 98 | env = self.val_envs 99 | for episode in range(self.num_val_episodes//len(env)): 100 | states = env.reset() 101 | episode_score = [] 102 | prev_hidden = self.model.get_initial_hidden(len(self.val_envs)) 103 | for t in 
range(self.val_steps): 104 | policies, values, hidden = self.model.evaluate(states[None], prev_hidden) 105 | actions = fastsample(policies) 106 | next_states, rewards, dones, infos = env.step(actions) 107 | states = next_states 108 | 109 | episode_score.append(rewards*(1-dones)) 110 | 111 | if render: 112 | with self.lock: 113 | env.render() 114 | 115 | if dones.sum() == self.num_envs or t == self.val_steps -1: 116 | tot_reward = np.sum(np.stack(episode_score), axis=0) 117 | episode_scores.append(tot_reward) 118 | break 119 | 120 | return np.mean(episode_scores) 121 | 122 | 123 | def rollout(self,): 124 | rollout = [] 125 | first_hidden = self.prev_hidden 126 | for t in range(self.nsteps): 127 | policies, values, hidden = self.model.evaluate(self.states[None], self.prev_hidden) 128 | actions = fastsample(policies) 129 | next_states, rewards, dones, infos = self.env.step(actions) 130 | rollout.append((self.states, actions, rewards, values, dones)) 131 | self.states = next_states 132 | self.prev_hidden = self.model.mask_hidden(hidden, dones) # reset hidden state at end of episode 133 | 134 | states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 135 | _, last_values, _ = self.model.evaluate(self.states[None], self.prev_hidden) 136 | return states, actions, rewards, first_hidden, dones, values, last_values 137 | 138 | 139 | def main(env_id): 140 | num_envs = 32 141 | nsteps = 20 142 | 143 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 144 | if any(env_id in s for s in classic_list): 145 | print('Classic Control') 146 | val_envs = [gym.make(env_id) for i in range(10)] 147 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 148 | 149 | elif 'ApplePicker' in env_id: 150 | print('ApplePicker') 151 | make_args = {'num_objects':100, 'default_reward':-0.1} 152 | val_envs = [gym.make(env_id, **make_args) for i in range(10)] 153 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, make_args=make_args) 154 | print(val_envs[0]) 155 | print(envs.envs[0]) 156 | 157 | else: 158 | print('Atari') 159 | env = gym.make(env_id) 160 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 161 | reset = True 162 | print('fire on reset') 163 | else: 164 | reset = False 165 | print('only stack frames') 166 | 167 | env.close() 168 | val_envs = [AtariEnv(gym.make(env_id), k=1, rescale=84, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 169 | envs = BatchEnv(AtariEnv, env_id, num_envs, rescale=84, blocking=False , k=1, reset=reset, episodic=False, clip_reward=True) 170 | 171 | action_size = val_envs[0].action_space.n 172 | input_size = val_envs[0].reset().shape 173 | 174 | 175 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 176 | train_log_dir = 'logs/A2C_LSTM/' + env_id +'/' + current_time 177 | model_dir = "models/A2C_LSTM/" + env_id + '/' + current_time 178 | 179 | 180 | model = ActorCritic_LSTM(NatureCNN, 181 | input_size=input_size, 182 | action_size=action_size, 183 | cell_size=256, 184 | lr=1e-3, 185 | lr_final=1e-4, 186 | decay_steps=50e6//(num_envs*nsteps), 187 | grad_clip=0.5, 188 | optim=torch.optim.RMSprop, 189 | device='cuda') 190 | 191 | 192 | a2c_trainer = A2CLSTM_Trainer(envs=envs, 193 | model=model, 194 | model_dir=model_dir, 195 | log_dir=train_log_dir, 196 | val_envs=val_envs, 197 | train_mode='nstep', 198 | return_type='GAE', 199 | total_steps=50e6, 200 | nsteps=nsteps, 201 | validate_freq=1e6, 202 | save_freq=0, 203 | render_freq=0, 204 | 
num_val_episodes=25, 205 | log_scalars=False) 206 | print(env_id) 207 | 208 | a2c_trainer.train() 209 | 210 | del model 211 | 212 | if __name__ == "__main__": 213 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 214 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1'] 215 | #env_id_list = ['SuperMarioBros-1-1-v0'] 216 | for env_id in env_id_list: 217 | main(env_id) -------------------------------------------------------------------------------- /rlib/utils/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | from PIL import Image 4 | from collections import deque 5 | import torch 6 | 7 | # Code was inspired from or modified from OpenAI baselines https://github.com/openai/baselines/tree/master/baselines/common 8 | 9 | 10 | def AtariValidate(env): 11 | env = FireResetEnv(env) 12 | env = NoopResetEnv(env, max_op=3000) 13 | env = StackEnv(env) 14 | return env 15 | 16 | class RescaleEnv(gym.Wrapper): 17 | def __init__(self, env, size): 18 | gym.Wrapper.__init__(self, env) 19 | self.size = size 20 | 21 | def preprocess(self, frame): 22 | frame = np.array(Image.fromarray(frame).resize([self.size,self.size])) 23 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 24 | return frame[:,:,np.newaxis] 25 | 26 | def step(self, action): 27 | obs, reward, done, info = self.env.step(action) 28 | return self.preprocess(obs), reward, done, info 29 | 30 | def reset(self, **kwargs): 31 | obs = self.env.reset(**kwargs) 32 | return self.preprocess(obs) 33 | 34 | 35 | class AtariRescale42x42(gym.Wrapper): 36 | def __init__(self, env): 37 | gym.Wrapper.__init__(self, env) 38 | 39 | def preprocess(self,frame): 40 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 41 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 42 | frame = np.array(Image.fromarray(frame).resize([42,42])).astype(dtype=np.uint8) 43 | return frame[:,:,np.newaxis] 44 | 45 | def step(self, action): 46 | obs, reward, done, info = self.env.step(action) 47 | return self.preprocess(obs), reward, done, info 48 | 49 | def reset(self, **kwargs): 50 | obs = self.env.reset(**kwargs) 51 | return self.preprocess(obs) 52 | 53 | class AtariRescaleEnv(gym.Wrapper): 54 | def __init__(self, env): 55 | gym.Wrapper.__init__(self, env) 56 | 57 | def preprocess(self,frame): 58 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 59 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 60 | return frame[:,:,np.newaxis] 61 | 62 | def step(self, action): 63 | obs, reward, done, info = self.env.step(action) 64 | return self.preprocess(obs), reward, done, info 65 | 66 | def reset(self, **kwargs): 67 | obs = self.env.reset(**kwargs) 68 | return self.preprocess(obs) 69 | 70 | class AtariRescaleColour(gym.Wrapper): 71 | def __init__(self, env): 72 | gym.Wrapper.__init__(self, env) 73 | 74 | def preprocess(self,frame): 75 | frame = np.array(Image.fromarray(frame).resize([84,110]))[110-84:,0:84,:] 76 | return frame 77 | 78 | def step(self, action): 79 | obs, reward, done, info = self.env.step(action) 80 | return self.preprocess(obs), reward, done, info 81 | 82 | def reset(self, **kwargs): 83 | obs = self.env.reset(**kwargs) 84 | return self.preprocess(obs) 85 | 86 | 87 | class DummyEnv(gym.Wrapper): 88 | def __init__(self, env): 89 | gym.Wrapper.__init__(self, 
env) 90 | def step(self, action): 91 | return self.env.step(action) 92 | def reset(self, **kwargs): 93 | return self.env.reset(**kwargs) 94 | 95 | class NoopResetEnv(gym.Wrapper): 96 | def __init__(self, env, max_op=7): 97 | gym.Wrapper.__init__(self, env) 98 | self.max_op = max_op 99 | 100 | def reset(self, **kwargs): 101 | obs = self.env.reset(**kwargs) 102 | noops = np.random.randint(0, self.max_op) 103 | for i in range(noops): 104 | obs, reward, done, info = self.env.step(0) 105 | return obs 106 | 107 | def step(self, action): 108 | return self.env.step(action) 109 | 110 | class ClipRewardEnv(gym.Wrapper): 111 | def __init__(self, env): 112 | gym.Wrapper.__init__(self, env) 113 | 114 | def step(self, action): 115 | obs, reward, done, info = self.env.step(action) 116 | reward = np.clip(reward, -1, 1) 117 | return obs, reward, done, info 118 | 119 | def reset(self, **kwargs): 120 | return self.env.reset(**kwargs) 121 | 122 | class NoRewardEnv(gym.Wrapper): 123 | def __init__(self, env): 124 | gym.Wrapper.__init__(self, env) 125 | 126 | def step(self, action): 127 | obs, reward, done, info = self.env.step(action) 128 | return obs, 0, done, info 129 | 130 | def reset(self, **kwargs): 131 | return self.env.reset(**kwargs) 132 | 133 | class FireResetEnv(gym.Wrapper): 134 | def __init__(self, env): 135 | """Take action on reset for environments that are fixed until firing.""" 136 | gym.Wrapper.__init__(self, env) 137 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 138 | assert len(env.unwrapped.get_action_meanings()) >= 3 139 | 140 | def reset(self, **kwargs): 141 | self.env.reset(**kwargs) 142 | obs, _, done, _ = self.env.step(1) 143 | if done: 144 | self.env.reset(**kwargs) 145 | obs, _, done, _ = self.env.step(2) 146 | if done: 147 | self.env.reset(**kwargs) 148 | return obs 149 | 150 | def step(self, ac): 151 | return self.env.step(ac) 152 | 153 | class EpisodicLifeEnv(gym.Wrapper): 154 | def __init__(self, env): 155 | gym.Wrapper.__init__(self,env) 156 | self.lives = 0 157 | self.end_of_episode = True 158 | 159 | 160 | def step(self, action): 161 | obs, reward, done, info = self.env.step(action) 162 | self.end_of_episode = done 163 | lives = self.env.unwrapped.ale.lives() 164 | if lives < self.lives: 165 | done = True 166 | self.lives = lives 167 | return obs, reward, done, info 168 | 169 | def reset(self, **kwargs): 170 | if self.end_of_episode: 171 | obs = self.env.reset(**kwargs) 172 | else: 173 | obs, _, _, _ = self.env.step(0) 174 | return obs 175 | 176 | class TimeLimitEnv(gym.Wrapper): 177 | def __init__(self, env, time_limit): 178 | gym.Wrapper.__init__(self, env) 179 | self._time_limit=time_limit 180 | self._step = 0 181 | 182 | def step(self, action): 183 | obs, reward, done, info = self.env.step(action) 184 | self._step += 1 185 | if self._step > self._time_limit: 186 | done = True 187 | return obs, reward, done, info 188 | 189 | def reset(self, **kwargs): 190 | self._step = 0 191 | return self.env.reset(**kwargs) 192 | 193 | 194 | 195 | class StackEnv(gym.Wrapper): 196 | def __init__(self, env, k=4): 197 | gym.Wrapper.__init__(self, env) 198 | #self._stacked_frames = np.array(np.zeros([84,84,k])) 199 | self._stacked_frames = deque([], maxlen=k) 200 | self.k = k 201 | 202 | def step(self, action): 203 | obs, reward, done, info = self.env.step(action) 204 | obs = self.stack_frames(obs) 205 | return obs, reward, done, info 206 | 207 | def reset(self, **kwargs): 208 | obs = self.env.reset(**kwargs) 209 | return self.stack_frames(obs, True) 210 | 211 | 212 | def 
stack_frames(self,frame,reset=False): 213 | if reset: 214 | for i in range(self.k): 215 | self._stacked_frames.append(frame) 216 | else: 217 | self._stacked_frames.append(frame) 218 | return np.concatenate(self._stacked_frames,axis=2) 219 | 220 | 221 | class AutoResetEnv(gym.Wrapper): 222 | def __init__(self, env): 223 | gym.Wrapper.__init__(self, env) 224 | 225 | def step(self, action): 226 | obs, reward, done, info = self.env.step(action) 227 | if done: 228 | obs = self.env.reset() 229 | return obs, reward, done, info 230 | 231 | class ChannelsFirstEnv(gym.Wrapper): 232 | def __init__(self, env): 233 | gym.Wrapper.__init__(self, env) 234 | 235 | def step(self, action): 236 | obs, reward, done, info = self.env.step(action) 237 | return obs.transpose(2, 0, 1), reward, done, info 238 | 239 | def reset(self, **kwargs): 240 | obs = self.env.reset(**kwargs) 241 | return obs.transpose(2, 0, 1) 242 | 243 | class GreyScaleEnv(gym.Wrapper): 244 | def __init__(self, env): 245 | gym.Wrapper.__init__(self, env) 246 | 247 | def preprocess(self,frame): 248 | frame = np.dot(frame[...,:3], np.array([0.299, 0.587, 0.114])).astype(dtype=np.uint8) 249 | return frame[:,:,None] 250 | 251 | def step(self, action): 252 | obs, reward, done, info = self.env.step(action) 253 | return self.preprocess(obs), reward, done, info 254 | 255 | def reset(self, **kwargs): 256 | obs = self.env.reset(**kwargs) 257 | return self.preprocess(obs) 258 | 259 | class ToTorchEnv(gym.Wrapper): 260 | def __init__(self, env, device='cuda:0'): 261 | gym.Wrapper.__init__(self, env) 262 | self.device = device 263 | 264 | def step(self, action:torch.Tensor): 265 | obs, reward, done, info = self.env.step(action.cpu().numpy()) 266 | obs = torch.from_numpy(obs).float().to(self.device) 267 | reward = torch.tensor(reward, device=self.device, dtype=torch.float32) 268 | done = torch.tensor(done, device=self.device) 269 | return obs, reward, done, info 270 | 271 | def reset(self, **kwargs): 272 | obs = self.env.reset(**kwargs) 273 | return torch.from_numpy(obs).float().to(self.device) 274 | 275 | def apple_pickgame(env, k=1, grey_scale=False, auto_reset=False, max_steps=1000, channels_first=True): 276 | if auto_reset: 277 | env = AutoResetEnv(env) 278 | if max_steps is not None: 279 | env = TimeLimitEnv(env, time_limit=max_steps) 280 | if grey_scale: 281 | env = GreyScaleEnv(env) 282 | if k > 1: 283 | env = StackEnv(env, k) 284 | if channels_first: 285 | env = ChannelsFirstEnv(env) 286 | return env 287 | 288 | 289 | def AtariEnv(env, k=4, rescale=84, episodic=True, reset=True, clip_reward=True, Noop=True, time_limit=None, channels_first=True, auto_reset=False): 290 | ''' Wrapper function for Determinsitic Atari env 291 | assert 'Deterministic' in env.spec.id 292 | ''' 293 | if reset: 294 | env = FireResetEnv(env) 295 | 296 | if Noop: 297 | if 'NoFrameskip' in env.spec.id : 298 | max_op = 30 299 | else: 300 | max_op = 7 301 | env = NoopResetEnv(env,max_op) 302 | 303 | if clip_reward: 304 | env = ClipRewardEnv(env) 305 | 306 | if episodic: 307 | env = EpisodicLifeEnv(env) 308 | 309 | if rescale == 42: 310 | env = AtariRescale42x42(env) 311 | elif rescale == 84: 312 | env = AtariRescaleEnv(env) 313 | else: 314 | raise ValueError('84 or 42 are valid rescale sizes') 315 | 316 | if k > 1: 317 | env = StackEnv(env,k) 318 | 319 | if time_limit is not None: 320 | env = TimeLimitEnv(env, time_limit) 321 | 322 | if auto_reset: 323 | env = AutoResetEnv(env) 324 | 325 | if channels_first: 326 | env = ChannelsFirstEnv(env) 327 | 328 | return env 
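The wrapper factories above are easiest to follow end-to-end with a short usage sketch. The snippet below is illustrative only and not part of the library: it assumes the pre-0.26 gym step API (obs, reward, done, info) used throughout this repo, locally installed Atari ROMs, and an example env id; the printed shape is what the default AtariEnv settings should produce.

# Illustrative sketch (not part of rlib): composing the AtariEnv factory defined above.
import gym
from rlib.utils.wrappers import AtariEnv

def demo(env_id='PongDeterministic-v4'):
    base = gym.make(env_id)
    # Same FIRE-on-reset check the training scripts in this repo perform.
    fire = base.unwrapped.get_action_meanings()[1] == 'FIRE'

    env = AtariEnv(base, k=4, rescale=84, episodic=True, reset=fire,
                   clip_reward=True, channels_first=True)

    obs = env.reset()
    print(obs.shape, obs.dtype)    # (4, 84, 84) uint8: grey-scale, 84x84 rescale, 4-frame stack, channels first

    obs, reward, done, info = env.step(env.action_space.sample())
    assert -1.0 <= reward <= 1.0   # ClipRewardEnv clips rewards to [-1, 1]
    env.close()

if __name__ == '__main__':
    demo()

channels_first=True matches the [channels, height, width] input_shape convention expected by the encoders in rlib/networks/networks.py.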
-------------------------------------------------------------------------------- /rlib/DDQN/SyncDQN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import gym 5 | import threading 6 | import time, datetime 7 | from collections import OrderedDict 8 | 9 | from rlib.networks.networks import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.VecEnv import* 12 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.utils import one_hot, fold_batch, unfold_batch, totorch, tonumpy, totorch_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | 17 | main_lock = threading.Lock() 18 | 19 | def save_hyperparameters(filename, **kwargs): 20 | handle = open(filename, "w") 21 | for key, value in kwargs.items(): 22 | handle.write("{} = {}\n" .format(key, value)) 23 | handle.close() 24 | 25 | 26 | 27 | class DQN(torch.nn.Module): 28 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=50e6, grad_clip=0.5, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 29 | super(DQN, self).__init__() 30 | self.lr = lr 31 | self.lr_final = lr_final 32 | self.decay_steps = decay_steps 33 | self.grad_clip = grad_clip 34 | self.action_size = action_size 35 | self.device = device 36 | 37 | self.model = model(input_shape, **model_args).to(self.device) 38 | self.Q = torch.nn.Linear(self.model.dense_size, action_size).to(self.device) 39 | 40 | self.optimiser = optim(self.parameters(), lr, **optim_args) 41 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 42 | 43 | def loss(self, Qsa, R, action_onehot): 44 | Qvalue = torch.sum(Qsa * action_onehot, dim=1) 45 | loss = torch.mean(torch.square(R - Qvalue)) 46 | return loss 47 | 48 | def backprop(self, state:np.ndarray, R:np.ndarray, action:np.ndarray): 49 | state, R, action = totorch_many(state, R, action, device=self.device) 50 | action_onehot = F.one_hot(action.long(), num_classes=self.action_size) 51 | Qsa = self.forward(state) 52 | loss = self.loss(Qsa, R, action_onehot) 53 | loss.backward() 54 | if self.grad_clip is not None: 55 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 56 | self.optimiser.step() 57 | self.optimiser.zero_grad() 58 | self.scheduler.step() 59 | return loss.detach().cpu().numpy() 60 | 61 | def forward(self, state): 62 | Qsa = self.Q(self.model(state)) 63 | return Qsa 64 | 65 | def evaluate(self, state): 66 | with torch.no_grad(): 67 | Qsa = self.forward(totorch(state, self.device)) 68 | return Qsa.cpu().numpy() 69 | 70 | 71 | 72 | class SyncDDQN(SyncMultiEnvTrainer): 73 | def __init__(self, envs, model, target_model, val_envs, action_size, log_dir='logs/SyncDDQN/', model_dir='models/SyncDDQN/', 74 | train_mode='nstep', return_type='nstep', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 75 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=10000, num_val_episodes=50, log_scalars=True, 76 | epsilon_start=1, epsilon_final=0.01, epsilon_steps = 1e6, epsilon_test=0.01): 77 | 78 | 79 | super().__init__(envs=envs, model=model, val_envs=val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, return_type=return_type, total_steps=total_steps, 80 | nsteps=nsteps, gamma=gamma, lambda_=lambda_, validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, 81 | update_target_freq=update_target_freq, num_val_episodes=num_val_episodes, 
log_scalars=log_scalars) 82 | 83 | self.target_model = self.TargetQ = target_model 84 | self.Q = self.model # more readable alias 85 | self.epsilon = np.array([epsilon_start], dtype=np.float64) 86 | self.epsilon_final = epsilon_final 87 | self.epsilon_steps = epsilon_steps 88 | self.schedule = self.linear_schedule(self.epsilon , epsilon_final, epsilon_steps//self.num_envs) 89 | self.epsilon_test = np.array(epsilon_test, dtype=np.float64) 90 | 91 | self.action_size = action_size 92 | 93 | hyper_paras = {'learning_rate':self.model.lr, 'learning_rate_final':self.model.lr_final, 'lr_decay_steps':self.model.decay_steps , 'grad_clip':self.model.grad_clip, 94 | 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'return type':self.return_type, 'total_steps':self.total_steps, 'gamma':gamma, 'lambda':lambda_, 95 | 'epsilon_start':self.epsilon, 'epsilon_final':self.epsilon_final, 'epsilon_steps':self.epsilon_steps, 'update_freq':update_target_freq} 96 | 97 | hyper_paras = OrderedDict(hyper_paras) 98 | 99 | if self.log_scalars: 100 | filename = log_dir + '/hyperparameters.txt' 101 | self.save_hyperparameters(filename, **hyper_paras) 102 | 103 | 104 | class linear_schedule(object): 105 | def __init__(self, epsilon, epsilon_final, num_steps=1000000): 106 | self._counter = 0 107 | self._epsilon = epsilon 108 | self._epsilon_final = epsilon_final 109 | self._step = (epsilon - epsilon_final) / num_steps 110 | self._num_steps = num_steps 111 | 112 | def step(self,): 113 | if self._counter < self._num_steps : 114 | self._epsilon -= self._step 115 | self._counter += 1 116 | else: 117 | self._epsilon[:] = self._epsilon_final 118 | 119 | def get_epsilon(self,): 120 | return self._epsilon 121 | 122 | def get_action(self, state): 123 | if np.random.uniform() < self.epsilon_test: 124 | action = np.random.randint(self.action_size) 125 | else: 126 | action = np.argmax(self.model.evaluate(state)) 127 | return action 128 | 129 | def update_target(self): 130 | self.target_model.load_state_dict(self.model.state_dict()) 131 | 132 | 133 | def local_attr(self, attr): 134 | attr['update_target_freq'] = self.target_freq 135 | return attr 136 | 137 | def rollout(self): 138 | rollout = [] 139 | for t in range(self.nsteps): 140 | Qsa = self.Q.evaluate(self.states) 141 | actions = np.argmax(Qsa, axis=1) 142 | random = np.random.uniform(size=(self.num_envs)) 143 | random_actions = np.random.randint(self.action_size, size=(self.num_envs)) 144 | actions = np.where(random < self.epsilon, random_actions, actions) 145 | next_states, rewards, dones, infos = self.env.step(actions) 146 | rollout.append((self.states, actions, rewards, dones, infos)) 147 | self.states = next_states 148 | self.schedule.step() 149 | 150 | states, actions, rewards, dones, infos = zip(*rollout) 151 | states, actions, rewards, dones = np.stack(states), np.stack(actions), np.stack(rewards), np.stack(dones) 152 | TargetQsa = unfold_batch(self.TargetQ.evaluate(fold_batch(states)), self.num_steps, self.num_envs) # Q(s,a; theta-1) 153 | values = np.sum(TargetQsa * one_hot(actions, self.action_size), axis=-1) # Q(s, argmax_a Q(s,a; theta); theta-1) 154 | 155 | last_actions = np.argmax(self.Q.evaluate(next_states), axis=1) 156 | last_TargetQsa = self.TargetQ.evaluate(next_states) # Q(s,a; theta-1) 157 | last_values = np.sum(last_TargetQsa * one_hot(last_actions, self.action_size), axis=-1) # Q(s, argmax_a Q(s,a; theta); theta-1) 158 | return states, actions, rewards, dones, values, last_values 159 | 160 | 161 | def stackFireReset(env): 162 | return 
StackEnv(FireResetEnv(env)) 163 | 164 | 165 | def main(env_id): 166 | num_envs = 32 167 | nsteps = 128 168 | 169 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 170 | train_log_dir = 'logs/SyncDDQN/' + env_id + '/n-step/RMSprop/' + current_time 171 | model_dir = "models/SyncDDQN/" + env_id + '/' + current_time 172 | 173 | env = gym.make(env_id) 174 | 175 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 176 | if any(env_id in s for s in classic_list): 177 | print('Classic Control') 178 | val_envs = [gym.make(env_id) for i in range(16)] 179 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 180 | 181 | elif 'ApplePicker' in env_id: 182 | print('ApplePicker') 183 | make_args = {'num_objects':100, 'default_reward':-0.01} 184 | val_envs = [apple_pickgame(gym.make(env_id, **make_args), max_steps=5000, auto_reset=True, k=1) for i in range(15)] 185 | envs = BatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=1) 186 | print(val_envs[0]) 187 | print(envs.envs[0]) 188 | 189 | else: 190 | print('Atari') 191 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 192 | reset = True 193 | print('fire on reset') 194 | else: 195 | reset = False 196 | print('only stack frames') 197 | 198 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(15)] 199 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 200 | 201 | action_size = val_envs[0].action_space.n 202 | input_size = val_envs[0].reset().shape 203 | 204 | env.close() 205 | print('action space', action_size) 206 | 207 | dqn_args = dict(model=NatureCNN, 208 | input_shape=input_size, 209 | action_size=action_size, 210 | lr=1e-3, 211 | lr_final=1e-6, 212 | grad_clip=0.5, 213 | decay_steps=50e6//(num_envs*nsteps), 214 | optim=torch.optim.RMSprop, 215 | device='cuda') 216 | 217 | Q = DQN(**dqn_args) 218 | TargetQ = DQN(**dqn_args) 219 | 220 | DDQN = SyncDDQN(envs=envs, 221 | model=Q, 222 | target_model=TargetQ, 223 | model_dir=model_dir, 224 | log_dir=train_log_dir, 225 | val_envs=val_envs, 226 | action_size=action_size, 227 | train_mode='nstep', 228 | return_type='lambda', 229 | total_steps=50e6, 230 | nsteps=nsteps, 231 | gamma=0.99, 232 | lambda_=0.95, 233 | save_freq=0, 234 | render_freq=0, 235 | validate_freq=1e5, 236 | num_val_episodes=15, 237 | update_target_freq=10000, 238 | epsilon_start=1, 239 | epsilon_final=0.01, 240 | epsilon_steps=2e6, 241 | epsilon_test=0.01, 242 | log_scalars=False) 243 | 244 | DDQN.update_target() 245 | DDQN.train() 246 | 247 | if __name__ == "__main__": 248 | import apple_picker 249 | env_id_list = [ 'SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4','MontezumaRevengeDeterministic-v4', ] 250 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 251 | #env_id_list = ['MountainCar-v0', 'CartPole-v1', 'Acrobot-v1', ] 252 | env_id_list = ['ApplePicker-v0'] 253 | for env_id in env_id_list: 254 | main(env_id) 255 | -------------------------------------------------------------------------------- /rlib/PPO/PPO.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import time, datetime 5 | import gym 6 | import copy 7 | import matplotlib.pyplot as plt 8 | 9 | from rlib.networks.networks import * 10 | from rlib.utils.VecEnv import* 11 | from rlib.utils.wrappers import* 12 | from 
rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | class PPO(torch.nn.Module): 17 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, value_coeff=1.0, entropy_coeff=0.01, policy_clip=0.1, 18 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 19 | super(PPO, self).__init__() 20 | self.lr = lr 21 | self.lr_final = lr_final 22 | self.action_size = action_size 23 | self.value_coeff = value_coeff 24 | self.entropy_coeff = entropy_coeff 25 | self.decay_steps = decay_steps 26 | self.grad_clip = grad_clip 27 | self.policy_clip = policy_clip 28 | self.device = device 29 | 30 | self.model = model(input_shape, **model_args).to(self.device) 31 | dense_size = self.model.dense_size 32 | self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device) 33 | self.V = torch.nn.Linear(dense_size, 1).to(self.device) 34 | 35 | if build_optimiser: 36 | self.optimiser = optim(self.parameters(), lr, **optim_args) 37 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 38 | 39 | 40 | def forward(self, state): 41 | state_enc = self.model(state) 42 | policy = self.policy(state_enc) 43 | value = self.V(state_enc).view(-1) 44 | return policy, value 45 | 46 | def evaluate(self, state): 47 | with torch.no_grad(): 48 | policy, value = self.forward(totorch(state, self.device)) 49 | return tonumpy(policy), tonumpy(value) 50 | 51 | 52 | def loss(self, policy, R, V, Adv, action_onehot, old_policy): 53 | value_loss = 0.5 * torch.mean(torch.square(R - V)) 54 | 55 | policy_actions = torch.sum(policy * action_onehot, dim=1) 56 | old_policy_actions = torch.sum(old_policy * action_onehot, dim=1) 57 | ratio = policy_actions / old_policy_actions 58 | policy_loss_unclipped = ratio * -Adv 59 | policy_loss_clipped = torch.clip_(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv 60 | policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped)) 61 | entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1)) 62 | 63 | loss = policy_loss + self.value_coeff * value_loss - self.entropy_coeff * entropy 64 | return loss 65 | 66 | def backprop(self, state, R, Adv, action, old_policy): 67 | state, action, R, Adv, old_policy = totorch_many(state, action, R, Adv, old_policy, device=self.device) 68 | action_onehot = F.one_hot(action.long(), self.action_size) 69 | policy, value = self.forward(state) 70 | loss = self.loss(policy, R, value, Adv, action_onehot, old_policy) 71 | 72 | loss.backward() 73 | if self.grad_clip is not None: 74 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 75 | self.optimiser.step() 76 | self.optimiser.zero_grad() 77 | self.scheduler.step() 78 | return loss.detach().cpu().numpy() 79 | 80 | 81 | class PPOTrainer(SyncMultiEnvTrainer): 82 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 83 | num_epochs=4, num_minibatches=4, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 84 | 85 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma, 
lambda_=lambda_, 86 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 87 | 88 | self.num_epochs = num_epochs 89 | self.num_minibatches = num_minibatches 90 | 91 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 92 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 93 | 'entropy_coefficient':self.model.entropy_coeff, 'value_coefficient':self.model.value_coeff, 'gamma':self.gamma, 'lambda':self.lambda_} 94 | 95 | if log_scalars: 96 | filename = log_dir + '/hyperparameters.txt' 97 | self.save_hyperparameters(filename, **hyper_paras) 98 | 99 | 100 | 101 | def _train_nstep(self): 102 | batch_size = self.num_envs * self.nsteps 103 | num_updates = self.total_steps // batch_size 104 | s = 0 105 | mini_batch_size = self.nsteps//self.num_minibatches 106 | start = time.time() 107 | # main loop 108 | for t in range(1,num_updates+1): 109 | #rollout_start = time.time() 110 | states, actions, rewards, values, last_values, old_policies, dones = self.rollout() 111 | #print('rollout time', time.time()-rollout_start) 112 | Adv = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 113 | R = Adv + values 114 | l = 0 115 | 116 | #backprop_time = time.time() 117 | idxs = np.arange(len(states)) 118 | for epoch in range(self.num_epochs): 119 | np.random.shuffle(idxs) 120 | for batch in range(0, len(states), mini_batch_size): 121 | batch_idxs = idxs[batch: batch + mini_batch_size] 122 | # stack all states, actions and Rs across all workers into a single batch 123 | mb_states, mb_actions, mb_R, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], actions[batch_idxs], 124 | R[batch_idxs], Adv[batch_idxs], 125 | old_policies[batch_idxs]) 126 | 127 | l += self.model.backprop(mb_states.copy(), mb_R.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy()) 128 | 129 | #print('backprop time', time.time()-backprop_time) 130 | l /= self.num_epochs 131 | 132 | 133 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 134 | render = True 135 | else: 136 | render = False 137 | 138 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 139 | #val_time = time.time() 140 | self.validation_summary(t,l,start,render) 141 | #print('validation time', time.time()-val_time) 142 | start = time.time() 143 | 144 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 145 | s += 1 146 | self.save(s) 147 | print('saved model') 148 | 149 | 150 | def get_action(self, states): 151 | policies, values = self.model.evaluate(states) 152 | actions = fastsample(policies) 153 | return actions 154 | 155 | def rollout(self): 156 | rollout = [] 157 | for t in range(self.nsteps): 158 | policies, values = self.model.evaluate(self.states) 159 | actions = fastsample(policies) 160 | next_states, rewards, dones, infos = self.env.step(actions) 161 | rollout.append((self.states, actions, rewards, values, policies, dones)) 162 | self.states = next_states 163 | 164 | states, actions, rewards, values, policies, dones = stack_many(*zip(*rollout)) 165 | policy, last_values, = self.model.evaluate(next_states) 166 | return states, actions, rewards, values, last_values, policies, dones 167 | 168 | 169 | def main(env_id): 170 | num_envs = 32 171 | nsteps = 128 172 | 173 | classic_list = 
['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 174 | if any(env_id in s for s in classic_list): 175 | print('Classic Control') 176 | val_envs = [gym.make(env_id) for i in range(10)] 177 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 178 | 179 | elif 'ApplePicker' in env_id: 180 | print('ApplePicker') 181 | make_args = {'num_objects':300, 'default_reward':0} 182 | if 'Deterministic' in env_id: 183 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 184 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 185 | for i in range(len(envs)): 186 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 187 | val_envs.reset() 188 | else: 189 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 190 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 191 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 192 | print(val_envs.envs[0]) 193 | print(envs.envs[0]) 194 | 195 | else: 196 | print('Atari') 197 | env = gym.make(env_id) 198 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 199 | reset = True 200 | print('fire on reset') 201 | else: 202 | reset = False 203 | print('only stack frames') 204 | env.close() 205 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 206 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 207 | 208 | 209 | action_size = val_envs.envs[0].action_space.n 210 | input_size = val_envs.envs[0].reset().shape 211 | 212 | 213 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 214 | train_log_dir = 'logs/PPO/' + env_id + '/Adam/' + current_time 215 | model_dir = "models/PPO/" + env_id + '/' + current_time 216 | 217 | 218 | model = PPO(UniverseCNN, 219 | input_shape=input_size, 220 | action_size=action_size, 221 | lr=1e-4, 222 | lr_final=1e-5, 223 | decay_steps=200e6//(num_envs*nsteps), 224 | grad_clip=0.5, 225 | value_coeff=1.0, 226 | entropy_coeff=0.01, 227 | device='cuda' 228 | ).cuda() 229 | 230 | 231 | ppo = PPOTrainer(envs=envs, 232 | model=model, 233 | model_dir=model_dir, 234 | log_dir=train_log_dir, 235 | val_envs=val_envs, 236 | train_mode='nstep', 237 | total_steps=200e6, 238 | nsteps=nsteps, 239 | num_epochs=2, 240 | num_minibatches=4, 241 | validate_freq=1e5, 242 | save_freq=0, 243 | render_freq=0, 244 | num_val_episodes=32, 245 | log_scalars=False) 246 | ppo.train() 247 | 248 | 249 | if __name__ == "__main__": 250 | import apple_picker 251 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4']# 'SpaceInvadersDeterministic-v4',]# , ] 252 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 253 | env_id_list = ['ApplePickerDeterministic-v0'] 254 | for env_id in env_id_list: 255 | main(env_id) 256 | -------------------------------------------------------------------------------- /rlib/networks/networks.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from typing import List 4 | 5 | 6 | def deconv2d_outsize(height, width, kernel_size, stride, padding, dilation=[1,1], output_padding=[0,0]): 7 | h_out 
= (height-1) * stride[0] - 2*padding[0] + dilation[0] * (kernel_size[0]-1) + output_padding[0] + 1 8 | w_out = (width-1) * stride[1] - 2*padding[1] + dilation[1] * (kernel_size[1]-1) + output_padding[1] + 1 9 | return h_out, w_out 10 | 11 | def conv2d_outsize(height, width, kernel_size, stride, padding): 12 | h_out = ((height + 2*padding[0] - (kernel_size[0] -1) -1) // stride[0]) + 1 13 | w_out = ((width + 2*padding[1] - (kernel_size[1] -1) -1) // stride[1]) + 1 14 | return h_out, w_out 15 | 16 | class DeconvUniverse(torch.nn.Module): 17 | def __init__(self, output_size, deconv1_size=64, deconv2_size=64, deconv3_size=64, deconv4_size=64, padding=[0,0], conv_activation=torch.nn.ELU, weight_initialiser=torch.nn.init.xavier_uniform_, trainable=True): 18 | # output_size [channels, height, width] size of output after convolutions 19 | super(DeconvUniverse, self).__init__() 20 | self.output_size = output_size 21 | self.dense_size = np.prod(output_size) 22 | 23 | self.h1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(output_size[0], deconv1_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=1), conv_activation()) 24 | self.h2 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv1_size, deconv2_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=0), conv_activation()) 25 | self.h3 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv2_size, deconv3_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=0), conv_activation()) 26 | self.h4 = torch.nn.Sequential(torch.nn.ConvTranspose2d(deconv3_size, deconv4_size, kernel_size=[3,3], stride=[2,2], padding=padding, output_padding=1), conv_activation()) 27 | c, h, w = self._conv_outsize() 28 | 29 | print('final outsize', (c, h, w)) 30 | self.initialiser = weight_initialiser 31 | self.init_weights() 32 | 33 | def init_weights(self): 34 | self.apply(self._init_weights) 35 | 36 | def _init_weights(self, module): 37 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 38 | self.initialiser(module.weight) 39 | 40 | def _conv_outsize(self): 41 | _, h, w = self.output_size 42 | h, w = deconv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding, self.h1[0].dilation, self.h1[0].output_padding) 43 | h, w = deconv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding, self.h2[0].dilation, self.h2[0].output_padding) 44 | h, w = deconv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding, self.h3[0].dilation, self.h3[0].output_padding) 45 | h, w = deconv2d_outsize(h, w, self.h4[0].kernel_size, self.h4[0].stride, self.h4[0].padding, self.h4[0].dilation, self.h4[0].output_padding) 46 | return self.h4[0].out_channels, h, w 47 | 48 | def forward(self, x): 49 | x = x.view(-1, *self.output_size) 50 | x = self.h1(x) 51 | x = self.h2(x) 52 | x = self.h3(x) 53 | x = self.h4(x) 54 | return x 55 | 56 | class UniverseCNN(torch.nn.Module): 57 | def __init__(self, input_shape, conv1_size=64, conv2_size=64, conv3_size=64, conv4_size=64, padding=[0,0], dense_size=256, conv_activation=torch.nn.ELU, dense_activation=torch.nn.ReLU, weight_initialiser=torch.nn.init.xavier_uniform_, scale=True, trainable=True): 58 | # input_shape [channels, height, width] 59 | super(UniverseCNN, self).__init__() 60 | self.scale = scale 61 | self.input_shape = input_shape 62 | 63 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_shape[0], conv1_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 64 | self.h2 = 
torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 65 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 66 | self.h4 = torch.nn.Sequential(torch.nn.Conv2d(conv3_size, conv4_size, kernel_size=[3,3], stride=[2,2], padding=padding), conv_activation()) 67 | self.flatten = torch.nn.Flatten() 68 | c, h, w = self._conv_outsize() 69 | self.dense = torch.nn.Sequential(torch.nn.Linear(h*w*c, dense_size), dense_activation()) 70 | #self.dense_size = h*w*c 71 | self.dense_size = dense_size 72 | print('final outsize', (c, h, w)) 73 | self.initialiser = weight_initialiser 74 | self.init_weights() 75 | 76 | def init_weights(self): 77 | self.apply(self._init_weights) 78 | 79 | def _init_weights(self, module): 80 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 81 | self.initialiser(module.weight) 82 | 83 | def _conv_outsize(self): 84 | _, h, w = self.input_shape 85 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 86 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 87 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 88 | h, w = conv2d_outsize(h, w, self.h4[0].kernel_size, self.h4[0].stride, self.h4[0].padding) 89 | return self.h4[0].out_channels, h, w 90 | 91 | def forward(self, x): 92 | x = x/255 if self.scale else x 93 | x = self.h1(x) 94 | x = self.h2(x) 95 | x = self.h3(x) 96 | x = self.h4(x) 97 | x = self.flatten(x) 98 | x = self.dense(x) 99 | return x 100 | 101 | class NatureCNN(torch.nn.Module): 102 | def __init__(self, input_shape, conv1_size=32, conv2_size=64, conv3_size=64, dense_size=512, padding=[0,0], conv_activation=torch.nn.ReLU, dense_activation=torch.nn.ReLU, weight_initialiser=torch.nn.init.xavier_uniform_, scale=True, trainable=True): 103 | # input_shape [channels, height, width] 104 | super(NatureCNN, self).__init__() 105 | self.scale = scale 106 | self.dense_size = dense_size 107 | self.input_shape = input_shape 108 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_shape[0], conv1_size, kernel_size=[8,8], stride=[4,4], padding=padding), conv_activation()) 109 | self.h2 = torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[4,4], stride=[2,2], padding=padding), conv_activation()) 110 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[1,1], padding=padding), conv_activation()) 111 | self.flatten = torch.nn.Flatten() 112 | c, h, w = self._conv_outsize() 113 | self.dense = torch.nn.Sequential(torch.nn.Linear(h*w*c, dense_size), dense_activation()) 114 | self.initialiser = weight_initialiser 115 | self.init_weights() 116 | 117 | def init_weights(self): 118 | self.apply(self._init_weights) 119 | 120 | def _init_weights(self, module): 121 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 122 | self.initialiser(module.weight) 123 | 124 | def _conv_outsize(self): 125 | _, h, w = self.input_shape 126 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 127 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 128 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 129 | return self.h3[0].out_channels, h, w 130 | 131 | def forward(self, x): 132 | x = x/255 if self.scale else x 133 | x = 
self.h1(x) 134 | x = self.h2(x) 135 | x = self.h3(x) 136 | x = self.flatten(x) 137 | x = self.dense(x) 138 | return x 139 | 140 | 141 | class MaskedRNN(torch.nn.Module): 142 | ''' dynamic masked *hidden state* RNN for sequences that reset part way through an observation 143 | e.g. A2C 144 | args : 145 | cell - cell of type tf.nn.rnn_cell 146 | X - tensor of rank [time, batch, hidden] if time major == True (Default); or [batch, time, hidden] if time major == False 147 | hidden_init - tensor or placeholder of intial cell hidden state 148 | mask - tensor or placeholder of length time, for hidden state masking e.g. [True, False, False] will mask first hidden state 149 | parallel_iterations - number of parallel iterations to run RNN over 150 | swap_memory - bool flag to swap memory between GPU and CPU 151 | time_major - bool flag to determine order of indices of input tensor 152 | scope - tf variable_scope of dynamic RNN loop 153 | trainable - bool flag whether to perform backpropagation to RNN cell during while loop 154 | ''' 155 | def __init__(self, cell, time_major=True): 156 | super(MaskedRNN, self).__init__() 157 | self.cell = cell 158 | self.time_major = time_major 159 | 160 | def forward(self, x, hidden=None, mask=None): 161 | '''args: 162 | x - tensor of rank [time, batch, hidden] if time major == True (Default); or [batch, time, hidden] if time major == False 163 | mask - tensor of rank [time], for hidden state masking e.g. [True, False, False] will mask first hidden state 164 | returns: 165 | ''' 166 | 167 | if not self.time_major: 168 | x = x.transpose(1, 0, 2) 169 | 170 | if mask is None: 171 | mask = torch.zeros(x.shape[0], x.shape[1]).to(x.device) 172 | 173 | outputs = [] 174 | for t in range(x.shape[0]): 175 | output, hidden = self.cell(x[t], hidden, mask[t]) 176 | outputs.append(output) 177 | 178 | outputs = torch.stack(outputs, dim=0) 179 | outputs = outputs if self.time_major else outputs.transpose(1, 0, 2) 180 | return outputs, hidden 181 | 182 | def lstmgate(cell_size, input_size, trainable=True): 183 | input_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[input_size, cell_size], requires_grad=trainable))) 184 | hidden_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size, cell_size], requires_grad=trainable))) 185 | bias = torch.nn.Parameter(torch.zeros(size=[cell_size], requires_grad=trainable)) 186 | return input_weight, hidden_weight, bias 187 | 188 | def gemmlstmgate(cell_size, input_size, trainable=True): 189 | input_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size*4, input_size], requires_grad=trainable))) 190 | hidden_weight = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.zeros(size=[cell_size*4, cell_size], requires_grad=trainable))) 191 | bias_input = torch.nn.Parameter(torch.zeros(size=[cell_size*4], requires_grad=trainable)) 192 | bias_hidden = torch.nn.Parameter(torch.zeros(size=[cell_size*4], requires_grad=trainable)) 193 | return input_weight, hidden_weight, bias_input, bias_hidden 194 | 195 | class MaskedLSTMCell(torch.nn.Module): 196 | def __init__(self, cell_size, input_size=None, trainable=True): 197 | super(MaskedLSTMCell, self).__init__() 198 | self._cell_size = cell_size 199 | input_size = input_size if input_size is not None else cell_size # input_size == cell_size by default 200 | self._input_size = input_size 201 | self.Wi, self.Wh, self.bi, self.bh = gemmlstmgate(cell_size, input_size, trainable) # batch gemm 202 | 203 | def init_hidden(self, 
batch_size, dtype, device): 204 | cell = torch.zeros(1, batch_size, self._cell_size, dtype=dtype, device=device) 205 | hidden = torch.zeros(1, batch_size, self._cell_size, dtype=dtype, device=device) 206 | return (cell, hidden) 207 | 208 | def forward(self, x, state=None, done=None): 209 | if state is None: 210 | prev_cell, prev_hidden = self.init_hidden(x.shape[0], input.dtype, input.device) 211 | else: 212 | prev_cell, prev_hidden = state 213 | if done is not None: 214 | prev_cell *= (1-done).view(-1, 1) 215 | prev_hidden *= (1-done).view(-1, 1) 216 | 217 | gates = (torch.matmul(x, self.Wi.t()) + self.bi + torch.matmul(prev_hidden[0], self.Wh.t())) + self.bh 218 | i, f, c, o = gates.chunk(4, 1) 219 | i = torch.sigmoid(i) 220 | f = torch.sigmoid(f) 221 | c = torch.tanh(c) 222 | o = torch.sigmoid(o) 223 | 224 | cell = prev_cell * f + i * c 225 | hidden = o * torch.tanh(cell) 226 | return hidden, (cell, hidden) 227 | 228 | 229 | class MaskedLSTMBlock(torch.nn.Module): 230 | def __init__(self, input_size, hidden_size, time_major=True): 231 | super(MaskedLSTMBlock, self).__init__() 232 | self.time_major = time_major 233 | batch_first = not time_major 234 | self.hidden_size = hidden_size 235 | self.lstm = torch.nn.LSTM(input_size=input_size, hidden_size=hidden_size, batch_first=batch_first) 236 | 237 | def forward(self, x, hidden, done): 238 | if not self.time_major: 239 | x = x.transpose(1, 0, 2) 240 | 241 | if done is not None: 242 | mask = (1-done) 243 | else: 244 | mask = torch.ones(x.shape[0], x.shape[1]).to(x.device) 245 | 246 | mask_zeros = ((mask[1:]==0).any(dim=-1).nonzero()+1).view(-1).cpu().numpy().tolist() 247 | mask_zeros = [0] + mask_zeros + [mask.shape[0]+1] 248 | outputs = [] 249 | for i in range(len(mask_zeros)-1): 250 | start = mask_zeros[i] 251 | end = mask_zeros[i+1] 252 | #print('start, end', (start, end)) 253 | hidden = (mask[start].view(-1,1)*hidden[0], mask[start].view(-1,1)*hidden[1]) 254 | out, hidden = self.lstm(x[start:end], hidden) 255 | outputs.append(out) 256 | 257 | outputs = torch.cat(outputs, dim=0) 258 | outputs = outputs if self.time_major else outputs.transpose(1, 0, 2) 259 | return outputs, hidden -------------------------------------------------------------------------------- /rlib/Curiosity/CuriosityA2C.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import scipy 5 | import gym 6 | import os, time 7 | import threading 8 | from rlib.A2C.A2C import ActorCritic 9 | from rlib.networks.networks import* 10 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 11 | from rlib.utils.VecEnv import* 12 | from rlib.utils.wrappers import* 13 | from rlib.utils.utils import fastsample, fold_batch, one_hot, RunningMeanStd, normalise, stack_many, totorch_many 14 | from rlib.utils.schedulers import polynomial_sheduler 15 | 16 | class RollingObs(object): 17 | def __init__(self, mean=0): 18 | self.rolling = RunningMeanStd() 19 | 20 | def update(self, x): 21 | if len(x.shape) == 4: # assume image obs 22 | return self.rolling.update(np.mean(x, axis=1, keepdims=True)) #[time*batch,height,width,stack] -> [height, width] 23 | else: 24 | return self.rolling.update(x) #[time*batch,*shape] -> [*shape] 25 | 26 | 27 | class ICM(torch.nn.Module): 28 | def __init__(self, model_head, input_size, action_size, forward_coeff, device='cuda', **model_head_args): 29 | super(ICM, self).__init__() 30 | self.action_size = action_size 31 | self.forward_coeff = 
forward_coeff 32 | self.phi = model_head(input_size, **model_head_args) 33 | dense_size = self.phi.dense_size 34 | self.device = device 35 | 36 | # forward model 37 | self.forward1 = torch.nn.Sequential(torch.nn.Linear(dense_size + action_size, dense_size), torch.nn.ReLU()).to(device) 38 | self.pred_state = torch.nn.Linear(dense_size, dense_size).to(device) 39 | 40 | # inverse model 41 | self.inverse1 = torch.nn.Sequential(torch.nn.Linear(dense_size*2, dense_size), torch.nn.ReLU()).to(device) 42 | self.pred_action = torch.nn.Sequential(torch.nn.Linear(dense_size*2, dense_size), torch.nn.ReLU()).to(device) 43 | 44 | 45 | def intr_reward(self, phi, action_onehot, phi_next): 46 | f1 = self.forward1(torch.cat([phi, action_onehot], dim=1)) 47 | phi_pred = self.pred_state(f1) 48 | intr_reward = 0.5 * torch.sum(torch.square(phi_pred - phi_next), dim=1) # l2 distance metric ‖ˆφ(st+1)−φ(st+1)‖22 49 | return intr_reward 50 | 51 | def predict_action(self, phi1, phi2): 52 | phi_cat = torch.cat([phi1, phi2], dim=1) 53 | pred_action = self.pred_action(phi_cat) 54 | return pred_action 55 | 56 | def get_intr_reward(self, state, action, next_state): 57 | state, next_state, action = totorch_many(state, next_state, action, device=self.device) 58 | action = action.long() 59 | phi1 = self.phi(state) 60 | phi2 = self.phi(next_state) 61 | action_onehot = F.one_hot(action, self.action_size) 62 | with torch.no_grad(): 63 | intr_reward = self.intr_reward(phi1, action_onehot, phi2) 64 | return intr_reward.cpu().numpy() 65 | 66 | def get_pred_action(self, state, next_state): 67 | state, next_state = totorch_many(state, next_state, device=self.device) 68 | return self.pred_action(state, next_state) 69 | 70 | def loss(self, state, action, next_state): 71 | action = action.long() 72 | phi1 = self.phi(state) 73 | phi2 = self.phi(next_state) 74 | action_onehot = F.one_hot(action, self.action_size) 75 | 76 | forward_loss = torch.mean(self.intr_reward(phi1, action_onehot, phi2)) 77 | inverse_loss = F.cross_entropy(self.predict_action(phi1, phi2), action) 78 | return (1-self.forward_coeff) * inverse_loss + self.forward_coeff * forward_loss 79 | 80 | 81 | class Curiosity(torch.nn.Module): 82 | def __init__(self, policy_model, ICM_model, input_size, action_size, forward_coeff, policy_importance, reward_scale, entropy_coeff, value_coeff=0.5, 83 | lr=1e-3, lr_final=1e-3, decay_steps=6e5, grad_clip=0.5, policy_args={}, ICM_args={}, device='cuda'): 84 | super(Curiosity, self).__init__() 85 | self.reward_scale, self.forward_coeff, self.policy_importance, self.entropy_coeff = reward_scale, forward_coeff, policy_importance, entropy_coeff 86 | self.lr, self.lr_final, self.decay_steps = lr, lr_final, decay_steps 87 | self.grad_clip = grad_clip 88 | self.action_size = action_size 89 | self.device = device 90 | 91 | try: 92 | iterator = iter(input_size) 93 | except TypeError: 94 | input_size = (input_size,) 95 | 96 | self.ICM = ICM(ICM_model, input_size, action_size, forward_coeff, device=device, **ICM_args) 97 | self.AC = ActorCritic(policy_model, input_size, action_size, entropy_coeff, value_coeff, lr, lr_final, decay_steps, grad_clip, build_optimiser=False, device=device, **policy_args) 98 | 99 | self.optimiser = torch.optim.RMSprop(self.parameters(), lr=lr) 100 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 101 | 102 | def forward(self, state): 103 | return self.AC.forward(state) 104 | 105 | def evaluate(self, state): 106 | return self.AC.evaluate(state) 107 | 108 | def intrinsic_reward(self, 
state, action, next_state): 109 | return self.ICM.get_intr_reward(state, action, next_state) 110 | 111 | def backprop(self, state, next_state, R, Adv, action, state_mean, state_std): 112 | state, next_state, R, Adv, action, state_mean, state_std = totorch_many(state, next_state, R, Adv, 113 | action, state_mean, state_std, device=self.device) 114 | policy, value = self.AC.forward(state) 115 | action_onehot = F.one_hot(action.long(), self.action_size) 116 | policy_loss = self.AC.loss(policy, R, value, action_onehot) 117 | ICM_loss = self.ICM.loss((state-state_mean)/state_std, action, (next_state-state_mean)/state_std) 118 | loss = self.policy_importance * policy_loss + self.reward_scale * ICM_loss 119 | loss.backward() 120 | if self.grad_clip is not None: 121 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 122 | self.optimiser.step() 123 | self.optimiser.zero_grad() 124 | self.scheduler.step() 125 | return loss.detach().cpu().numpy() 126 | 127 | 128 | 129 | 130 | class Curiosity_Trainer(SyncMultiEnvTrainer): 131 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', total_steps=1000000, nsteps=5, validate_freq=1000000, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 132 | super().__init__(envs, model, val_envs, train_mode=train_mode, return_type='nstep', log_dir=log_dir, total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, 133 | save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 134 | 135 | self.state_obs = RollingObs() 136 | self.state_mean = None 137 | self.state_std = None 138 | 139 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 140 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 141 | 'entropy_coefficient':0.01, 'value_coefficient':0.5, 'reward_scale':model.reward_scale, 142 | 'forward_model_scale':model.forward_coeff, 'policy_importance':model.policy_importance, 143 | 'gamma':self.gamma, 'lambda':self.lambda_} 144 | 145 | if self.log_scalars: 146 | filename = log_dir + '/hyperparameters.txt' 147 | self.save_hyperparameters(filename, **hyper_paras) 148 | 149 | self.lambda_ = 0.95 150 | 151 | def init_state_obs(self, num_steps): 152 | states = 0 153 | for i in range(num_steps): 154 | rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs) 155 | next_states, rewards, dones, infos = self.env.step(rand_actions) 156 | states += next_states 157 | return states / num_steps 158 | 159 | 160 | def _train_nstep(self): 161 | num_updates = self.total_steps // (self.num_envs * self.nsteps) 162 | s = 0 163 | self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(10000//self.num_envs)) 164 | self.states = self.env.reset() 165 | print(self.state_mean.shape, self.state_std.shape) 166 | start = time.time() 167 | # main loop 168 | batch_size = self.num_envs * self.nsteps 169 | for t in range(1,num_updates+1): 170 | states, next_states, actions, rewards, dones, values = self.rollout() 171 | _, last_values = self.model.evaluate(next_states[-1]) 172 | 173 | R = self.nstep_return(rewards, last_values, dones) 174 | Adv = R - values 175 | #delta = rewards + self.gamma * values[:-1] - values[1:] 176 | #Adv = self.multistep_target(delta, values[-1], dones, gamma=self.gamma*self.lambda_) 177 | 178 | # stack all states, next_states, 
actions and Rs across all workers into a single batch 179 | states, next_states, actions, R, Adv = fold_batch(states), fold_batch(next_states), fold_batch(actions), fold_batch(R), fold_batch(Adv) 180 | mean, std = self.state_mean, self.state_std 181 | 182 | l = self.model.backprop(states, next_states, R, Adv, actions, mean, std) 183 | 184 | # self.state_mean, self.state_std = self.state_obs.update(states) 185 | 186 | if self.render_freq > 0 and t % (self.validate_freq * self.render_freq) == 0: 187 | render = True 188 | else: 189 | render = False 190 | 191 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 192 | self.validation_summary(t,l,start,render) 193 | start = time.time() 194 | 195 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 196 | s += 1 197 | self.saver.save(self.sess, str(self.model_dir + self.current_time + '/' + str(s) + ".ckpt") ) 198 | print('saved model') 199 | 200 | 201 | 202 | def get_action(self, state): 203 | policy, value = self.model.evaluate(state) 204 | action = int(np.random.choice(policy.shape[1], p=policy[0])) 205 | return action 206 | 207 | 208 | def rollout(self,): 209 | rollout = [] 210 | for t in range(self.nsteps): 211 | start = time.time() 212 | policies, values = self.model.evaluate(self.states) 213 | actions = fastsample(policies) 214 | next_states, extr_rewards, dones, infos = self.env.step(actions) 215 | 216 | mean, std = self.state_mean[None], self.state_std[None] 217 | intr_rewards = self.model.intrinsic_reward((self.states-mean)/std, actions, (next_states-mean)/std) 218 | rewards = extr_rewards + intr_rewards 219 | rollout.append((self.states, next_states, actions, rewards, values, dones)) 220 | self.states = next_states 221 | 222 | states, next_states, actions, rewards, values, dones = stack_many(*zip(*rollout)) 223 | return states, next_states, actions, rewards, dones, values 224 | 225 | 226 | def main(env_id): 227 | num_envs = 32 228 | nsteps = 20 229 | 230 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 231 | if any(env_id in s for s in classic_list): 232 | print('Classic Control') 233 | val_envs = [gym.make(env_id) for i in range(1)] 234 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 235 | 236 | else: 237 | env = gym.make(env_id) 238 | print('Atari') 239 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 240 | reset = True 241 | print('fire on reset') 242 | else: 243 | reset = False 244 | print('only stack frames') 245 | 246 | val_envs = [AtariEnv(gym.make(env_id), k=4, rescale=84, episodic=False, reset=reset, clip_reward=False) for i in range(1)] 247 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, rescale=84, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 248 | 249 | 250 | env.close() 251 | action_size = val_envs[0].action_space.n 252 | input_size = val_envs[0].reset().shape 253 | 254 | 255 | 256 | train_log_dir = 'logs/Curiosity/' + env_id + '/hyper_unclipped/' 257 | 258 | model = Curiosity(NatureCNN, 259 | NatureCNN, 260 | input_size=input_size, 261 | action_size=action_size, 262 | forward_coeff=0.2, 263 | policy_importance=1, 264 | reward_scale=1.0, 265 | entropy_coeff=0.01, 266 | #intr_coeff=1, 267 | lr=1e-3, 268 | lr_final=0, 269 | decay_steps=50e6//(num_envs*nsteps), 270 | grad_clip=0.5, 271 | policy_args={}, 272 | ICM_args={'scale':False}).cuda() 273 | 274 | 275 | 276 | curiosity = Curiosity_Trainer(envs=envs, 277 | model=model, 278 | val_envs=val_envs, 279 | train_mode='nstep', 280 | 
total_steps=5e6, 281 | nsteps=nsteps, 282 | validate_freq=1e5, 283 | save_freq=0, 284 | render_freq=0, 285 | num_val_episodes=1, 286 | log_dir=train_log_dir, 287 | log_scalars=False) 288 | print(env_id) 289 | curiosity.train() 290 | 291 | del curiosity 292 | 293 | if __name__ == "__main__": 294 | env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 295 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 296 | #for i in range(5): 297 | for env_id in env_id_list: 298 | main(env_id) 299 | -------------------------------------------------------------------------------- /rlib/VIN/VIN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from torch.utils.tensorboard import SummaryWriter 5 | import datetime 6 | import threading 7 | import time 8 | 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.utils import fold_batch, stack_many, one_hot, totorch, totorch_many, tonumpy 12 | 13 | class VINCNN(torch.nn.Module): 14 | def __init__(self, input_size, action_size, k=10, lr=1e-3, device='cuda'): 15 | super(VINCNN, self).__init__() 16 | channels, height, width = input_size 17 | self.action_size = action_size 18 | self.conv_enc = torch.nn.Conv2d(channels, 150, kernel_size=[3,3], stride=[1,1], padding=1).to(device) # φ(s) 19 | self.R_bar = torch.nn.Conv2d(150, 1, kernel_size=[1,1], stride=[1,1], padding=0, bias=False).to(device) 20 | self.Q_bar = torch.nn.Conv2d(1, action_size, kernel_size=[3,3], stride=[1,1], padding=1, bias=False).to(device) 21 | self.w = torch.nn.Parameter(torch.zeros(action_size, 1, 3, 3), requires_grad=True).to(device) 22 | self.Q = torch.nn.Linear(action_size, action_size).to(device) 23 | self.k = k # nsteps to plan with VIN 24 | self.optim = torch.optim.RMSprop(params=self.parameters(), lr=lr) 25 | self.device = device 26 | 27 | def forward(self, img, x, y): 28 | hidden = self.conv_enc(img) 29 | R_bar = self.R_bar(hidden) 30 | Q_bar = self.Q_bar(R_bar) 31 | V_bar, _ = torch.max(Q_bar, dim=1, keepdim=True) 32 | batch_size = img.shape[0] 33 | psi = self._plan_ahead(R_bar, V_bar)[torch.arange(batch_size), :, x.long(), y.long()].view(batch_size, self.action_size) # ψ(s) 34 | Qsa = self.Q(psi) 35 | return Qsa 36 | 37 | 38 | def backprop(self, states, locs, R, actions): 39 | x, y = zip(*locs) 40 | Qsa = self.forward(totorch(states, self.device), torch.tensor(x).to(self.device), torch.tensor(y)).to(self.device) 41 | actions_onehot = totorch(one_hot(actions, self.action_size), self.device) 42 | Qvalue = torch.sum(Qsa * actions_onehot, axis=1) 43 | loss = torch.mean(torch.square(totorch(R).float().cuda() - Qvalue)) 44 | 45 | loss.backward() 46 | self.optim.step() 47 | self.optim.zero_grad() 48 | return loss.detach().cpu().numpy() 49 | 50 | 51 | def value_iteration(self, r, V): 52 | return F.conv2d( 53 | # Stack reward with most recent value 54 | torch.cat([r, V], 1), 55 | # Convolve r->q weights to r, and v->q weights for v. 
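# --- illustrative aside (toy shapes, hypothetical kernels), kept in comments ---
# stacking [r, V] on the channel axis and convolving with the stacked
# [reward->Q, value->Q] kernels performs one Bellman backup over every grid cell,
#     Q_bar(a, s) = (w_r * r)(s) + (w_v * V)(s),   V_bar(s) = max_a Q_bar(a, s)
# a standalone equivalent, assuming 4 actions on an 8x8 grid:
#     r, V = torch.randn(1, 1, 8, 8), torch.zeros(1, 1, 8, 8)
#     w_r, w_v = torch.randn(4, 1, 3, 3), torch.randn(4, 1, 3, 3)
#     for _ in range(10):                                    # k sweeps
#         Q = F.conv2d(torch.cat([r, V], 1), torch.cat([w_r, w_v], 1), padding=1)
#         V, _ = torch.max(Q, dim=1, keepdim=True)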
These represent transition probabilities 56 | torch.cat([self.Q_bar.weight, self.w], 1), 57 | stride=1, 58 | padding=1) 59 | 60 | def _plan_ahead(self, r, V): 61 | for i in range(self.k): 62 | Q = self.value_iteration(r, V) 63 | V, _ = torch.max(Q, dim=1, keepdim=True) 64 | 65 | Q = self.value_iteration(r, V) 66 | return Q 67 | 68 | 69 | 70 | class VINTrainer(object): 71 | def __init__(self, model, envs, val_envs, epsilon=0.1, epsilon_final=0.1, epsilon_steps=1000000, epsilon_test=0.1, 72 | return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=50000000, nsteps=20, gamma=0.99, lambda_=0.95, 73 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=0, num_val_episodes=50, log_scalars=True): 74 | self.model = model 75 | self.env = envs 76 | self.num_envs = len(envs) 77 | self.val_envs = val_envs 78 | self.total_steps = total_steps 79 | self.action_size = self.model.action_size 80 | self.epsilon = epsilon 81 | self.epsilon_test = epsilon_test 82 | self.states = self.env.reset() 83 | self.loc = self.get_locs() 84 | print('locs', self.loc) 85 | 86 | self.total_steps = int(total_steps) 87 | self.nsteps = nsteps 88 | self.return_type = return_type 89 | self.gamma = gamma 90 | self.lambda_ = lambda_ 91 | 92 | self.validate_freq = int(validate_freq) 93 | self.num_val_episodes = num_val_episodes 94 | 95 | self.save_freq = int(save_freq) 96 | self.render_freq = render_freq 97 | self.target_freq = int(update_target_freq) 98 | self.t=1 99 | 100 | self.validate_rewards = [] 101 | self.lock = threading.Lock() 102 | self.scheduler = self.linear_schedule(epsilon, epsilon_final, epsilon_steps) 103 | 104 | self.log_scalars = log_scalars 105 | self.log_dir = log_dir 106 | 107 | if log_scalars: 108 | # Tensorboard Variables 109 | train_log_dir = self.log_dir + '/train' 110 | self.train_writer = SummaryWriter(train_log_dir) 111 | 112 | def nstep_return(self, rewards, last_values, dones, gamma=0.99, clip=False): 113 | if clip: 114 | rewards = np.clip(rewards, -1, 1) 115 | 116 | T = len(rewards) 117 | 118 | # Calculate R for advantage A = R - V 119 | R = np.zeros_like(rewards) 120 | R[-1] = last_values * (1-dones[-1]) 121 | 122 | for i in reversed(range(T-1)): 123 | # restart score if done as BatchEnv automatically resets after end of episode 124 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 125 | 126 | return R 127 | 128 | def lambda_return(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 129 | if clip: 130 | rewards = np.clip(rewards, -1, 1) 131 | T = len(rewards) 132 | # Calculate eligibility trace R^lambda 133 | R = np.zeros_like(rewards) 134 | R[-1] = last_values * (1-dones[-1]) 135 | for t in reversed(range(T-1)): 136 | # restart score if done as BatchEnv automatically resets after end of episode 137 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 138 | 139 | return R 140 | 141 | def GAE(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 142 | if clip: 143 | rewards = np.clip(rewards, -1, 1) 144 | # Generalised Advantage Estimation 145 | Adv = np.zeros_like(rewards) 146 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 147 | T = len(rewards) 148 | for t in reversed(range(T-1)): 149 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 150 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 151 | 152 | return Adv 153 | 154 | def get_locs(self): 155 | locs = [] 156 | for env in self.env.envs: 157 | 
locs.append(env.agent_loc) 158 | return locs 159 | 160 | def train(self): 161 | self.train_nstep() 162 | 163 | 164 | def train_nstep(self): 165 | batch_size = self.num_envs * self.nsteps 166 | num_updates = self.total_steps // batch_size 167 | # main loop 168 | start = time.time() 169 | for t in range(self.t,num_updates+1): 170 | states, locs, actions, rewards, dones, infos, values, last_values = self.rollout() 171 | if self.return_type == 'nstep': 172 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 173 | elif self.return_type == 'GAE': 174 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 175 | elif self.return_type == 'lambda': 176 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False) 177 | # stack all states, actions and Rs from all workers into a single batch 178 | states, locs, actions, R = fold_batch(states), fold_batch(locs), fold_batch(actions), fold_batch(R) 179 | #print('locs', locs.shape) 180 | l = self.model.backprop(states, locs, R, actions) 181 | 182 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 183 | self.validation_summary(t,l,start,False) 184 | start = time.time() 185 | 186 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 187 | self.s += 1 188 | self.save(self.s) 189 | print('saved model') 190 | 191 | if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0: # update target network (for value based learning e.g. DQN) 192 | self.update_target() 193 | 194 | self.t +=1 195 | 196 | def eval_state(self, state, loc): 197 | with torch.no_grad(): 198 | x, y = zip(*loc) 199 | x, y = torch.tensor(x).to(self.device), torch.tensor(y).to(self.device) 200 | state_torch = totorch(state, self.device) 201 | Qsa = self.model(state_torch, x, y) 202 | return tonumpy(Qsa) 203 | 204 | def rollout(self): 205 | rollout = [] 206 | for t in range(self.nsteps): 207 | Qsa = self.eval_state(self.states, self.loc) 208 | actions = np.argmax(Qsa, axis=1) 209 | random = np.random.uniform(size=(self.num_envs)) 210 | random_actions = np.random.randint(self.action_size, size=(self.num_envs)) 211 | actions = np.where(random < self.epsilon, random_actions, actions) 212 | next_states, rewards, dones, infos = self.env.step(actions) 213 | values = np.sum(Qsa * one_hot(actions, self.action_size), axis=-1) 214 | rollout.append((self.states, self.loc, actions, rewards, dones, infos, values)) 215 | self.states = next_states 216 | self.epsilon = self.scheduler.step() 217 | self.loc = self.get_locs() 218 | 219 | states, locs, actions, rewards, dones, infos, values = stack_many(*zip(*rollout)) 220 | 221 | last_Qsa = self.eval_state(next_states, self.loc) # Q(s,a|theta) 222 | last_actions = np.argmax(last_Qsa, axis=1) 223 | last_values = np.sum(last_Qsa * one_hot(last_actions, self.action_size), axis=-1) 224 | return states, locs, actions, rewards, dones, infos, values, last_values 225 | 226 | def get_action(self, state, loc): 227 | Qsa = self.eval_state(state, loc) 228 | if np.random.uniform() < self.epsilon_test: 229 | action = np.random.choice(self.action_size) 230 | else: 231 | action = np.argmax(Qsa, axis=1) 232 | return action 233 | 234 | def validation_summary(self,t,loss,start,render): 235 | batch_size = self.num_envs * self.nsteps 236 | tot_steps = t * batch_size 237 | time_taken = time.time() - start 238 | frames_per_update = (self.validate_freq // batch_size) * batch_size 239 | fps = frames_per_update /time_taken 
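# --- illustrative sketch (hypothetical numbers): the annealed epsilon-greedy step ---
# rollout() above takes the per-worker argmax action, swaps it for a random action
# with probability epsilon, and then shrinks epsilon linearly towards epsilon_final
# (see the linear_schedule class further down). A standalone equivalent:
import numpy as np

rng = np.random.default_rng(0)
num_envs, action_size = 4, 6
epsilon, epsilon_final, epsilon_steps = 1.0, 0.1, 1000000
step = (epsilon - epsilon_final) / epsilon_steps

Qsa = rng.normal(size=(num_envs, action_size))                # stand-in Q-values
greedy = np.argmax(Qsa, axis=1)
random_actions = rng.integers(action_size, size=num_envs)
actions = np.where(rng.uniform(size=num_envs) < epsilon, random_actions, greedy)
epsilon = max(epsilon - step, epsilon_final)                  # one annealing step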
240 | num_val_envs = len(self.val_envs) 241 | num_val_eps = [self.num_val_episodes//num_val_envs for i in range(num_val_envs)] 242 | num_val_eps[-1] = num_val_eps[-1] + self.num_val_episodes % self.num_val_episodes//(num_val_envs) 243 | render_array = np.zeros((len(self.val_envs))) 244 | render_array[0] = render 245 | threads = [threading.Thread(daemon=True, target=self.validate, args=(self.val_envs[i], num_val_eps[i], 10000, render_array[i])) for i in range(num_val_envs)] 246 | try: 247 | for thread in threads: 248 | thread.start() 249 | 250 | for thread in threads: 251 | thread.join() 252 | 253 | except KeyboardInterrupt: 254 | for thread in threads: 255 | thread.join() 256 | 257 | 258 | score = np.mean(self.validate_rewards) 259 | self.validate_rewards = [] 260 | print("update %i, validation score %f, total steps %i, loss %f, time taken for %i frames:%fs, fps %f" %(t,score,tot_steps,loss,frames_per_update,time_taken,fps)) 261 | 262 | if self.log_scalars: 263 | self.train_writer.add_scalar('Validation/Score', score) 264 | self.train_writer.add_scalar('Training/Loss', loss) 265 | 266 | 267 | def validate(self,env,num_ep,max_steps,render=False): 268 | episode_scores = [] 269 | for episode in range(num_ep): 270 | state = env.reset() 271 | loc = env.agent_loc 272 | episode_score = [] 273 | for t in range(max_steps): 274 | action = self.get_action(state[np.newaxis], [loc]) 275 | next_state, reward, done, info = env.step(action) 276 | state = next_state 277 | loc = env.agent_loc 278 | 279 | episode_score.append(reward) 280 | 281 | if render: 282 | with self.lock: 283 | env.render() 284 | 285 | if done or t == max_steps -1: 286 | tot_reward = np.sum(episode_score) 287 | with self.lock: 288 | self.validate_rewards.append(tot_reward) 289 | 290 | break 291 | if render: 292 | with self.lock: 293 | env.close() 294 | 295 | class linear_schedule(object): 296 | def __init__(self, epsilon, epsilon_final, num_steps=1000000): 297 | self._counter = 0 298 | self._epsilon = epsilon 299 | self._epsilon_final = epsilon_final 300 | self._step = (epsilon - epsilon_final) / num_steps 301 | self._num_steps = num_steps 302 | 303 | def step(self,): 304 | if self._counter < self._num_steps : 305 | self._epsilon -= self._step 306 | self._counter += 1 307 | else: 308 | self._epsilon = self._epsilon_final 309 | 310 | return self._epsilon 311 | 312 | 313 | 314 | def main(env_id): 315 | num_envs = 32 316 | nsteps = 1 317 | 318 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 319 | 320 | train_log_dir = 'logs/VIN/' + env_id +'/n_step/' + current_time 321 | model_dir = "models/VIN/" + env_id + '/n_step/' + current_time 322 | 323 | if 'ApplePicker' in env_id: 324 | print('ApplePicker') 325 | make_args = {'num_objects':300, 'default_reward':-0.01} 326 | val_envs = [apple_pickgame(gym.make('ApplePicker-v0', **make_args)) for i in range(10)] 327 | envs = DummyBatchEnv(apple_pickgame, 'ApplePicker-v0', num_envs, max_steps=1000, auto_reset=True, make_args=make_args) 328 | print(val_envs[0]) 329 | print(envs.envs[0]) 330 | 331 | else: 332 | print('Atari') 333 | env = gym.make(env_id) 334 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 335 | reset = True 336 | print('fire on reset') 337 | else: 338 | reset = False 339 | print('only stack frames') 340 | env.close() 341 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(5)] 342 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 343 | 
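# --- illustrative smoke test (hypothetical toy shapes, CPU) for the VINCNN defined above ---
# VINCNN takes a (channels, height, width) observation plus the agent's (x, y) grid
# position and returns one Q-value per action after k planning sweeps:
def _vin_smoke_test():
    vin = VINCNN(input_size=(3, 16, 16), action_size=4, k=10, device='cpu')
    img = torch.zeros(2, 3, 16, 16)                    # batch of 2 toy observations
    x = torch.tensor([5, 7])                           # agent row per sample
    y = torch.tensor([3, 12])                          # agent column per sample
    Qsa = vin(img, x, y)                               # -> torch.Size([2, 4])
    return Qsa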
344 | action_size = val_envs[0].action_space.n 345 | input_size = val_envs[0].reset().shape 346 | print('input shape', input_size) 347 | print('action space', action_size) 348 | 349 | 350 | 351 | vin = VINCNN(input_size, 352 | action_size, 353 | k=50, 354 | lr=1e-3).cuda() 355 | 356 | 357 | trainer = VINTrainer(envs=envs, 358 | model=vin, 359 | log_dir=train_log_dir, 360 | val_envs=val_envs, 361 | return_type='nstep', 362 | total_steps=10e6, 363 | nsteps=nsteps, 364 | validate_freq=1e5, 365 | save_freq=0, 366 | render_freq=0, 367 | num_val_episodes=10, 368 | log_scalars=False) 369 | 370 | trainer.train() 371 | 372 | 373 | if __name__ == "__main__": 374 | import apple_picker 375 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4', 'MontezumaRevengeDeterministic-v4', 'PongDeterministic-v4'] 376 | #env_id_list = ['MontezumaRevengeDeterministic-v4'] 377 | env_id_list = ['ApplePicker-v0'] 378 | for env_id in env_id_list: 379 | main(env_id) 380 | -------------------------------------------------------------------------------- /rlib/utils/SyncMultiEnvTrainer.py: -------------------------------------------------------------------------------- 1 | import time, datetime, os 2 | import threading 3 | import numpy as np 4 | import torch 5 | import copy 6 | import json 7 | from typing import Union 8 | from abc import ABC, abstractmethod 9 | from rlib.utils.utils import fold_batch 10 | from rlib.utils.VecEnv import BatchEnv, DummyBatchEnv 11 | import torch 12 | from torch.utils.tensorboard import SummaryWriter 13 | 14 | 15 | 16 | class SyncMultiEnvTrainer(object): 17 | def __init__(self, envs: Union[BatchEnv, DummyBatchEnv], model:torch.nn.Module, val_envs: Union[list, BatchEnv, DummyBatchEnv], train_mode='nstep', return_type='nstep', log_dir='logs/', model_dir='models/', total_steps=50e6, nsteps=5, gamma=0.99, lambda_=0.95, 18 | validate_freq=1e6, save_freq=0, render_freq=0, update_target_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 19 | ''' 20 | A synchronous multiple env training framework for pytorch 21 | 22 | Args: 23 | envs - BatchEnv | DummyBatchEnv: multiple synchronous training environments 24 | model - reinforcement learning model 25 | log_dir, log directory string for location of directory to log scalars log_dir='logs/', model_dir='models/', 26 | val_envs - use your own discretion to choose which validation mode you wan't, recommended BatchEnv or list for Atari and DummyBatchEnv for Classic Control like envs 27 | list: a list of envs for validation, uses threading to run environments asychronously 28 | BatchEnv: uses multiprocessing to run validation envs sychronously in parallel 29 | DummyBatchEnv: allows for sychronous env stepping without the overhead of multiprocessing, good for computationally cheap environments 30 | train_mode - 'nstep' or 'onestep' species whether training is done using multiple step TD learning or single step 31 | return_type - string to determine whether 'nstep', 'lambda' or 'GAE' returns are to be used 32 | total_steps - number of Total training steps across all environements 33 | nsteps - number of steps TD error is caluclated over 34 | validate_freq - number of steps across all environements before performing validating, 0 for no validation 35 | save_freq - number of steps across all environements before saving model, 0 for no saving 36 | render_freq - multiple of validate_freq before rendering (i.e. 
render every X validations), 0 for no rendering 37 | update_target_freq - number of steps across all environements before updating target model, 0 for no updating 38 | num_val_episodes - number of episodes to average over when validating 39 | max_val_steps - maximum number of steps for each validation episode (prevents infinite loops) 40 | log_scalars - boolean flag whether to log tensorboard scalars to log_dir 41 | ''' 42 | self.env = envs 43 | if isinstance(envs, list): 44 | self.validate_func = self.validate_async 45 | else: 46 | self.validate_func = self.validate_sync 47 | if train_mode not in ['nstep', 'onestep']: 48 | raise ValueError('train_mode %s is not a valid argument. Valid arguments are ... %s, %s' %(train_mode,'nstep','onestep')) 49 | assert num_val_episodes >= len(val_envs), 'number of validation epsiodes {} must be greater than or equal to the number of validation envs {}'.format(num_val_episodes, len(val_envs)) 50 | if return_type not in ['nstep', 'lambda', 'GAE']: 51 | raise ValueError('return_type %s is not a valid argument. Valid arguments are ... %s, %s, %s' %(return_type, 'nstep', 'lambda', 'GAE')) 52 | self.train_mode = train_mode 53 | self.num_envs = len(envs) 54 | self.env_id = envs.spec.id 55 | self.val_envs = val_envs 56 | self.validate_rewards = [] 57 | self.model = model 58 | 59 | self.total_steps = int(total_steps) 60 | self.nsteps = nsteps 61 | self.return_type = return_type 62 | self.gamma = gamma 63 | self.lambda_ = lambda_ 64 | 65 | self.validate_freq = int(validate_freq) 66 | self.num_val_episodes = num_val_episodes 67 | self.val_steps = max_val_steps 68 | self.lock = threading.Lock() 69 | 70 | self.save_freq = int(save_freq) 71 | self.render_freq = render_freq 72 | self.target_freq = int(update_target_freq) 73 | self.s = 0 # number of saves made 74 | self.t = 1 # number of updates done 75 | self.log_scalars = log_scalars 76 | self.log_dir = log_dir 77 | self.model_dir = model_dir 78 | 79 | self.states = self.env.reset() 80 | 81 | if log_scalars: 82 | # Tensorboard Variables 83 | self.train_log_dir = self.log_dir + '/train' 84 | self.train_writer = SummaryWriter(self.train_log_dir) 85 | 86 | if not os.path.exists(self.model_dir) and save_freq > 0: 87 | os.makedirs(self.model_dir) 88 | 89 | def __del__(self): 90 | self.env.close() 91 | 92 | 93 | def train(self): 94 | if self.train_mode == 'nstep': 95 | self._train_nstep() 96 | elif self.train_mode == 'onestep': 97 | self._train_onestep() 98 | else: 99 | raise ValueError('%s is not a valid training mode'%(self.train_mode)) 100 | 101 | @abstractmethod 102 | def _train_nstep(self): 103 | ''' 104 | template for multi-step training loop for synchronous training over multiple environments 105 | ''' 106 | start = time.time() 107 | batch_size = self.num_envs * self.nsteps 108 | num_updates = self.total_steps // batch_size 109 | # main loop 110 | for t in range(self.t,num_updates+1): 111 | states, actions, rewards, dones, values, last_values = self.rollout() 112 | if self.return_type == 'nstep': 113 | R = self.nstep_return(rewards, last_values, dones, gamma=self.gamma) 114 | elif self.return_type == 'GAE': 115 | R = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) + values 116 | elif self.return_type == 'lambda': 117 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_, clip=False) 118 | # stack all states, actions and Rs from all workers into a single batch 119 | states, actions, R = fold_batch(states), fold_batch(actions), 
fold_batch(R) 120 | l = self.model.backprop(states, R, actions) 121 | 122 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 123 | render = True 124 | else: 125 | render = False 126 | 127 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 128 | self.validation_summary(t,l,start,render) 129 | start = time.time() 130 | 131 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 132 | self.s += 1 133 | self.save(self.s) 134 | print('saved model') 135 | 136 | if self.target_freq > 0 and t % (self.target_freq // batch_size) == 0: # update target network (for value based learning e.g. DQN) 137 | self.update_target() 138 | 139 | self.t +=1 140 | 141 | 142 | @abstractmethod 143 | def rollout(self): 144 | raise NotImplementedError(self, 'No rollout method found') 145 | 146 | def nstep_return(self, rewards, last_values, dones, gamma=0.99, clip=False): 147 | if clip: 148 | rewards = np.clip(rewards, -1, 1) 149 | 150 | T = len(rewards) 151 | 152 | # Calculate R for advantage A = R - V 153 | R = np.zeros_like(rewards) 154 | R[-1] = last_values * (1-dones[-1]) 155 | 156 | for i in reversed(range(T-1)): 157 | # restart score if done as BatchEnv automatically resets after end of episode 158 | R[i] = rewards[i] + gamma * R[i+1] * (1-dones[i]) 159 | 160 | return R 161 | 162 | def lambda_return(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.8, clip=False): 163 | if clip: 164 | rewards = np.clip(rewards, -1, 1) 165 | T = len(rewards) 166 | # Calculate eligibility trace R^lambda 167 | R = np.zeros_like(rewards) 168 | R[-1] = last_values * (1-dones[-1]) 169 | for t in reversed(range(T-1)): 170 | # restart score if done as BatchEnv automatically resets after end of episode 171 | R[t] = rewards[t] + gamma * (lambda_* R[t+1] + (1.0-lambda_) * values[t+1]) * (1-dones[t]) 172 | 173 | return R 174 | 175 | def GAE(self, rewards, values, last_values, dones, gamma=0.99, lambda_=0.95, clip=False): 176 | if clip: 177 | rewards = np.clip(rewards, -1, 1) 178 | # Generalised Advantage Estimation 179 | Adv = np.zeros_like(rewards) 180 | Adv[-1] = rewards[-1] + gamma * last_values * (1-dones[-1]) - values[-1] 181 | T = len(rewards) 182 | for t in reversed(range(T-1)): 183 | delta = rewards[t] + gamma * values[t+1] * (1-dones[t]) - values[t] 184 | Adv[t] = delta + gamma * lambda_ * Adv[t+1] * (1-dones[t]) 185 | 186 | return Adv 187 | 188 | def validation_summary(self, t, loss, start, render): 189 | batch_size = self.num_envs * self.nsteps 190 | tot_steps = t * batch_size 191 | time_taken = time.time() - start 192 | frames_per_update = (self.validate_freq // batch_size) * batch_size 193 | fps = frames_per_update / time_taken 194 | 195 | score = self.validate_func(render) 196 | print("update %i, validation score %f, total steps %i, loss %f, time taken for %i frames:%fs, fps %f \t\t\t" %(t,score,tot_steps,loss,frames_per_update,time_taken,fps)) 197 | 198 | if self.log_scalars: 199 | self.train_writer.add_scalar('validation/score', score, tot_steps) 200 | self.train_writer.add_scalar('train/loss', loss, tot_steps) 201 | 202 | 203 | def save_model(self, s): 204 | model_loc = f'{self.model_dir}/{s}.pt' 205 | # default saving method is to save session 206 | torch.save(self.model.state_dict(), model_loc) 207 | 208 | def load_model(self, modelname, model_dir="models/"): 209 | filename = model_dir + modelname + '.pt' 210 | if os.path.exists(filename): 211 | self.model.load_state_dict(torch.load(filename)) 212 | print("loaded:", filename) 
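# --- illustrative usage note (hypothetical run names): checkpointing a training run ---
# save(s) writes two artefacts per checkpoint: '<model_dir>/<s>.trainer', a JSON dump
# of the attributes returned by base_attr()/local_attr(), and '<model_dir>/<s>.pt',
# the model state_dict written by save_model(). Restoring later looks roughly like:
#     trainer.save(3)                                   # models/<run>/3.trainer + 3.pt
#     new_trainer = trainer.load(MyTrainer, model, '3', envs, val_envs,
#                                filename='models/<run>/3.trainer')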
213 | else: 214 | print(filename, " does not exist") 215 | 216 | def base_attr(self): 217 | attributes = {'train_mode':self.train_mode, 218 | 'total_steps':self.total_steps, 219 | 'nsteps':self.nsteps, 220 | 'return_type':self.return_type, 221 | 'gamma':self.gamma, 222 | 'lambda_':self.lambda_, 223 | 'validate_freq':self.validate_freq, 224 | 'num_val_episodes':self.num_val_episodes, 225 | 'save_freq':self.save_freq, 226 | 'render_freq':self.render_freq, 227 | 'model_dir':self.model_dir, 228 | 'train_log_dir':self.train_log_dir, 229 | 's':self.s, 230 | 't':self.t} 231 | 232 | return attributes 233 | 234 | def local_attr(self, attr): 235 | # attr[variable] = z 236 | return attr 237 | 238 | def save(self, s): 239 | model_loc = str(self.model_dir + '/' + str(s) + '.trainer') 240 | file = open(model_loc, 'w+') 241 | attributes = self.base_attr() 242 | # add local variables to dict 243 | attributes = self.local_attr(attributes) 244 | json.dump(attributes, file) 245 | # save model 246 | self.save_model(s) 247 | file.close() 248 | 249 | def load(self, Class, model, model_checkpoint, envs, val_envs, filename, log_scalars=True, allow_gpu_growth=True, continue_train=True): 250 | with open(filename, 'r') as file: 251 | attrs = json.loads(file.read()) 252 | s = attrs.pop('s') 253 | t = attrs.pop('t') 254 | time = attrs.pop('current_time') 255 | print(attrs) 256 | trainer = Class(envs=envs, model=model, val_envs=val_envs, log_scalars=log_scalars, gpu_growth=allow_gpu_growth, **attrs) 257 | if continue_train: 258 | trainer.s = s 259 | trainer.t = t 260 | self.load_model(model_checkpoint, trainer.model_dir) 261 | return trainer 262 | 263 | @abstractmethod 264 | def update_target(self): 265 | pass 266 | 267 | @abstractmethod 268 | def _train_onestep(self): 269 | ''' more efficient implementation of train_nstep when nsteps=1 270 | ''' 271 | raise NotImplementedError(self, 'does not have an one-step training implementation') 272 | 273 | def save_hyperparameters(self, filename, **kwargs): 274 | handle = open(filename, "w") 275 | for key, value in kwargs.items(): 276 | handle.write("{} = {}\n" .format(key, value)) 277 | handle.close() 278 | 279 | def validate_async(self, render=False): 280 | num_val_envs = len(self.val_envs) 281 | num_val_eps = [self.num_val_episodes//num_val_envs for i in range(num_val_envs)] 282 | num_val_eps[-1] = num_val_eps[-1] + self.num_val_episodes % self.num_val_episodes//(num_val_envs) 283 | render_array = np.zeros((len(self.val_envs))) 284 | render_array[0] = render 285 | threads = [threading.Thread(daemon=True, target=self._validate_async, args=(self.val_envs[i], num_val_eps[i], self.val_steps, render_array[i])) for i in range(num_val_envs)] 286 | 287 | try: 288 | for thread in threads: 289 | thread.start() 290 | 291 | for thread in threads: 292 | thread.join() 293 | 294 | except KeyboardInterrupt: 295 | for thread in threads: 296 | thread.join() 297 | 298 | 299 | score = np.mean(self.validate_rewards) 300 | self.validate_rewards = [] 301 | return score 302 | 303 | def _validate_async(self, env, num_ep, max_steps, render=False): 304 | 'single env validation' 305 | for episode in range(num_ep): 306 | state = env.reset() 307 | episode_score = [] 308 | for t in range(max_steps): 309 | action = self.get_action(state[np.newaxis]) 310 | next_state, reward, done, info = env.step(action) 311 | state = next_state 312 | #print('state', state, 'action', action, 'reward', reward) 313 | 314 | episode_score.append(reward) 315 | 316 | if render: 317 | with self.lock: 318 | env.render() 319 | 320 
| if done or t == max_steps -1: 321 | tot_reward = np.sum(episode_score) 322 | with self.lock: 323 | self.validate_rewards.append(tot_reward) 324 | 325 | break 326 | if render: 327 | with self.lock: 328 | env.close() 329 | 330 | def validate_sync(self, render=False): 331 | 'batch env validation' 332 | episode_scores = [] 333 | env = self.val_envs 334 | for episode in range(self.num_val_episodes//len(env)): 335 | states = env.reset() 336 | episode_score = [] 337 | for t in range(self.val_steps): 338 | actions = self.get_action(states) 339 | next_states, rewards, dones, infos = env.step(actions) 340 | states = next_states 341 | #print('state', state, 'action', action, 'reward', reward) 342 | 343 | episode_score.append(rewards*(1-dones)) 344 | 345 | if render: 346 | with self.lock: 347 | env.render() 348 | 349 | if dones.sum() == self.num_envs or t == self.val_steps -1: 350 | tot_reward = np.sum(np.stack(episode_score), axis=0) 351 | episode_scores.append(tot_reward) 352 | break 353 | 354 | return np.mean(episode_scores) 355 | 356 | def get_action(self, state): # include small fn in order to reuse validate 357 | raise NotImplementedError('get_action method is required when using the default validation functions, check that this is implemented properly') 358 | 359 | def fold_batch(self, x): 360 | rows, cols = x.shape[0], x.shape[1] 361 | y = x.reshape(rows*cols,*x.shape[2:]) 362 | return y 363 | 364 | 365 | 366 | 367 | 368 | # class Runner(ABC): 369 | # def __init__(self,model,env,num_steps): 370 | # self.model = model 371 | # self.env = env 372 | # self.num_steps = num_steps 373 | # self.states = self.env.reset() 374 | 375 | # @abstractmethod 376 | # def run(self): 377 | # pass -------------------------------------------------------------------------------- /rlib/DAAC/DAAC.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import time 4 | import datetime 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from rlib.networks.networks import * 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 12 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 13 | from rlib.utils.schedulers import polynomial_sheduler 14 | 15 | class ValueModel(torch.nn.Module): 16 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, 17 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 18 | super(ValueModel, self).__init__() 19 | self.lr = lr 20 | self.lr_final = lr_final 21 | self.action_size = action_size 22 | self.decay_steps = decay_steps 23 | self.grad_clip = grad_clip 24 | self.device = device 25 | 26 | self.model = model(input_shape, **model_args).to(self.device) 27 | dense_size = self.model.dense_size 28 | self.V = torch.nn.Linear(dense_size, 1).to(self.device) 29 | 30 | if build_optimiser: 31 | self.optimiser = optim(self.parameters(), lr, **optim_args) 32 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 33 | 34 | 35 | def forward(self, state): 36 | enc_state = self.model(state) 37 | value = self.V(enc_state).view(-1) 38 | return value 39 | 40 | def loss(self, V, R): 41 | value_loss = 0.5 * torch.mean(torch.square(R - V)) 42 | return value_loss 43 | 44 | def backprop(self, state, R): 45 | state, R = totorch_many(state, R, device=self.device) 46 | value = 
self.forward(state) 47 | loss = self.loss(value, R) 48 | 49 | loss.backward() 50 | if self.grad_clip is not None: 51 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 52 | self.optimiser.step() 53 | self.optimiser.zero_grad() 54 | self.scheduler.step() 55 | return loss.detach().cpu().numpy() 56 | 57 | 58 | 59 | class PolicyModel(torch.nn.Module): 60 | # PPO Policy 61 | def __init__(self, model, input_shape, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5, entropy_coeff=0.01, policy_clip=0.1, adv_coeff=0.25, 62 | build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args): 63 | super(PolicyModel, self).__init__() 64 | self.lr = lr 65 | self.lr_final = lr_final 66 | self.action_size = action_size 67 | self.entropy_coeff = entropy_coeff 68 | self.decay_steps = decay_steps 69 | self.grad_clip = grad_clip 70 | self.policy_clip = policy_clip 71 | self.adv_coeff = adv_coeff 72 | self.device = device 73 | 74 | self.model = model(input_shape, **model_args).to(self.device) 75 | dense_size = self.model.dense_size 76 | self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device) 77 | self.Adv = torch.nn.Linear(dense_size, 1).to(self.device) 78 | 79 | if build_optimiser: 80 | self.optimiser = optim(self.parameters(), lr, **optim_args) 81 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 82 | 83 | 84 | def forward(self, state): 85 | enc_state = self.model(state) 86 | policy = self.policy(enc_state) 87 | Adv = self.Adv(enc_state).view(-1) 88 | return policy, Adv 89 | 90 | def loss(self, policy, Adv_hat, Adv, action_onehot, old_policy): 91 | policy_actions = torch.sum(policy * action_onehot, dim=1) 92 | old_policy_actions = torch.sum(old_policy * action_onehot, dim=1) 93 | ratio = policy_actions / old_policy_actions 94 | policy_loss_unclipped = ratio * -Adv 95 | policy_loss_clipped = torch.clip_(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv 96 | policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped)) 97 | entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1)) 98 | 99 | adv_loss = torch.square(Adv_hat - Adv).sum(dim=-1).mean() 100 | 101 | loss = policy_loss - self.entropy_coeff * entropy + self.adv_coeff * adv_loss 102 | return loss 103 | 104 | def backprop(self, state, Adv, action, old_policy): 105 | state, action, Adv, old_policy = totorch_many(state, action, Adv, old_policy, device=self.device) 106 | policy, Adv_hat = self.forward(state) 107 | action_onehot = F.one_hot(action.long(), self.action_size) 108 | loss = self.loss(policy, Adv_hat, Adv, action_onehot, old_policy) 109 | 110 | loss.backward() 111 | if self.grad_clip is not None: 112 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 113 | self.optimiser.step() 114 | self.optimiser.zero_grad() 115 | self.scheduler.step() 116 | return loss.detach().cpu().numpy() 117 | 118 | 119 | 120 | class DAAC(torch.nn.Module): 121 | # Decoupling Value and Policy for Generalization in Reinforcement Learning 122 | # https://arxiv.org/pdf/2102.10330.pdf 123 | def __init__(self, policy_model, value_model, input_shape, action_size, entropy_coeff=0.01, adv_coeff=0.25, policy_clip=0.1, lr=5e-4, lr_final=1e-5, decay_steps=6e5, grad_clip=0.2, device='cuda', 124 | policy_optim=torch.optim.Adam, policy_optim_args={}, policy_model_args={}, 125 | value_optim=torch.optim.Adam, value_optim_args={}, value_model_args={}): 126 | super(DAAC, 
self).__init__() 127 | self.lr = lr 128 | self.lr_final = lr_final 129 | self.decay_steps = decay_steps 130 | self.entropy_coeff = entropy_coeff 131 | self.adv_coeff = adv_coeff 132 | self.grad_clip = grad_clip 133 | self.policy_clip = policy_clip 134 | 135 | self.value = ValueModel(value_model, input_shape, action_size, lr=lr, lr_final=lr_final, decay_steps=decay_steps, grad_clip=grad_clip, optim=value_optim, optim_args=value_optim_args, device=device, **value_model_args) 136 | 137 | self.policy = PolicyModel(policy_model, input_shape, action_size, lr=lr, lr_final=lr_final, decay_steps=decay_steps, grad_clip=grad_clip, 138 | entropy_coeff=entropy_coeff, adv_coeff=adv_coeff, policy_clip=policy_clip, 139 | optim=policy_optim, optim_args=policy_optim_args, device=device, **policy_model_args) 140 | 141 | def get_policy(self, state:np.ndarray): 142 | with torch.no_grad(): 143 | policy, Adv = self.policy.forward(totorch(state, self.policy.device)) 144 | return tonumpy(policy), tonumpy(Adv) 145 | 146 | def get_value(self, state:np.ndarray): 147 | with torch.no_grad(): 148 | value = self.value.forward(totorch(state, self.value.device)) 149 | return tonumpy(value) 150 | 151 | def evaluate(self, state:np.ndarray): 152 | with torch.no_grad(): 153 | policy, _ = self.policy.forward(totorch(state, self.policy.device)) 154 | value = self.value.forward(totorch(state, self.value.device)) 155 | return tonumpy(policy), tonumpy(value) 156 | 157 | def backprop(self, state, R, Adv, action, old_policy): 158 | policy_loss = self.policy.backprop(state, Adv, action, old_policy) 159 | value_loss = self.value.backprop(state, R) 160 | return policy_loss + value_loss 161 | 162 | 163 | class DAACTrainer(SyncMultiEnvTrainer): 164 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma=0.99, lambda_=0.95, 165 | policy_epochs=1, value_epochs=9, num_minibatches=8, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 166 | 167 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma, lambda_=lambda_, 168 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 169 | 170 | self.policy_epochs = policy_epochs 171 | self.value_epochs = value_epochs 172 | self.num_minibatches = num_minibatches 173 | 174 | hyper_paras = {'learning_rate':model.lr, 'learning_rate_final':model.lr_final, 'lr_decay_steps':model.decay_steps, 175 | 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 176 | 'entropy_coefficient':self.model.entropy_coeff, 'advantage_coefficient':self.model.adv_coeff, 'value_coefficient':1.0, 'policy_clip':self.model.policy_clip, 177 | 'num_minibatches':self.num_minibatches, 'policy_epochs':self.policy_epochs, 'value_epochs':self.value_epochs, 'gamma':self.gamma, 'lambda':self.lambda_ 178 | } 179 | 180 | if log_scalars: 181 | filename = log_dir + '/hyperparameters.txt' 182 | self.save_hyperparameters(filename, **hyper_paras) 183 | 184 | 185 | 186 | def _train_nstep(self): 187 | batch_size = self.num_envs * self.nsteps 188 | num_updates = self.total_steps // batch_size 189 | s = 0 190 | mini_batch_size = self.nsteps//self.num_minibatches 191 | start = time.time() 192 | # main loop 193 | for t in 
range(1,num_updates+1): 194 | #rollout_start = time.time() 195 | states, actions, rewards, values, last_values, old_policies, dones = self.rollout() 196 | #print('rollout time', time.time()-rollout_start) 197 | Adv = self.GAE(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 198 | R = self.lambda_return(rewards, values, last_values, dones, gamma=self.gamma, lambda_=self.lambda_) 199 | l = 0 200 | 201 | 202 | idxs = np.arange(len(states)) 203 | value_loss = 0 204 | for epoch in range(self.value_epochs): 205 | np.random.shuffle(idxs) 206 | for batch in range(0, len(states), mini_batch_size): 207 | batch_idxs = idxs[batch: batch + mini_batch_size] 208 | # stack all states, actions and Rs across all workers into a single batch 209 | mb_states, mb_Rs, = fold_many(states[batch_idxs], R[batch_idxs]) 210 | 211 | value_loss += self.model.value.backprop(mb_states.copy(), mb_Rs.copy()) 212 | 213 | value_loss /= self.value_epochs 214 | 215 | idxs = np.arange(len(states)) 216 | policy_loss = 0 217 | for epoch in range(self.policy_epochs): 218 | np.random.shuffle(idxs) 219 | for batch in range(0, len(states), mini_batch_size): 220 | batch_idxs = idxs[batch: batch + mini_batch_size] 221 | # stack all states, actions and Rs across all workers into a single batch 222 | mb_states, mb_actions, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], actions[batch_idxs], 223 | Adv[batch_idxs], old_policies[batch_idxs]) 224 | 225 | policy_loss += self.model.policy.backprop(mb_states.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy()) 226 | 227 | policy_loss /= self.policy_epochs 228 | l = policy_loss + value_loss 229 | 230 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 231 | render = True 232 | else: 233 | render = False 234 | 235 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 236 | #val_time = time.time() 237 | self.validation_summary(t,l,start,render) 238 | #print('validation time', time.time()-val_time) 239 | start = time.time() 240 | 241 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 242 | s += 1 243 | self.save(s) 244 | print('saved model') 245 | 246 | 247 | def get_action(self, states): 248 | policies, values = self.model.evaluate(states) 249 | actions = fastsample(policies) 250 | return actions 251 | 252 | def rollout(self): 253 | rollout = [] 254 | for t in range(self.nsteps): 255 | policies, values = self.model.evaluate(self.states) 256 | actions = fastsample(policies) 257 | next_states, rewards, dones, infos = self.env.step(actions) 258 | rollout.append((self.states, actions, rewards, values, policies, dones)) 259 | self.states = next_states 260 | 261 | states, actions, rewards, values, policies, dones = stack_many(*zip(*rollout)) 262 | policy, last_values, = self.model.evaluate(next_states) 263 | return states, actions, rewards, values, last_values, policies, dones 264 | 265 | 266 | def main(env_id): 267 | num_envs = 32 268 | nsteps = 128 269 | 270 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 271 | if any(env_id in s for s in classic_list): 272 | print('Classic Control') 273 | val_envs = [gym.make(env_id) for i in range(10)] 274 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 275 | 276 | elif 'ApplePicker' in env_id: 277 | print('ApplePicker') 278 | make_args = {'num_objects':300, 'default_reward':0} 279 | if 'Deterministic' in env_id: 280 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, 
max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 281 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 282 | for i in range(len(envs)): 283 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 284 | val_envs.reset() 285 | else: 286 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 287 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 288 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 289 | print(val_envs.envs[0]) 290 | print(envs.envs[0]) 291 | 292 | else: 293 | print('Atari') 294 | env = gym.make(env_id) 295 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 296 | reset = True 297 | print('fire on reset') 298 | else: 299 | reset = False 300 | print('only stack frames') 301 | env.close() 302 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(16)] 303 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 304 | 305 | 306 | action_size = val_envs.envs[0].action_space.n 307 | input_size = val_envs.envs[0].reset().shape 308 | 309 | 310 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 311 | train_log_dir = 'logs/PPO/' + env_id + '/Adam/' + current_time 312 | model_dir = "models/PPO/" + env_id + '/' + current_time 313 | 314 | 315 | model = DAAC(policy_model=NatureCNN, 316 | value_model=NatureCNN, 317 | input_shape=input_size, 318 | action_size=action_size, 319 | lr=5e-4, 320 | lr_final=1e-5, 321 | decay_steps=200e6//(num_envs*nsteps), 322 | grad_clip=0.5, 323 | adv_coeff=0.25, 324 | entropy_coeff=0.01, 325 | policy_clip=0.1, 326 | device='cuda' 327 | ) 328 | 329 | 330 | daac = DAACTrainer(envs=envs, 331 | model=model, 332 | model_dir=model_dir, 333 | log_dir=train_log_dir, 334 | val_envs=val_envs, 335 | train_mode='nstep', 336 | total_steps=200e6, 337 | nsteps=nsteps, 338 | policy_epochs=1, 339 | value_epochs=1, 340 | num_minibatches=8, 341 | validate_freq=1e5, 342 | save_freq=0, 343 | render_freq=0, 344 | num_val_episodes=32, 345 | log_scalars=False) 346 | daac.train() 347 | 348 | 349 | if __name__ == "__main__": 350 | import apple_picker 351 | #env_id_list = ['SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4']# 'SpaceInvadersDeterministic-v4',]# , ] 352 | #env_id_list = ['MountainCar-v0', 'Acrobot-v1', 'CartPole-v1', ] 353 | env_id_list = ['ApplePickerDeterministic-v0'] 354 | for env_id in env_id_list: 355 | main(env_id) -------------------------------------------------------------------------------- /rlib/Unreal/UnrealA2C2.py: -------------------------------------------------------------------------------- 1 | from numpy.core.fromnumeric import size 2 | import torch 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import gym 6 | import os, time, datetime 7 | 8 | from rlib.utils.utils import fastsample, fold_batch, one_hot, RunningMeanStd, stack_many, totorch, totorch_many, tonumpy, GAE 9 | from rlib.utils.schedulers import polynomial_sheduler 10 | from collections import deque 11 | from rlib.networks.networks import* 12 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 13 | from rlib.utils.VecEnv import* 14 | from rlib.utils.wrappers import* 15 | 16 | from rlib.A2C.ActorCritic import 
ActorCritic 17 | 18 | # A2C-CNN version of Unsupervised Reinforcement Learning with Auxiliary Tasks (UNREAL) https://arxiv.org/abs/1611.05397 19 | # Modifications: 20 | # no action-reward fed into policy 21 | # Use greyscaled images 22 | # deconvolute to pixel grid that overlaps FULL image 23 | # Generalised Advantage Estimation 24 | # Assumes input image size is 84x84 25 | 26 | #torch.backends.cudnn.benchmark=True 27 | 28 | def sign(x): 29 | if x < 0: 30 | return 2 31 | elif x == 0: 32 | return 0 33 | elif x > 0: 34 | return 1 35 | else: 36 | raise ValueError 37 | 38 | class UnrealA2C2(torch.nn.Module): 39 | def __init__(self, policy_model, input_shape, action_size, pixel_control=True, RP=1.0, PC=1.0, VR=1.0, entropy_coeff=0.001, value_coeff=0.5, 40 | lr=1e-3, lr_final=1e-4, decay_steps=50e6, grad_clip=0.5, policy_args={}, optim=torch.optim.RMSprop, device='cuda', optim_args={}): 41 | super(UnrealA2C2, self).__init__() 42 | self.RP, self.PC, self.VR = RP, PC, VR 43 | self.lr = lr 44 | self.entropy_coeff, self.value_coeff = entropy_coeff, value_coeff 45 | self.pixel_control = pixel_control 46 | self.grad_clip = grad_clip 47 | self.action_size = action_size 48 | self.device = device 49 | 50 | try: 51 | iterator = iter(input_shape) 52 | except TypeError: 53 | input_size = (input_shape,) 54 | 55 | self.policy = ActorCritic(policy_model, input_shape, action_size, entropy_coeff=entropy_coeff, value_coeff=value_coeff, 56 | build_optimiser=False, device=device, **policy_args) 57 | 58 | 59 | 60 | if pixel_control: 61 | self.feat_map = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 32*8*8), torch.nn.ReLU()).to(device) 62 | self.deconv1 = torch.nn.Sequential(torch.nn.ConvTranspose2d(32, 32, kernel_size=[3,3], stride=[1,1]), torch.nn.ReLU()).to(device) 63 | self.deconv_advantage = torch.nn.ConvTranspose2d(32, action_size, kernel_size=[3,3], stride=[2,2]).to(device) 64 | self.deconv_value = torch.nn.ConvTranspose2d(32, 1, kernel_size=[3,3], stride=[2,2]).to(device) 65 | 66 | # reward model 67 | self.r1 = torch.nn.Sequential(torch.nn.Linear(self.policy.dense_size, 128), torch.nn.ReLU()).to(device) 68 | self.r2 = torch.nn.Linear(128, 3).to(device) 69 | 70 | self.optimiser = optim(self.parameters(), lr, **optim_args) 71 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 72 | 73 | def forward(self, state): 74 | return self.policy.forward(state) 75 | 76 | def evaluate(self, state): 77 | return self.policy.evaluate(state) 78 | 79 | def Qaux(self, enc_state): 80 | # Auxillary Q value calculated via dueling network 81 | # Z. Wang, N. de Freitas, and M. Lanctot. Dueling Network Architectures for Deep ReinforcementLearning. 
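# --- illustrative aside (toy tensors): the dueling aggregation used by Qaux below ---
# the pixel-control head combines a per-cell value map with per-action advantage
# maps as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a), following the paper cited on
# the next line. A standalone equivalent, assuming 6 actions over a 21x21 grid:
import torch

value = torch.randn(1, 1, 21, 21)                    # V(s) per spatial cell
adv = torch.randn(1, 6, 21, 21)                      # A(s, a) per action per cell
qaux = value + adv - adv.mean(dim=1, keepdim=True)   # -> (1, 6, 21, 21)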
https://arxiv.org/pdf/1511.06581.pdf 82 | batch_size = enc_state.shape[0] 83 | feat_map = self.feat_map(enc_state).view([batch_size,32,8,8]) 84 | deconv1 = self.deconv1(feat_map) 85 | deconv_adv = self.deconv_advantage(deconv1) 86 | deconv_value = self.deconv_value(deconv1) 87 | qaux = deconv_value + deconv_adv - torch.mean(deconv_adv, dim=1, keepdim=True) 88 | return qaux 89 | 90 | def get_pixel_control(self, state:np.ndarray): 91 | with torch.no_grad(): 92 | enc_state = self.policy.model(totorch(state, self.device)) 93 | Qaux = self.Qaux(enc_state) 94 | return tonumpy(Qaux) 95 | 96 | def pixel_loss(self, Qaux, Qaux_actions, Qaux_target): 97 | # Qaux_target temporal difference target for Q_aux 98 | #print('max qaux actions', Qaux_actions) 99 | #print('action_size', self.action_size) 100 | one_hot_actions = F.one_hot(Qaux_actions.long(), self.action_size) 101 | pixel_action = one_hot_actions.view([-1,self.action_size,1,1]) 102 | Q_aux_action = torch.sum(Qaux * pixel_action, dim=1) 103 | pixel_loss = 0.5 * torch.mean(torch.square(Qaux_target - Q_aux_action)) # l2 loss for Q_aux over all pixels and batch 104 | return pixel_loss 105 | 106 | def reward_loss(self, reward_states, reward_target): 107 | r1 = self.r1(self.policy.model(reward_states)) 108 | pred_reward = self.r2(r1) 109 | reward_loss = torch.mean(F.cross_entropy(pred_reward, reward_target.long())) # cross entropy over caterogical reward 110 | return reward_loss 111 | 112 | def replay_loss(self, R, V): 113 | return torch.mean(torch.square(R - V)) 114 | 115 | def forward_loss(self, states, R, actions): 116 | states, R, actions = totorch_many(states, R, actions, device=self.device) 117 | actions_onehot = F.one_hot(actions.long(), num_classes=self.action_size) 118 | policies, values = self.forward(states) 119 | forward_loss = self.policy.loss(policies, R, values, actions_onehot) 120 | return forward_loss 121 | 122 | def auxiliary_loss(self, reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R): 123 | reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R = totorch_many(reward_states, rewards, Qaux_target, 124 | Qaux_actions, replay_states, replay_R, device=self.device) 125 | 126 | policy_enc = self.policy.model(replay_states) 127 | replay_values = self.policy.V(policy_enc) 128 | reward_loss = self.reward_loss(reward_states, rewards) 129 | replay_loss = self.replay_loss(replay_R, replay_values) 130 | aux_loss = self.RP * reward_loss + self.VR * replay_loss 131 | 132 | Qaux_actions = Qaux_actions.long() 133 | 134 | if self.pixel_control: 135 | Qaux = self.Qaux(policy_enc) 136 | pixel_loss = self.pixel_loss(Qaux, Qaux_actions, Qaux_target) 137 | aux_loss += self.PC * pixel_loss 138 | 139 | return aux_loss 140 | 141 | def backprop(self, states, R, actions, reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R): 142 | forward_loss = self.forward_loss(states, R, actions) 143 | aux_losses = self.auxiliary_loss(reward_states, rewards, Qaux_target, Qaux_actions, replay_states, replay_R) 144 | 145 | loss = forward_loss + aux_losses 146 | loss.backward() 147 | if self.grad_clip is not None: 148 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 149 | self.optimiser.step() 150 | self.optimiser.zero_grad() 151 | self.scheduler.step() 152 | return loss.detach().cpu().numpy() 153 | 154 | 155 | 156 | 157 | 158 | class UnrealTrainer(SyncMultiEnvTrainer): 159 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/UnrealA2C2', 
model_dir='models/UnrealA2C2', total_steps=1000000, nsteps=5, 160 | normalise_obs=True, validate_freq=1000000, save_freq=0, render_freq=0, num_val_episodes=50, replay_length=2000, max_val_steps=10000, log_scalars=True): 161 | 162 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, validate_freq=validate_freq, 163 | save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 164 | 165 | self.replay = deque([], maxlen=replay_length) #replay length per actor 166 | self.action_size = self.model.action_size 167 | 168 | hyper_paras = {'learning_rate':model.lr, 'grad_clip':model.grad_clip, 'nsteps':nsteps, 'num_workers':self.num_envs, 169 | 'total_steps':self.total_steps, 'entropy_coefficient':model.entropy_coeff, 'value_coefficient':model.value_coeff, 170 | 'gamma':self.gamma, 'lambda':self.lambda_} 171 | 172 | if log_scalars: 173 | filename = log_dir + '/hyperparameters.txt' 174 | self.save_hyperparameters(filename, **hyper_paras) 175 | 176 | self.normalise_obs = normalise_obs 177 | 178 | if self.normalise_obs: 179 | self.obs_running = RunningMeanStd() 180 | self.state_mean = np.zeros_like(self.states) 181 | self.state_std = np.ones_like(self.states) 182 | self.aux_reward_rolling = RunningMeanStd() 183 | 184 | def populate_memory(self): 185 | for t in range(2000//self.nsteps): 186 | states, *_ = self.rollout() 187 | #self.state_mean, self.state_std = self.obs_running.update(fold_batch(states)[...,-1:]) 188 | self.update_minmax(states) 189 | 190 | 191 | def update_minmax(self, obs): 192 | minima = obs.min() 193 | maxima = obs.max() 194 | if minima < self.state_min: 195 | self.state_min = minima 196 | if maxima > self.state_max: 197 | self.state_max = maxima 198 | 199 | def norm_obs(self, obs): 200 | ''' normalise pixel intensity changes by recording min and max pixel observations 201 | not using per pixel normalisation because expected image is singular greyscale frame 202 | ''' 203 | return (obs - self.state_min) * (1/(self.state_max - self.state_min)) 204 | 205 | def auxiliary_target(self, pixel_rewards, last_values, dones): 206 | T = len(pixel_rewards) 207 | R = np.zeros((T,*last_values.shape)) 208 | dones = dones[:,:,np.newaxis,np.newaxis] 209 | R[-1] = last_values * (1-dones[-1]) 210 | 211 | for i in reversed(range(T-1)): 212 | # restart score if done as BatchEnv automatically resets after end of episode 213 | R[i] = pixel_rewards[i] + 0.99 * R[i+1] * (1-dones[-1]) 214 | 215 | return R 216 | 217 | def pixel_rewards(self, prev_state, states): 218 | # states of rank [T, B, channels, 84, 84] 219 | T = len(states) # time length 220 | B = states.shape[1] # batch size 221 | pixel_rewards = np.zeros((T,B,21,21)) 222 | states = states[:,:,-1,:,:] 223 | prev_state = prev_state[:,-1,:,:] 224 | if self.normalise_obs: 225 | states = self.norm_obs(states) 226 | #print('states, max', states.max(), 'min', states.min(), 'mean', states.mean()) 227 | prev_state = self.norm_obs(prev_state) 228 | 229 | pixel_rewards[0] = np.abs(states[0] - prev_state).reshape(-1,4,4,21,21).mean(axis=(1,2)) 230 | for i in range(1,T): 231 | pixel_rewards[i] = np.abs(states[i] - states[i-1]).reshape(-1,4,4,21,21).mean(axis=(1,2)) 232 | #print('pixel reward',pixel_rewards.shape, 'max', pixel_rewards.max(), 'mean', pixel_rewards.mean()) 233 | return pixel_rewards 234 | 235 | def sample_replay(self): 236 | workers = np.random.choice(self.num_envs, 
replace=False, size=2)  # randomly sample experience from 2 of the n workers
237 |         sample_start = np.random.randint(1, len(self.replay) - self.nsteps - 2)
238 |         replay_sample = []
239 |         for i in range(sample_start, sample_start+self.nsteps):
240 |             replay_sample.append(self.replay[i])
241 | 
242 |         replay_states = np.stack([replay_sample[i][0][workers] for i in range(len(replay_sample))])
243 |         replay_actions = np.stack([replay_sample[i][1][workers] for i in range(len(replay_sample))])
244 |         replay_rewards = np.stack([replay_sample[i][2][workers] for i in range(len(replay_sample))])
245 |         replay_values = np.stack([replay_sample[i][3][workers] for i in range(len(replay_sample))])
246 |         replay_dones = np.stack([replay_sample[i][4][workers] for i in range(len(replay_sample))])
247 |         #print('replay dones shape', replay_dones.shape)
248 |         #print('replay_values shape', replay_values.shape)
249 | 
250 |         next_state = self.replay[sample_start+self.nsteps][0][workers]  # state following the sampled segment
251 |         _, replay_last_values = self.model.evaluate(next_state)
252 |         replay_R = GAE(replay_rewards, replay_values, replay_last_values, replay_dones, gamma=0.99, lambda_=0.95) + replay_values
253 | 
254 |         if self.model.pixel_control:
255 |             prev_states = self.replay[sample_start-1][0][workers]
256 |             Qaux_value = self.model.get_pixel_control(next_state)
257 |             pixel_rewards = self.pixel_rewards(prev_states, replay_states)
258 |             Qaux_target = self.auxiliary_target(pixel_rewards, np.max(Qaux_value, axis=1), replay_dones)
259 |         else:
260 |             Qaux_target = np.zeros((len(replay_states),1,1,1))  # produce a dummy Qaux target to avoid unnecessary branching downstream
261 | 
262 |         return fold_batch(replay_states), fold_batch(replay_actions), fold_batch(replay_R), fold_batch(Qaux_target), fold_batch(replay_dones)
263 |         #return replay_states, replay_actions, replay_R, Qaux_target, replay_dones
264 | 
265 |     def sample_reward(self):
266 |         # worker = np.random.randint(0,self.num_envs) # randomly sample from one of n workers
267 |         replay_rewards = np.array([self.replay[i][2] for i in range(len(self.replay))])
268 |         worker = np.argmax(np.sum(replay_rewards, axis=0))  # sample experience from the best-performing worker
269 |         nonzero_idxs = np.where(np.abs(replay_rewards) > 0)[0]  # timestep idxs where |reward| > 0
270 |         zero_idxs = np.where(replay_rewards == 0)[0]  # timestep idxs where reward == 0
271 | 
272 | 
273 |         if len(nonzero_idxs) == 0 or len(zero_idxs) == 0:  # if either set is empty, i.e. all rewards are zero or all are nonzero
274 |             idx = np.random.randint(len(replay_rewards))
275 |         elif np.random.uniform() > 0.5:  # otherwise sample zero and nonzero rewards equally often
276 |             #print('nonzero')
277 |             idx = np.random.choice(nonzero_idxs)
278 |         else:
279 |             idx = np.random.choice(zero_idxs)
280 | 
281 | 
282 |         reward_states = self.replay[idx][0][worker]
283 |         reward = np.array([sign(replay_rewards[idx,worker])])  # source of error
284 | 
285 |         return reward_states[None], reward
286 | 
287 |     def _train_nstep(self):
288 |         batch_size = self.num_envs * self.nsteps
289 |         num_updates = self.total_steps // batch_size
290 |         s = 0
291 |         self.state_min = 0
292 |         self.state_max = 0
293 |         self.populate_memory()
294 |         # main loop
295 |         start = time.time()
296 |         for t in range(1,num_updates+1):
297 |             states, actions, rewards, values, dones, last_values = self.rollout()
298 | 
299 |             # R = self.nstep_return(rewards, last_values, dones, clip=False)
300 |             R = GAE(rewards, values, last_values, dones, gamma=0.99, lambda_=0.95) + values
301 | 
302 |             # stack all states, actions and Rs across all workers into a single batch
303 |             states, actions, rewards, R = fold_batch(states), fold_batch(actions), fold_batch(rewards), fold_batch(R)
304 | 
305 |             #self.state_mean, self.state_std = self.obs_running.update(states[...,-1:]) # update state normalisation statistics
306 |             self.update_minmax(states)
307 | 
308 |             reward_states, sample_rewards = self.sample_reward()
309 |             replay_states, replay_actions, replay_R, Qaux_target, replay_dones = self.sample_replay()
310 | 
311 |             l = self.model.backprop(states, R, actions,
312 |                                     reward_states, sample_rewards, Qaux_target, replay_actions, replay_states, replay_R)
313 | 
314 |             if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0:
315 |                 render = True
316 |             else:
317 |                 render = False
318 | 
319 |             if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0:
320 |                 self.validation_summary(t,l,start,render)
321 |                 start = time.time()
322 | 
323 |             if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0:
324 |                 s += 1
325 |                 self.save(s)
326 |                 print('saved model')
327 | 
328 | 
329 |     def rollout(self):
330 |         rollout = []
331 |         for t in range(self.nsteps):
332 |             policies, values = self.model.evaluate(self.states)
333 |             # Qaux = self.model.get_pixel_control(self.states, self.prev_hidden, self.prev_actions_rewards[np.newaxis])
334 |             actions = fastsample(policies)
335 |             next_states, rewards, dones, infos = self.env.step(actions)
336 | 
337 |             rollout.append((self.states, actions, rewards, values, dones))
338 |             self.replay.append((self.states, actions, rewards, values, dones))  # add to replay memory
339 |             self.states = next_states
340 | 
341 |         states, actions, rewards, values, dones = stack_many(*zip(*rollout))
342 |         _, last_values = self.model.evaluate(next_states)
343 |         return states, actions, rewards, values, dones, last_values
344 | 
345 |     def get_action(self, state):
346 |         policy, value = self.model.evaluate(state)
347 |         action = int(np.random.choice(policy.shape[1], p=policy[0]))
348 |         return action
349 | 
350 | 
351 | def main(env_id):
352 |     num_envs = 32
353 |     nsteps = 20
354 | 
355 | 
356 | 
357 | 
358 |     classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1']
359 |     if any(env_id in s for s in classic_list):
360 |         print('Classic Control')
361 |         val_envs = [gym.make(env_id) for i in range(16)]
362 |         envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False)
363 | 
364 |     elif 'ApplePicker' in env_id:
365 |
print('ApplePicker') 366 | make_args = {'num_objects':300, 'default_reward':0} 367 | val_envs = [apple_pickgame(gym.make(env_id, **make_args), max_steps=5000, auto_reset=False, grey_scale=False, k=1) for i in range(15)] 368 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, grey_scale=False, k=1, make_args=make_args) 369 | print(val_envs[0]) 370 | print(envs.envs[0]) 371 | 372 | else: 373 | print('Atari') 374 | env = gym.make(env_id) 375 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 376 | reset = True 377 | print('fire on reset') 378 | else: 379 | reset = False 380 | print('only stack frames') 381 | env.close() 382 | val_envs = [AtariEnv(gym.make(env_id), k=4, episodic=False, reset=reset, clip_reward=False) for i in range(15)] 383 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True, time_limit=4500) 384 | 385 | 386 | 387 | action_size = val_envs[0].action_space.n 388 | input_size = val_envs[0].reset().shape 389 | 390 | 391 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 392 | train_log_dir = 'logs/UnrealA2C2/' + env_id + '/' + current_time 393 | model_dir = "models/UnrealA2C2/" + env_id + '/' + current_time 394 | 395 | 396 | 397 | model = UnrealA2C2(UniverseCNN, 398 | input_shape=input_size, 399 | action_size=action_size, 400 | PC=1, 401 | entropy_coeff=0.01, 402 | lr=1e-3, 403 | lr_final=1e-6, 404 | decay_steps=50e6//(num_envs*nsteps), 405 | pixel_control=True, 406 | grad_clip=0.5, 407 | policy_args=dict(), 408 | ).cuda() 409 | 410 | 411 | 412 | auxiliary = UnrealTrainer(envs=envs, 413 | model=model, 414 | model_dir=model_dir, 415 | log_dir=train_log_dir, 416 | val_envs=val_envs, 417 | train_mode='nstep', 418 | total_steps=50e6, 419 | nsteps=nsteps, 420 | normalise_obs=True, 421 | validate_freq=5e5, 422 | save_freq=0, 423 | render_freq=0, 424 | num_val_episodes=15, 425 | log_scalars=True) 426 | 427 | 428 | 429 | 430 | 431 | auxiliary.train() 432 | 433 | del auxiliary 434 | 435 | 436 | if __name__ == "__main__": 437 | import apple_picker 438 | env_id_list = ['SpaceInvadersDeterministic-v4', 'MontezumaRevengeDeterministic-v4' 'FreewayDeterministic-v4', 'PongDeterministic-v4' ] 439 | #env_id_list = ['MountainCar-v0','CartPole-v1', 'Acrobot-v1'] 440 | env_id_list = ['ApplePicker-v0'] 441 | for env_id in env_id_list: 442 | main(env_id) 443 | -------------------------------------------------------------------------------- /rlib/RND/RND.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import time, datetime 4 | import torch 5 | import torch.nn.functional as F 6 | from rlib.utils.utils import fold_batch, one_hot, Welfords_algorithm, stack_many, RunningMeanStd, tonumpy_many 7 | 8 | from rlib.networks.networks import * 9 | from rlib.utils.VecEnv import* 10 | from rlib.utils.wrappers import* 11 | from rlib.utils.SyncMultiEnvTrainer import SyncMultiEnvTrainer 12 | from rlib.utils.utils import fastsample, fold_batch, tonumpy, totorch, totorch_many, stack_many, fold_many 13 | from rlib.utils.schedulers import polynomial_sheduler 14 | 15 | 16 | class RewardForwardFilter(object): 17 | # https://github.com/openai/random-network-distillation 18 | def __init__(self, gamma): 19 | self.rewems = None 20 | self.gamma = gamma 21 | def update(self, rews): 22 | if self.rewems is None: 23 | self.rewems = rews 24 | else: 25 | self.rewems = self.rewems * self.gamma + rews 26 | return self.rewems 27 | 28 | 29 | 
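# A minimal usage sketch (illustrative, not part of the original module) of how RewardForwardFilter is
# combined with RunningMeanStd to normalise intrinsic rewards, mirroring RNDTrainer._train_nstep below
# and assuming RunningMeanStd.update returns (mean, std) as it is used in this file:
#
#   rff = RewardForwardFilter(gamma=0.99)
#   intr_rolling = RunningMeanStd()
#   for intr_rewards_t in intr_rewards:                        # each entry has shape [num_envs]
#       _, std = intr_rolling.update(rff.update(intr_rewards_t))
#   normalised_intr_rewards = intr_rewards / std               # divide by running std of the discounted intrinsic return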
30 | class PPOIntrinsic(torch.nn.Module):
31 |     def __init__(self, model, input_size, action_size, lr=1e-3, lr_final=0, decay_steps=6e5, grad_clip=0.5,
32 |                  entropy_coeff=0.01, policy_clip=0.1, extr_coeff=2.0, intr_coeff=1.0,
33 |                  build_optimiser=True, optim=torch.optim.Adam, optim_args={}, device='cuda', **model_args):
34 |         super(PPOIntrinsic, self).__init__()
35 |         self.action_size = action_size
36 |         self.input_size = input_size
37 | 
38 |         self.lr = lr
39 |         self.lr_final = lr_final
40 |         self.decay_steps = decay_steps
41 |         self.grad_clip = grad_clip
42 | 
43 |         self.entropy_coeff = entropy_coeff
44 |         self.policy_clip = policy_clip
45 |         self.extr_coeff = extr_coeff
46 |         self.intr_coeff = intr_coeff
47 | 
48 |         self.device = device
49 | 
50 |         self.model = model(input_size, **model_args).to(self.device)
51 |         self.dense_size = dense_size = self.model.dense_size
52 |         self.policy = torch.nn.Sequential(torch.nn.Linear(dense_size, action_size), torch.nn.Softmax(dim=-1)).to(self.device)  # Actor
53 |         self.Ve = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (extrinsic), i.e. expected extrinsic return of a state
54 |         self.Vi = torch.nn.Linear(dense_size, 1).to(self.device)  # Critic (intrinsic), i.e. expected intrinsic return of a state
55 | 
56 |         if build_optimiser:
57 |             self.optimiser = optim(self.parameters(), lr, **optim_args)
58 |             self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1)
59 | 
60 | 
61 |     def forward(self, state):
62 |         state_enc = self.model(state)
63 |         policy = self.policy(state_enc)
64 |         value_extr = self.Ve(state_enc).view(-1)
65 |         value_intr = self.Vi(state_enc).view(-1)
66 |         return policy, value_extr, value_intr
67 | 
68 |     def evaluate(self, state):
69 |         with torch.no_grad():
70 |             policy, value_extr, value_intr = self.forward(totorch(state, self.device))
71 |         return tonumpy(policy), tonumpy(value_extr), tonumpy(value_intr)
72 | 
73 | 
74 |     def loss(self, policy, Re, Ri, Ve, Vi, Adv, action_onehot, old_policy):
75 |         extr_value_loss = 0.5 * torch.mean(torch.square(Re - Ve))
76 |         intr_value_loss = 0.5 * torch.mean(torch.square(Ri - Vi))
77 | 
78 |         policy_actions = torch.sum(policy * action_onehot, dim=1)
79 |         old_policy_actions = torch.sum(old_policy * action_onehot, dim=1)
80 |         ratio = policy_actions / old_policy_actions
81 |         policy_loss_unclipped = ratio * -Adv
82 |         policy_loss_clipped = torch.clamp(ratio, 1 - self.policy_clip, 1 + self.policy_clip) * -Adv
83 |         policy_loss = torch.mean(torch.maximum(policy_loss_unclipped, policy_loss_clipped))
84 |         entropy = torch.mean(torch.sum(policy * -torch.log(policy), dim=1))
85 | 
86 |         value_loss = self.extr_coeff * extr_value_loss + self.intr_coeff * intr_value_loss
87 |         loss = policy_loss + value_loss - self.entropy_coeff * entropy
88 |         return loss
89 | 
90 |     def backprop(self, state, Re, Ri, Adv, action, old_policy):
91 |         state, action, Re, Ri, Adv, old_policy = totorch_many(state, action, Re, Ri, Adv, old_policy, device=self.device)
92 |         action_onehot = F.one_hot(action.long(), self.action_size)
93 |         policy, Ve, Vi = self.forward(state)
94 |         loss = self.loss(policy, Re, Ri, Ve, Vi, Adv, action_onehot, old_policy)
95 | 
96 |         loss.backward()
97 |         if self.grad_clip is not None:
98 |             torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip)
99 |         self.optimiser.step()
100 |         self.optimiser.zero_grad()
101 |         self.scheduler.step()
102 |         return loss.detach().cpu().numpy()
103 | 
104 | 
105 | class PredictorCNN(torch.nn.Module):
106 |     def __init__(self, input_size, conv1_size=32, conv2_size=64, conv3_size=64, dense_size=512, padding=[0,0], init_scale=np.sqrt(2),
scale=True, trainable=True): 107 | # input_shape [channels, height, width] 108 | super(PredictorCNN, self).__init__() 109 | self.scale = scale 110 | self.dense_size = dense_size 111 | self.input_size = input_size 112 | self.init_scale = init_scale 113 | self.h1 = torch.nn.Sequential(torch.nn.Conv2d(input_size[0], conv1_size, kernel_size=[8,8], stride=[4,4], padding=padding), torch.nn.LeakyReLU()) 114 | self.h2 = torch.nn.Sequential(torch.nn.Conv2d(conv1_size, conv2_size, kernel_size=[4,4], stride=[2,2], padding=padding), torch.nn.LeakyReLU()) 115 | self.h3 = torch.nn.Sequential(torch.nn.Conv2d(conv2_size, conv3_size, kernel_size=[3,3], stride=[1,1], padding=padding), torch.nn.LeakyReLU()) 116 | self.flatten = torch.nn.Flatten() 117 | c, h, w = self._conv_outsize() 118 | in_size = h*w*c 119 | if trainable: 120 | self.dense = torch.nn.Sequential( 121 | torch.nn.Linear(h*w*c, dense_size), torch.nn.ReLU(), 122 | torch.nn.Linear(dense_size, dense_size), torch.nn.ReLU(), 123 | torch.nn.Linear(dense_size, dense_size) 124 | ) 125 | else: 126 | self.dense = torch.nn.Linear(h*w*c, dense_size) 127 | 128 | self.init_weights() 129 | self.set_trainable(trainable) 130 | 131 | def set_trainable(self, trainable): 132 | if not trainable: 133 | for param in self.parameters(): 134 | param.requires_grad = False 135 | 136 | def init_weights(self): 137 | self.apply(self._init_weights) 138 | 139 | def _init_weights(self, module): 140 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 141 | torch.nn.init.orthogonal_(module.weight, gain=self.init_scale) 142 | 143 | def _conv_outsize(self): 144 | _, h, w = self.input_size 145 | h, w = conv2d_outsize(h, w, self.h1[0].kernel_size, self.h1[0].stride, self.h1[0].padding) 146 | h, w = conv2d_outsize(h, w, self.h2[0].kernel_size, self.h2[0].stride, self.h2[0].padding) 147 | h, w = conv2d_outsize(h, w, self.h3[0].kernel_size, self.h3[0].stride, self.h3[0].padding) 148 | return self.h3[0].out_channels, h, w 149 | 150 | def forward(self, x): 151 | x = x/255 if self.scale else x 152 | x = self.h1(x) 153 | x = self.h2(x) 154 | x = self.h3(x) 155 | x = self.flatten(x) 156 | x = self.dense(x) 157 | return x 158 | 159 | class PredictorMLP(torch.nn.Module): 160 | def __init__(self, input_size, num_layers=2, dense_size=64, activation=torch.nn.LeakyReLU, init_scale=np.sqrt(2), trainable=True): 161 | # input_shape = feature_size 162 | super(PredictorMLP, self).__init__() 163 | self.dense_size = dense_size 164 | self.input_size = input_size 165 | self.init_scale = init_scale 166 | layers = [] 167 | in_size = input_size 168 | for l in range(num_layers): 169 | layers.append(torch.nn.Linear(in_size, dense_size)) 170 | layers.append(activation()) 171 | in_size = dense_size 172 | layers.append(torch.nn.Linear(dense_size, dense_size)) 173 | self.layers = torch.nn.ModuleList(layers) 174 | 175 | self.init_weights() 176 | self.set_trainable(trainable) 177 | 178 | def set_trainable(self, trainable): 179 | if not trainable: 180 | for param in self.parameters(): 181 | param.requires_grad = False 182 | 183 | def init_weights(self): 184 | self.apply(self._init_weights) 185 | 186 | def _init_weights(self, module): 187 | if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): 188 | torch.nn.init.orthogonal_(module.weight, gain=self.init_scale) 189 | 190 | def forward(self, x): 191 | for layer in self.layers: 192 | x = layer(x) 193 | return x 194 | 195 | class RND(torch.nn.Module): 196 | # EXPLORATION BY RANDOM NETWORK DISTILLATION 197 | # https://arxiv.org/pdf/1810.12894.pdf 198 | def 
__init__(self, policy_model, target_model, input_size, action_size, entropy_coeff=0.001, 199 | intr_coeff=0.5, extr_coeff=1.0, lr=1e-4, lr_final=0, decay_steps=1e5, grad_clip=0.5, policy_clip=0.1, policy_args={}, RND_args={}, optim=torch.optim.Adam, optim_args={}, device='cuda'): 200 | super(RND, self).__init__() 201 | self.intr_coeff = intr_coeff 202 | self.extr_coeff = extr_coeff 203 | self.entropy_coeff = entropy_coeff 204 | self.lr = lr 205 | self.grad_clip = grad_clip 206 | self.action_size = action_size 207 | self.device = device 208 | 209 | target_size = (1, input_size[1], input_size[2]) if len(input_size) == 3 else input_size # only use last frame in frame-stack for convolutions 210 | 211 | self.policy = PPOIntrinsic(policy_model, input_size, action_size, lr, lr_final, decay_steps, grad_clip, 212 | entropy_coeff=entropy_coeff, policy_clip=policy_clip, extr_coeff=extr_coeff, intr_coeff=intr_coeff, device=device, build_optimiser=False, **policy_args) 213 | 214 | # randomly weighted and fixed neural network, acts as a random_id for each state 215 | self.target_model = target_model(target_size, trainable=False).to(device) 216 | 217 | # learns to predict target model 218 | # i.e. provides rewards based ability to predict a fixed random function, thus behaves as density map of explored areas 219 | self.predictor_model = target_model(target_size, trainable=True).to(device) 220 | 221 | self.optimiser = optim(self.parameters(), lr, **optim_args) 222 | self.scheduler = polynomial_sheduler(self.optimiser, lr_final, decay_steps, power=1) 223 | 224 | 225 | def forward(self, state): 226 | return self.policy.forward(state) 227 | 228 | def evaluate(self, state): 229 | return self.policy.evaluate(state) 230 | 231 | def _intr_reward(self, next_state, state_mean, state_std): 232 | norm_next_state = torch.clip((next_state - state_mean) / state_std, -5, 5) 233 | intr_reward = torch.square(self.predictor_model(norm_next_state) - self.target_model(norm_next_state).detach()).sum(dim=-1) 234 | return intr_reward 235 | 236 | def intrinsic_reward(self, next_state:np.ndarray, state_mean:np.ndarray, state_std): 237 | next_state, state_mean, state_std = totorch_many(next_state, state_mean, state_std, device=self.device) 238 | with torch.no_grad(): 239 | intr_reward = self._intr_reward(next_state, state_mean, state_std) 240 | return tonumpy(intr_reward) 241 | 242 | 243 | def backprop(self, state, next_state, R_extr, R_intr, Adv, actions, old_policy, state_mean, state_std): 244 | state, next_state, R_extr, R_intr, Adv, actions, old_policy, state_mean, state_std = totorch_many(state, next_state, R_extr, R_intr, 245 | Adv, actions, old_policy, state_mean, state_std, device=self.device) 246 | policy, Ve, Vi = self.policy.forward(state) 247 | actions_onehot = F.one_hot(actions.long(), self.action_size) 248 | policy_loss = self.policy.loss(policy, R_extr, R_intr, Ve, Vi, Adv, actions_onehot, old_policy) 249 | 250 | predictor_loss = self._intr_reward(next_state, state_mean, state_std).mean() 251 | loss = policy_loss + predictor_loss 252 | 253 | loss.backward() 254 | if self.grad_clip is not None: 255 | torch.nn.utils.clip_grad_norm_(self.parameters(), self.grad_clip) 256 | self.optimiser.step() 257 | self.optimiser.zero_grad() 258 | self.scheduler.step() 259 | return loss.detach().cpu().numpy() 260 | 261 | 262 | class RNDTrainer(SyncMultiEnvTrainer): 263 | def __init__(self, envs, model, val_envs, train_mode='nstep', log_dir='logs/', model_dir='models/', total_steps=1000000, nsteps=5, gamma_extr=0.999, gamma_intr=0.99, 
lambda_=0.95, 264 | init_obs_steps=600, num_epochs=4, num_minibatches=4, validate_freq=1000000.0, save_freq=0, render_freq=0, num_val_episodes=50, max_val_steps=10000, log_scalars=True): 265 | 266 | super().__init__(envs, model, val_envs, train_mode=train_mode, log_dir=log_dir, model_dir=model_dir, total_steps=total_steps, nsteps=nsteps, gamma=gamma_extr, lambda_=lambda_, 267 | validate_freq=validate_freq, save_freq=save_freq, render_freq=render_freq, update_target_freq=0, num_val_episodes=num_val_episodes, max_val_steps=max_val_steps, log_scalars=log_scalars) 268 | 269 | self.gamma_intr = gamma_intr 270 | self.num_epochs = num_epochs 271 | self.num_minibatches = num_minibatches 272 | self.pred_prob = 1 / (self.num_envs / 32.0) 273 | self.state_obs = RunningMeanStd() 274 | self.forward_filter = RewardForwardFilter(gamma_intr) 275 | self.intr_rolling = RunningMeanStd() 276 | self.init_obs_steps = init_obs_steps 277 | 278 | hyper_paras = {'learning_rate':model.lr, 'grad_clip':model.grad_clip, 'nsteps':self.nsteps, 'num_workers':self.num_envs, 'total_steps':self.total_steps, 279 | 'entropy_coefficient':0.001, 'value_coefficient':1.0, 'intrinsic_value_coefficient':model.intr_coeff, 280 | 'extrinsic_value_coefficient':model.extr_coeff, 'init_obs_steps':init_obs_steps, 'gamma_intrinsic':self.gamma_intr, 'gamma_extrinsic':self.gamma, 281 | 'lambda':self.lambda_, 'predictor_dropout_probability':self.pred_prob 282 | } 283 | 284 | if log_scalars: 285 | filename = log_dir + '/hyperparameters.txt' 286 | self.save_hyperparameters(filename, **hyper_paras) 287 | 288 | def init_state_obs(self, num_steps): 289 | states = 0 290 | for i in range(num_steps): 291 | rand_actions = np.random.randint(0, self.model.action_size, size=self.num_envs) 292 | next_states, rewards, dones, infos = self.env.step(rand_actions) 293 | next_states = next_states[:, -1] if len(next_states.shape) == 4 else next_states # [num_envs, channels, height, width] for convolutions, assume frame stack 294 | states += next_states 295 | return states / num_steps 296 | 297 | 298 | def _train_nstep(self): 299 | # stats for normalising states 300 | self.state_mean, self.state_std = self.state_obs.update(self.init_state_obs(self.init_obs_steps)) 301 | self.states = self.env.reset() # reset to state s_0 302 | 303 | batch_size = self.num_envs * self.nsteps 304 | num_updates = self.total_steps // batch_size 305 | s = 0 306 | mini_batch_size = self.nsteps//self.num_minibatches 307 | start = time.time() 308 | # main loop 309 | for t in range(1,num_updates+1): 310 | states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, old_policies, dones = self.rollout() 311 | self.state_mean, self.state_std = self.state_obs.update(next_states) # update state normalisation statistics 312 | mean, std = self.state_mean, self.state_std 313 | 314 | int_rff = np.array([self.forward_filter.update(intr_rewards[i]) for i in range(len(intr_rewards))]) 315 | R_intr_mean, R_intr_std = self.intr_rolling.update(int_rff.ravel()) # normalise intrinsic rewards 316 | intr_rewards /= R_intr_std 317 | 318 | 319 | Adv_extr = self.GAE(extr_rewards, values_extr, last_values_extr, dones, gamma=self.gamma, lambda_=self.lambda_) 320 | Adv_intr = self.GAE(intr_rewards, values_intr, last_values_intr, dones, gamma=self.gamma_intr, lambda_=self.lambda_) 321 | Re = Adv_extr + values_extr 322 | Ri = Adv_intr + values_intr 323 | total_Adv = Adv_extr + Adv_intr 324 | l = 0 325 | 326 | # perform minibatch gradient descent for K epochs 
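# The loop below shuffles indices over the time axis only (`states` has shape [nsteps, num_envs, ...]);
# each slice of nsteps//num_minibatches timesteps is folded across all workers with fold_many before the
# gradient step, and the predictor update only sees a random subset of next-states selected with
# probability self.pred_prob (the 'predictor_dropout_probability' hyperparameter logged above).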
327 | idxs = np.arange(len(states)) 328 | for epoch in range(self.num_epochs): 329 | np.random.shuffle(idxs) 330 | for batch in range(0, len(states), mini_batch_size): 331 | batch_idxs = idxs[batch: batch + mini_batch_size] 332 | # stack all states, actions and Rs across all workers into a single batch 333 | mb_states, mb_nextstates, mb_actions, mb_Re, mb_Ri, mb_Adv, mb_old_policies = fold_many(states[batch_idxs], next_states[batch_idxs], \ 334 | actions[batch_idxs], Re[batch_idxs], Ri[batch_idxs], \ 335 | total_Adv[batch_idxs], old_policies[batch_idxs]) 336 | 337 | mb_nextstates = mb_nextstates[np.where(np.random.uniform(size=(mini_batch_size)) < self.pred_prob)] 338 | l += self.model.backprop(mb_states.copy(), mb_nextstates.copy(), mb_Re.copy(), mb_Ri.copy(), mb_Adv.copy(), mb_actions.copy(), mb_old_policies.copy(), mean.copy(), std.copy()) 339 | 340 | 341 | l /= self.num_epochs 342 | 343 | 344 | if self.render_freq > 0 and t % ((self.validate_freq // batch_size) * self.render_freq) == 0: 345 | render = True 346 | else: 347 | render = False 348 | 349 | if self.validate_freq > 0 and t % (self.validate_freq // batch_size) == 0: 350 | self.validation_summary(t,l,start,render) 351 | start = time.time() 352 | 353 | if self.save_freq > 0 and t % (self.save_freq // batch_size) == 0: 354 | s += 1 355 | self.save(s) 356 | print('saved model') 357 | 358 | 359 | def get_action(self, states): 360 | policies, values_extr, values_intr = self.model.evaluate(states) 361 | actions = fastsample(policies) 362 | return actions 363 | 364 | def rollout(self): 365 | rollout = [] 366 | for t in range(self.nsteps): 367 | policies, values_extr, values_intr = self.model.evaluate(self.states) 368 | actions = fastsample(policies) 369 | next_states, extr_rewards, dones, infos = self.env.step(actions) 370 | 371 | next_states__ = next_states[:, -1:] if len(next_states.shape) == 4 else next_states # [num_envs, channels, height, width] for convolutions 372 | intr_rewards = self.model.intrinsic_reward(next_states__, self.state_mean, self.state_std) 373 | 374 | rollout.append((self.states, next_states__, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones)) 375 | self.states = next_states 376 | 377 | states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, policies, dones = stack_many(*zip(*rollout)) 378 | last_policy, last_values_extr, last_values_intr, = self.model.evaluate(self.states) 379 | return states, next_states, actions, extr_rewards, intr_rewards, values_extr, values_intr, last_values_extr, last_values_intr, policies, dones 380 | 381 | 382 | def main(env_id): 383 | num_envs = 32 384 | nsteps = 128 385 | 386 | classic_list = ['MountainCar-v0', 'Acrobot-v1', 'LunarLander-v2', 'CartPole-v0', 'CartPole-v1'] 387 | if any(env_id in s for s in classic_list): 388 | print('Classic Control') 389 | val_envs = [gym.make(env_id) for i in range(10)] 390 | envs = BatchEnv(DummyEnv, env_id, num_envs, blocking=False) 391 | 392 | elif 'ApplePicker' in env_id: 393 | print('ApplePicker') 394 | make_args = {'num_objects':300, 'default_reward':0} 395 | if 'Deterministic' in env_id: 396 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True, make_args=make_args) 397 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True, make_args=make_args) 398 | for i in range(len(envs)): 399 | val_envs.envs[i].set_locs(envs.envs[i].item_locs_master, envs.envs[i].start_loc) 400 | 
val_envs.reset() 401 | else: 402 | #val_envs = [apple_pickgame(gym.make(env_id), max_steps=5000, auto_reset=False, k=1) for i in range(16)] 403 | val_envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=False, k=4, grey_scale=True) 404 | envs = DummyBatchEnv(apple_pickgame, env_id, num_envs, max_steps=5000, auto_reset=True, k=4, grey_scale=True) 405 | print(val_envs.envs[0]) 406 | print(envs.envs[0]) 407 | 408 | else: 409 | print('Atari') 410 | env = gym.make(env_id) 411 | if env.unwrapped.get_action_meanings()[1] == 'FIRE': 412 | reset = True 413 | print('fire on reset') 414 | else: 415 | reset = False 416 | print('only stack frames') 417 | env.close() 418 | val_envs = BatchEnv(AtariEnv, env_id, num_envs=16, k=4, blocking=False, episodic=False, reset=reset, clip_reward=False, auto_reset=True) 419 | envs = BatchEnv(AtariEnv, env_id, num_envs, blocking=False, k=4, reset=reset, episodic=False, clip_reward=True) 420 | 421 | 422 | action_size = val_envs.envs[0].action_space.n 423 | input_size = val_envs.envs[0].reset().shape 424 | 425 | print('action_size', action_size) 426 | print('input_size', input_size) 427 | 428 | 429 | current_time = datetime.datetime.now().strftime('%y-%m-%d_%H-%M-%S') 430 | train_log_dir = 'logs/RND/' + env_id + '/Adam/' + current_time 431 | model_dir = "models/RND/" + env_id + '/' + current_time 432 | 433 | 434 | model = RND(NatureCNN, 435 | PredictorCNN, 436 | input_size=input_size, 437 | action_size=action_size, 438 | lr=1e-4, 439 | lr_final=1e-5, 440 | decay_steps=200e6//(num_envs*nsteps), 441 | grad_clip=0.5, 442 | intr_coeff=1.0, 443 | extr_coeff=2.0, 444 | entropy_coeff=0.001, 445 | optim=torch.optim.Adam, 446 | optim_args={}, 447 | device='cuda' 448 | ) 449 | 450 | 451 | rnd = RNDTrainer(envs=envs, 452 | model=model, 453 | model_dir=model_dir, 454 | log_dir=train_log_dir, 455 | val_envs=val_envs, 456 | train_mode='nstep', 457 | total_steps=200e6, 458 | nsteps=nsteps, 459 | init_obs_steps=128*50, 460 | num_epochs=4, 461 | num_minibatches=4, 462 | validate_freq=1e5, 463 | save_freq=0, 464 | render_freq=0, 465 | num_val_episodes=32, 466 | log_scalars=False) 467 | rnd.train() 468 | 469 | 470 | if __name__ == "__main__": 471 | env_id_list = ['MontezumaRevengeDeterministic-v4', 'SpaceInvadersDeterministic-v4', 'FreewayDeterministic-v4'] 472 | #env_id_list = ['MountainCar-v0', 'CartPole-v1' , 'Acrobot-v1', ] 473 | for env_id in env_id_list: 474 | main(env_id) 475 | --------------------------------------------------------------------------------