├── .demo
    ├── pong-v4_dist4c_nonsmooth.png
    ├── pong-v4_dist4c_smooth.png
    ├── pong-v4_double.png
    ├── pong-v4_dueling.png
    ├── pong-v4_origsingle.png
    ├── pong-v4_prioritized.png
    ├── pong-v4_various_smooth.png
    ├── tensorboard_text.png
    └── test_agent.gif
├── .gitignore
├── LICENSE
├── README.md
├── dev_setup.py
├── examples
    ├── README.md
    ├── deep_q_learning
    │   ├── README.md
    │   ├── atari_play.py
    │   ├── gym_workout.py
    │   ├── run_project
    │   │   ├── ATARI.sh
    │   │   ├── GYM.sh
    │   │   ├── TEST.sh
    │   │   ├── atari_config.yaml
    │   │   └── gym_config.yaml
    │   └── test.py
    ├── gorila_dqn
    │   ├── README.md
    │   ├── client.py
    │   ├── launcher.py
    │   ├── run_project
    │   │   ├── ATARI.sh
    │   │   ├── TEST.sh
    │   │   └── config.yaml
    │   ├── server.py
    │   └── test.py
    └── prioritized_dqn
    │   ├── README.md
    │   ├── learn.py
    │   ├── run_project
    │       ├── ATARI.sh
    │       ├── TEST.sh
    │       └── config.yaml
    │   └── test.py
└── pytorl
    ├── README.md
    ├── __init__.py
    ├── agents
        ├── DQN.py
        ├── __init__.py
        ├── _base_agent.py
        └── dist_DQN.py
    ├── distributed
        ├── __init__.py
        ├── _slurm.py
        ├── async_ops.py
        ├── initialize.py
        ├── param_server.py
        └── sync_ops.py
    ├── envs
        ├── __init__.py
        ├── _base_env.py
        ├── ale_atari.py
        └── gym_ctrl.py
    ├── lib
        ├── __init__.py
        ├── _tree.py
        ├── explore.py
        └── replay.py
    ├── networks
        ├── __init__.py
        ├── atari_conv.py
        ├── ctrl_mlp.py
        └── io.py
    ├── settings
        ├── __init__.py
        └── entries.py
    └── utils
        ├── __init__.py
        ├── config.py
        ├── decorators.py
        └── recorder.py


/.demo/pong-v4_dist4c_nonsmooth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_dist4c_nonsmooth.png


--------------------------------------------------------------------------------
/.demo/pong-v4_dist4c_smooth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_dist4c_smooth.png


--------------------------------------------------------------------------------
/.demo/pong-v4_double.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_double.png


--------------------------------------------------------------------------------
/.demo/pong-v4_dueling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_dueling.png


--------------------------------------------------------------------------------
/.demo/pong-v4_origsingle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_origsingle.png


--------------------------------------------------------------------------------
/.demo/pong-v4_prioritized.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_prioritized.png


--------------------------------------------------------------------------------
/.demo/pong-v4_various_smooth.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/pong-v4_various_smooth.png


--------------------------------------------------------------------------------
/.demo/tensorboard_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/tensorboard_text.png


--------------------------------------------------------------------------------
/.demo/test_agent.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/.demo/test_agent.gif


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # jupyter notebook for testing
 2 | test_playground*.ipynb
 3 | debug_playground*.ipynb
 4 | visualization.ipynb
 5 | .ipynb_checkpoints
 6 | 
 7 | # Byte-compiled / optimized / DLL files
 8 | __pycache__/
 9 | *.py[cod]
10 | *$py.class
11 | 
12 | # network state_dict
13 | *.pth
14 | 
15 | # train log
16 | log.txt
17 | 
18 | # tensorboard files
19 | events.out.tfevents.*
20 | 
21 | # setting file(s):
22 | pytorl.yaml
23 | 
24 | # others:
25 | .DS_Store
26 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Zhe Huang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PyToRL: PyTorch Toolbox for Reinforcement Learning
 2 | ### [PROJECT CURRENTLY UNDER DEVELOPMENT]
 3 | 
 4 | &nbsp;  
 5 | 
 6 | **Simple Description:**
 7 | <pre>
 8 | This project, named pytorl, is intended to be an RL toolbox for pytorch and 
 9 | contains RL algorithm implementations using this pytorl toolbox. As I am 
10 | currently learning RL, I am going to update this project with other agents, 
11 | algorithms and faster or more efficient implementations soon. 
12 | </pre>
13 | 
14 | &nbsp;  
15 | 
16 | **Current Progress:**
17 | <pre>
18 | Implemented 4 DQN(and its variants) algorithms and a distributed DQN learning
19 | algorithm named Gorila via parameter server architecture. Will move on to A2C,
20 | A3C, TRPO, PPO ...
21 | 
22 | Note that I use slurm for distributed RL training since my work is done on
23 | clusters, but I still provide a "local run" option which lets the project run
24 | without slurm :).
25 | </pre>
26 | 
27 | &nbsp;  
28 | 
29 | **Some Dependencies Currently Used for Developing:**
30 | > gym == 0.10.11 with atari  
31 | > numpy == 1.14.3  
32 | > python == 3.6.5  
33 | > pytorch == 1.0.1  
34 | > tensorboard == 1.9.0  
35 | > tensorboardX == 1.4  
36 | 
37 | &nbsp;  
38 | 
39 | **Simple Setup from Scratch:**
40 | ```bash
41 | # 1. clone this repo to local
42 | $ git clone <this repo address>.git
43 | # 2. run simple setup script
44 | # the <path> here is where you want to put your experiments later on
45 | # experiments launched through the "rlrun" entry provided by this toolbox are
46 | # automatically moved to <path> and executed there rather than in the
47 | # directory you are currently working in, which is convenient for developing
48 | $ python dev_setup.py -dir <path>
49 | ```
50 | 
51 | &nbsp;  
52 | 
53 | **Run Example Projects after Setup:**
54 | ```bash
55 | # run_project folder contains scripts for job starting
56 | $ cd examples/<project main folder>/run_project
57 | $ sh <script filename> [-h] <options>
58 | # for example, you can try sh ATARI.sh -N testrun --local
59 | # you can always use sh <script filename> [-h] for options help
60 | ```
61 | 
62 | &nbsp;  
63 | 
64 | **Demo of Some RL Results:**  
65 | 
66 | ![pong-v4_various_smooth](./.demo/pong-v4_various_smooth.png)  
67 | *(reward/episode while training ale atari PongNoFrameskip-v4 via diff. DQN algorithms)*  
68 | 
69 | &nbsp;  
70 | 
71 | **Acknowledgements:**  
72 | 1. During the development, I referred to some helpful resources, listed as follows:  
73 |     - @qfettes/DeepRL-Tutorials repo for nice tutorials and designs
74 |     - @openai/baselines for some implementations
75 |     - my mentor and colleague Huabin Zheng (https://dblp.org/pers/hd/z/Zheng:Huabin)  
76 |       for technical consultation and extraordinary design ideas
77 |     - other helpful sources and geniuses from websites such as stackoverflow, GeeksforGeeks, etc.
78 | 
79 | <pre>    Thanks to them all for making our open source world a better place! :) </pre>
80 | 


--------------------------------------------------------------------------------
/dev_setup.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import argparse
  3 | import os
  4 | import re
  5 | from site import getsitepackages
  6 | import yaml
  7 | 
  8 | """
  9 | Since the whole project is under development, for convenient modification I temporarily do not use
 10 | setuptools. So, this module serves as a 'mock' installation.
 11 | """
 12 | 
 13 | # the package name is kept in a variable because it may still change during development
 14 | MODULE_NAME = 'pytorl'
 15 | # dotted path of the module that provides the command line entry functions
 16 | ENTRY_MODULE = '%s.settings.entries' % MODULE_NAME
 17 | # maps each command name (used as launcher file name) to a function name in ENTRY_MODULE
 18 | ENTRY_POINTS = {
 19 |     'rlrun': 'rl_run',
 20 |     # 'lrun': 'lrun',
 21 | }
 22 | # launcher script text; setup() fills {0}=ENTRY_MODULE, {1}=function name, {2}=shebang line
 23 | TEMPLATE = '''{2}
 24 | # -*- coding: utf-8 -*-
 25 | import re
 26 | import sys
 27 | 
 28 | from {0} import {1}
 29 | 
 30 | if __name__ == '__main__':
 31 |     sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
 32 |     sys.exit({1}())
 33 | '''
 34 | 
 35 | 
 36 | def setup():
 37 |     parser = argparse.ArgumentParser()
 38 |     parser.add_argument('--experiment-dir', '-dir', required=True,
 39 |         help='specify the directory for %s experiments' % MODULE_NAME)
 40 |     opt = parser.parse_args()
 41 | 
 42 |     ################################################################################
 43 |     # 1. making .yaml to settings
 44 |     print('[Step 1] dump {0}.yaml to {0}/settings ..........'.format(MODULE_NAME), flush=True)
 45 |     exp_dir = os.path.abspath(opt.experiment_dir)
 46 |     yaml_file = os.path.abspath('{0}/settings/{0}.yaml'.format(MODULE_NAME))
 47 | 
 48 |     while True:
 49 |         print('[Step 1] please confirm that from now on '
 50 |               'you want to save your experiments in:\n'
 51 |               '[Step 1] >>>>>>>>>>>>> [%s] <<<<<<<<<<<<< [Y/n]:' % exp_dir, end='', flush=True)
 52 |         response = input().strip()
 53 |         if response in {'Y', 'y'}: break
 54 |         elif response in {'N', 'n'}: sys.exit()
 55 |         else: continue
 56 | 
 57 |     if not os.path.isdir(exp_dir):
 58 |         if os.path.exists(exp_dir): raise FileExistsError('experiment_dir exists and is a file')
 59 |         os.makedirs(exp_dir)
 60 | 
 61 |     yaml_data = {
 62 |         'module_name': MODULE_NAME,
 63 |         'entry_module': ENTRY_MODULE,
 64 |         'entry_points': ENTRY_POINTS,
 65 |         'experiment_dir': exp_dir,
 66 |     }
 67 | 
 68 |     with open(yaml_file, 'w') as f:
 69 |         f.write(
 70 |             '# this yaml file is generated by dev_setup.py as a record and config\n'
 71 |             '# warning: change experiment_dir content will have effect on the rl-run\n'
 72 |             '# only modify it when you are intended to do so\n\n'
 73 |         )
 74 |         stream = yaml.dump(yaml_data, default_flow_style=False)
 75 |         f.write(stream.replace('  ', '    '))
 76 |     print('[Step 1] >>>>>>>>>>>>> [OK] >>>>>>>>>>>>>', flush=True)
 77 | 
 78 |     ################################################################################
 79 |     # 2. write .pth to site-packages
 80 |     print('[Step 2] write %s.pth to site-packages ..........' % MODULE_NAME, flush=True)
 81 |     site_path = getsitepackages()[0]
 82 |     pkg_path = os.getcwd()
 83 | 
 84 |     if not os.path.isdir(os.path.join(pkg_path, MODULE_NAME)):
 85 |         raise ModuleNotFoundError('%s module not found' % MODULE_NAME)
 86 |     if not os.path.isdir(site_path):
 87 |         raise NotADirectoryError('site-packages path [%s] not found' % site_path)
 88 | 
 89 |     site_file = os.path.join(site_path, '%s.pth' % MODULE_NAME)
 90 | 
 91 |     with open(site_file, 'w') as f:
 92 |         f.write(pkg_path)
 93 |     print('[Step 2] >>>>>>>>>>>>> [OK] >>>>>>>>>>>>>', flush=True)
 94 | 
 95 |     ################################################################################
 96 |     # 3. setup command line entry points
 97 |     print('[Step 3] setup command line entry points ..........', flush=True)
 98 |     interpreter = sys.executable
 99 |     shebang = '#!' + interpreter
100 |     bin_path = os.path.dirname(interpreter)
101 | 
102 |     if not os.path.isdir(bin_path):
103 |         raise NotADirectoryError('executable path not found')
104 | 
105 |     for entry, method in ENTRY_POINTS.items():
106 |         script_text = TEMPLATE.format(ENTRY_MODULE, method, shebang)
107 |         bin_file = os.path.join(bin_path, entry)
108 |         with open(bin_file, 'w') as f:
109 |             f.write(script_text)
110 |         os.chmod(bin_file, 0o775)
111 | 
112 |     print('[Step 3] >>>>>>>>>>>>> [OK] >>>>>>>>>>>>>', flush=True)
113 |     print('[Setup ] <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<', flush=True)
114 |     print('[Setup ] <<<<<<<<<< [ALL DONE] <<<<<<<<<<', flush=True)
115 | 
116 | 
117 | 
118 | if __name__ == '__main__':
119 |     setup()
120 | 
121 | 
122 | 
123 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | ### Examples of Different Reinforcement Learning Algorithms
 2 | 
 3 | #### [CURRENTLY UNDER PROGRESS]  
 4 | 
 5 | &nbsp;  
 6 | 
 7 | **Description:**
 8 | <pre> These folders contain RL training scripts implemented using pytorl 
 9 | package, showing how this toolbox can be used to tackle different problems. </pre>
10 | 
11 | &nbsp;  
12 | 
13 | **Implementation List:**
14 | > - [x] DQN
15 | > - [x] Double DQN (in deep_q_learning folder)
16 | > - [x] Prioritized DQN
17 | > - [x] Dueling DQN (in prioritized_dqn folder)
18 | > - [x] Gorila DQN (i.e. distributed DQN)
19 | > - [ ] A2C
20 | > - [ ] A3C
21 | > - [ ] TRPO
22 | > - [ ] PPO1
23 | > - [ ] PPO2
24 | > - [ ] ...
25 | > - [ ] ...  
26 | 
27 | &nbsp;  
28 | 
29 | **Note:**
30 | <pre> You may notice that there is a lot of code redundancy in those examples, 
31 | which seems to be inefficient and unnecessary. In reality, it is designed that 
32 | way since I believe sometimes redundancy is much better than over-modularized 
33 | implementations.</pre>


--------------------------------------------------------------------------------
/examples/deep_q_learning/README.md:
--------------------------------------------------------------------------------
 1 | ## Natural & Double Deep Q-learning
 2 | 
 3 | This DQN example contains following implementations:  
 4 | 
 5 | *atari_play.py:* This module is designed to solve atari tasks. This module implements double deep q-learning and can be switched to original deep q-learning via run_project/atari_config.yaml. By default, it uses the original q-network which takes the stacked 2-D graphic observations as the input and q-values as the output.  
 6 | 
 7 | *gym_workout.py:* This module is designed to solve openai gym classic control problems, which is a different learning environment than ale atari environment. This module implements original deep q-learning and can be switched to double deep q-learning via run_project/gym_config.yaml.  By default, it uses the original q-network which takes the stacked 1-D nparray observations as the input and q-values as the output.  
 8 | 
 9 | Check the config to see what you can change and how to switch between different versions of deep q-learning.  
10 | 
11 | #### 1. Human-Level Control Through Deep Reinforcement Learning
12 | Source: https://www.nature.com/articles/nature14236  
13 | 
14 | This is the original deep q-learning (Nature version). It is also the basic one. 
15 | 
16 | In this repo the only difference is that I "maxpooled" through all skipped frames rather than just using the last two frames (as in original publication and openai baselines repo) since I found the former works better. 
17 | 
18 | This DQN is also known as "natural DQN".  
19 | 
20 | #### 2. Deep Reinforcement Learning with Double Q-learning
21 | Source: https://arxiv.org/abs/1509.06461  
22 | 
23 | This is the improved version of deep q-learning trying to tackle the problem of agent overestimating action values in the previous natural DQN.
24 | 
25 | &nbsp;  
26 | 
27 | **Run Example:**
28 | 
29 | ```bash
30 | $ cd run_project/
31 | $ sh <script filename> [-h] <options>
32 | # for example, you can try sh ATARI.sh -N testrun --local
33 | # you can always use sh <script filename> [-h] for options help
34 | ```
35 | 
36 | &nbsp;  
37 | 
38 | **Test Atari Learning Result:**
39 | 
40 | ```bash
41 | $ cd run_project/
42 | $ sh TEST.sh -N <run name>
43 | # for example, you can try sh TEST.sh -N test_agent
44 | ```
45 | 
46 | &nbsp;  
47 | 
48 | **Result Demo:**  
49 | 1. Natural DQN  
50 | ![pong-v4_origsingle](../../.demo/pong-v4_origsingle.png)  
51 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Natural DQN)*  
52 | 
53 | 2. Double DQN  
54 | ![pong-v4_double](../../.demo/pong-v4_double.png)  
55 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Double DQN)*  
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/atari_play.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import time
  4 | import numpy as np
  5 | import torch
  6 | import torchvision.transforms as T
  7 | from pytorl.agents import DoubleDQN_Agent, DQN_Agent
  8 | from pytorl.envs import make_atari_env
  9 | from pytorl.networks import Q_Network
 10 | import pytorl.utils as utils
 11 | import pytorl.lib as lib
 12 | 
 13 | os.environ.setdefault('run_name', 'default')
 14 | 
 15 | 
 16 | def main():
 17 |     ################################################################
 18 |     # DEVICE
 19 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 20 |     print('current device: [%s]' % device, flush=True)
 21 | 
 22 |     ################################################################
 23 |     # CONFIG
 24 |     cfg_reader = utils.ConfigReader(default='run_project/atari_config.yaml')
 25 |     config = cfg_reader.get_config()
 26 |     seed, num_episodes = config.seed, config.solver.episodes
 27 |     double_dqn = config.solver.double_dqn
 28 | 
 29 |     ################################################################
 30 |     # RECORDER
 31 |     # tensorboard
 32 |     tensorboard = utils.tensorboard_writer(logdir='..')
 33 |     tensorboard.add_textfile('config', cfg_reader.config_path)
 34 | 
 35 |     ################################################################
 36 |     # ATARI ENVIRONMENT
 37 |     resize = T.Compose(
 38 |         [T.ToPILImage(),
 39 |         T.Grayscale(1),
 40 |         T.Resize((84, 84), interpolation=3),
 41 |         T.ToTensor()]
 42 |     )
 43 |     frames_stack = config.solver.frames_stack
 44 |     env = make_atari_env(
 45 |         config.solver.env, 
 46 |         resize,
 47 |         render=config.record.render
 48 |     )
 49 | 
 50 |     env.set_episodic_init('FIRE')
 51 |     env.set_frames_stack(frames_stack)
 52 |     env.set_single_life(True)
 53 |     env.set_frames_action(config.solver.frames_action)
 54 |     num_actions = env.num_actions()
 55 | 
 56 |     ################################################################
 57 |     # UTILITIES
 58 |     replay = lib.LazyReplay(
 59 |         capacity=config.replay.capacity,
 60 |         batch_size=config.replay.batch_size,
 61 |         init_size=config.replay.init_size,
 62 |         frames_stack=env.frames_stack()
 63 |     )
 64 | 
 65 |     get_thres = lib.eps_greedy_func(
 66 |         eps_start=config.greedy.start,
 67 |         eps_end=config.greedy.end,
 68 |         num_decays=config.greedy.frames,
 69 |         global_frames_func=env.global_frames
 70 |     )
 71 | 
 72 |     ################################################################
 73 |     # AGENT
 74 |     q_net = Q_Network(input_size=(frames_stack, 84, 84),
 75 |                       num_actions=num_actions).to(device)
 76 | 
 77 |     target_net = Q_Network(input_size=(frames_stack, 84, 84),
 78 |                            num_actions=num_actions).to(device)
 79 | 
 80 |     loss_func = cfg_reader.get_loss_func(config.solver.loss)
 81 |     optimizer_func = cfg_reader.get_optimizer_func(config.solver.optimizer)
 82 |     dqn_agent_func = DoubleDQN_Agent if double_dqn else DQN_Agent
 83 |     
 84 |     agent = dqn_agent_func(
 85 |         device=device,
 86 |         q_net=q_net,
 87 |         target_net=target_net,
 88 |         loss_func=loss_func,
 89 |         optimizer_func=optimizer_func,
 90 |         replay=replay
 91 |      )
 92 |     
 93 |     agent.reset()
 94 |     agent.set_exploration(get_sample=env.sample, get_thres=get_thres)
 95 |     agent.set_tensorboard(tensorboard)
 96 |     agent.set_optimize_scheme(
 97 |         lr=config.solver.lr,
 98 |         gamma=config.solver.gamma,
 99 |         optimize_freq=config.solver.optimize_freq,
100 |         update_target_freq=config.solver.update_target_freq
101 |     )
102 | 
103 |     ################################################################
104 |     # SEEDING
105 |     random.seed(seed)
106 |     np.random.seed(seed)
107 |     torch.cuda.manual_seed(seed)
108 |     torch.manual_seed(seed)
109 |     env.seed(seed)
110 | 
111 |     ################################################################
112 |     # PRETRAIN
113 |     # setting up initial random observations and replays during this session
114 |     print('now about to setup randomized [%s] required initial experience replay...' %
115 |               agent.replay.init_size, flush=True)
116 |     while True:
117 |         env.reset()
118 |         curr_state, done = env.state().clone(), False
119 |         while len(agent.replay) < agent.replay.init_size and not done:
120 |             action = env.sample()
121 |             next_observ, reward, done, _ = env.step(action)
122 |             next_state = env.state().clone()
123 |             agent.replay.push(curr_state, action, next_state, reward)
124 |             curr_state = next_state
125 | 
126 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
127 |               'initializing experience replay progressing [%s/%s]' % (
128 |               len(agent.replay), agent.replay.init_size), flush=True)
129 |         if not done: break
130 |         # save final action into reply buffer
131 |         agent.replay.push(curr_state, action, None, reward)
132 | 
133 |     print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
134 |           'experience replay initialization completed [%s/%s]' % (
135 |           len(agent.replay), agent.replay.init_size), flush=True)
136 | 
137 |     env.refresh()
138 | 
139 |     ################################################################
140 |     # TRAINING
141 |     for _ in range(num_episodes):
142 |         env.reset()
143 |         # get initial state
144 |         curr_state, done = env.state().clone(), False
145 |         while True:
146 |             action = agent.next_action(env.state)
147 |             if not done:
148 |                 next_observ, reward, done, _ = env.step(action)
149 |                 next_state = env.state().clone()
150 |             else:
151 |                 next_state = None
152 |             agent.replay.push(curr_state, action, next_state, reward)
153 |             curr_state = next_state
154 |             agent.optimize()
155 |             if done: break
156 | 
157 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
158 |               'episode [%s/%s], ep-reward [%s], threshold [%.2f], timesteps [%s], frames [%s]' %
159 |               (env.global_episodes(), num_episodes, env.episodic_reward(), get_thres(),
160 |                agent.optimize_counter(), env.global_frames()), flush=True)
161 |         # recording via tensorboard
162 |         tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
163 |         tensorboard.add_scalar('episode/thres', get_thres(), env.global_episodes())
164 | 
165 |         if env.global_episodes() % config.record.save_freq == 0:
166 |             agent.save_pth(agent.q_net, config.record.save_path,
167 |                            filename='q_net.pth', obj_name='q_network')
168 |             agent.save_pth(agent.target_net, config.record.save_path,
169 |                            filename='target_net.pth', obj_name='target_network')
170 | 
171 | 
172 | if __name__ == '__main__':
173 |     main()
174 | 
175 | 
176 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/gym_workout.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import time
  4 | import numpy as np
  5 | import torch
  6 | import torchvision.transforms as T
  7 | from pytorl.agents import DoubleDQN_Agent, DQN_Agent
  8 | from pytorl.envs import make_ctrl_env
  9 | from pytorl.networks import Q_MLP
 10 | import pytorl.utils as utils
 11 | import pytorl.lib as lib
 12 | 
 13 | os.environ.setdefault('run_name', 'default')
 14 | 
 15 | 
 16 | def main():
 17 |     ################################################################
 18 |     # DEVICE
 19 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 20 |     print('current device: [%s]' % device, flush=True)
 21 | 
 22 |     ################################################################
 23 |     # CONFIG
 24 |     cfg_reader = utils.ConfigReader(default='run_project/gym_config.yaml')
 25 |     config = cfg_reader.get_config()
 26 |     seed, num_episodes = config.seed, config.solver.episodes
 27 |     double_dqn = config.solver.double_dqn
 28 | 
 29 |     ################################################################
 30 |     # RECORDER
 31 |     # tensorboard
 32 |     tensorboard = utils.tensorboard_writer(logdir='..')
 33 |     tensorboard.add_textfile('config', cfg_reader.config_path)
 34 | 
 35 |     ################################################################
 36 |     # CLASSIC CONTROL ENVIRONMENT
 37 |     frames_stack = config.solver.frames_stack
 38 |     env = make_ctrl_env(config.solver.env, render=config.record.render)
 39 |     # seeding
 40 |     env.seed(seed)
 41 |     env.set_frames_stack(frames_stack)
 42 |     env.set_frames_action(config.solver.frames_action)
 43 |     num_actions = env.num_actions()
 44 |     # try decap the environment limit
 45 | #     try:
 46 | #         env._max_episode_steps = 10000
 47 | #     except: pass
 48 | 
 49 |     ################################################################
 50 |     # UTILITIES
 51 |     replay = lib.VanillaReplay(
 52 |         capacity=config.replay.capacity,
 53 |         batch_size=config.replay.batch_size,
 54 |         init_size=config.replay.init_size
 55 |     )
 56 | 
 57 |     get_thres = lib.eps_greedy_func(
 58 |         eps_start=config.greedy.start,
 59 |         eps_end=config.greedy.end,
 60 |         num_decays=config.greedy.frames,
 61 |         global_frames_func=env.global_frames
 62 |     )
 63 | 
 64 |     ################################################################
 65 |     # AGENT
 66 |     q_net = Q_MLP(input_size=(frames_stack, env.observ_shape()),
 67 |                       num_actions=num_actions).to(device)
 68 | 
 69 |     target_net = Q_MLP(input_size=(frames_stack, env.observ_shape()),
 70 |                       num_actions=num_actions).to(device)
 71 | 
 72 |     loss_func = cfg_reader.get_loss_func(config.solver.loss)
 73 |     optimizer_func = cfg_reader.get_optimizer_func(config.solver.optimizer)
 74 |     dqn_agent_func = DoubleDQN_Agent if double_dqn else DQN_Agent
 75 | 
 76 |     agent = dqn_agent_func(
 77 |         device=device,
 78 |         q_net=q_net,
 79 |         target_net=target_net,
 80 |         loss_func=loss_func,
 81 |         optimizer_func=optimizer_func,
 82 |         replay=replay
 83 |     )
 84 |     
 85 |     agent.reset()
 86 |     agent.set_exploration(get_sample=env.sample, get_thres=get_thres)
 87 |     agent.set_tensorboard(tensorboard)
 88 |     agent.set_optimize_scheme(
 89 |         lr=config.solver.lr,
 90 |         gamma=config.solver.gamma,
 91 |         optimize_freq=config.solver.optimize_freq,
 92 |         update_target_freq=config.solver.update_target_freq
 93 |     )
 94 | 
 95 |     ################################################################
 96 |     # SEEDING
 97 |     random.seed(seed)
 98 |     np.random.seed(seed)
 99 |     torch.cuda.manual_seed(seed)
100 |     torch.manual_seed(seed)
101 |     env.seed(seed)
102 | 
103 |     ################################################################
104 |     # PRETRAIN
105 |     # setting up initial random observations and replays during this session
106 |     print('now about to setup randomized [%s] required initial experience replay...' %
107 |               agent.replay.init_size, flush=True)
108 |     while True:
109 |         env.reset()
110 |         curr_state, done = env.state().clone(), False
111 |         while len(agent.replay) < agent.replay.init_size and not done:
112 |             action = env.sample()
113 |             next_observ, reward, done, _ = env.step(action)
114 |             next_state = env.state().clone()
115 |             agent.replay.push(curr_state, action, next_state, reward)
116 |             curr_state = next_state
117 | 
118 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
119 |               'initializing experience replay progressing [%s/%s]' % (
120 |               len(agent.replay), agent.replay.init_size), flush=True)
121 |         if not done: break
122 |         # save final action into reply buffer
123 |         agent.replay.push(curr_state, action, None, reward)
124 | 
125 |     print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
126 |           'experience replay initialization completed [%s/%s]' % (
127 |           len(agent.replay), agent.replay.init_size), flush=True)
128 | 
129 |     env.refresh()
130 | 
131 |     ################################################################
132 |     # TRAINING
133 |     for _ in range(num_episodes):
134 |         env.reset()
135 |         # get initial state
136 |         curr_state, done = env.state().clone(), False
137 |         while True:
138 |             action = agent.next_action(env.state)
139 |             if not done:
140 |                 next_observ, reward, done, _ = env.step(action)
141 |                 next_state = env.state().clone()
142 |             else:
143 |                 next_state = None
144 |             agent.replay.push(curr_state, action, next_state, reward)
145 |             curr_state = next_state
146 |             agent.optimize()
147 |             if done: break
148 | 
149 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
150 |               'episode [%s/%s], ep-reward [%s], threshold [%.2f], timesteps [%s], frames [%s]' %
151 |               (env.global_episodes(), num_episodes, env.episodic_reward(), get_thres(),
152 |                agent.optimize_counter(), env.global_frames()), flush=True)
153 |         # recording via tensorboard
154 |         tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
155 |         tensorboard.add_scalar('episode/thres', get_thres(), env.global_episodes())
156 | 
157 |         if env.global_episodes() % config.record.save_freq == 0:
158 |             agent.save_pth(agent.q_net, config.record.save_path,
159 |                            filename='q_net.pth', obj_name='q_network')
160 |             agent.save_pth(agent.target_net, config.record.save_path,
161 |                            filename='target_net.pth', obj_name='target_network')
162 | 
163 | 
164 | if __name__ == '__main__':
165 |     main()
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/run_project/ATARI.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/bash
  2 | # Launch atari_play.py through rl-run, wrapped in srun (default) or lrun (-L/--local).
  3 | OPTSPEC=":LNhp-:"
  4 | alg='ATARI DEEP Q-LEARNING' # name of algorithm
  5 | py_filename='atari_play.py'
  6 | local=false
  7 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
  8 | rn_suffix='default'
  9 | 
 10 | echo
 11 | # long options arrive as the '-' pseudo-option; separate values are read via ${!OPTIND}
 12 | while getopts "$OPTSPEC" optchar; do
 13 |     case "${optchar}" in
 14 |         -)
 15 |             case "${OPTARG}" in
 16 |                 partition)
 17 |                     partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 18 |                     ;;
 19 |                 partition=*)
 20 |                     partition=${OPTARG#*=}
 21 |                     ;;
 22 |                 prefix)
 23 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 24 |                     ;;
 25 |                 prefix=*)
 26 |                     rn_prefix=${OPTARG#*=}
 27 |                     ;;
 28 |                 name)
 29 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 30 |                     ;;
 31 |                 name=*)
 32 |                     rn_suffix=${OPTARG#*=}
 33 |                     ;;
 34 |                 local)
 35 |                     local=true
 36 |                     echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 37 |                     ;;
 38 |                 *)
 39 |                     if [ "$OPTERR" = 1 ]; then
 40 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '--${OPTARG}' SPECIFIED \e[0m" >&2
 41 |                         exit 2
 42 |                     fi
 43 |                     ;;
 44 |             esac;;
 45 |         L)
 46 |             local=true
 47 |             echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 48 |             ;;
 49 |         N)
 50 |             rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 51 |             ;;
 52 |         h)
 53 |             echo "USAGE: $0 <OPTIONS...>" >&2
 54 |             echo "          [-h][-L, --local    using lrun instead of srun]" >&2
 55 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
 56 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
 57 |             echo "          [-h][-p, --partition=<srun partition>]" >&2
 58 |             echo
 59 |             exit 2
 60 |             ;;
 61 |         # lower case p
 62 |         p)
 63 |             partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 64 |             ;;
 65 |         *)
 66 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
 67 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
 68 |                 exit 2
 69 |             fi
 70 |             ;;
 71 |     esac
 72 | done
 73 | # the python entry point sits one directory above run_project/; stop if we cannot get there
 74 | cd .. || exit 1
 75 | 
 76 | local_cmd="lrun -n1"
 77 | run_cmd="srun -J ${rn_suffix} -p ${partition} --gres gpu:1"
 78 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
 79 | 
 80 | if [ "${local}" = true ]; then
 81 |     run_cmd="${local_cmd}"
 82 | else
 83 |     if [ -z "${partition}" ]; then
 84 |         echo -e "\e[41m [ERROR] PARTITION(--partition or -p) NOT SPECIFIED \e[0m"  >&2
 85 |         exit 2
 86 |     fi
 87 | fi
 88 | 
 89 | echo '________________________________________________________________________________'
 90 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
 91 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
 92 | echo           "${run_cmd}"
 93 | echo           "${py_cmd}"
 94 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
 95 | 
 96 | rl-run -rn "${rn_prefix}/${rn_suffix}" -c "${run_cmd} ${py_cmd}"
 97 | 
 98 | echo '________________________________________________________________________________'
 99 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
100 | echo          "CMD_LINE: sh $0 $*"
101 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
102 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/run_project/GYM.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/bash
  2 | # Launch gym_workout.py through rl-run, wrapped in srun (default) or lrun (-L/--local).
  3 | OPTSPEC=":LNhp-:"
  4 | alg='CLASSIC CONTROL DEEP Q-LEARNING' # name of algorithm
  5 | py_filename='gym_workout.py'
  6 | local=false
  7 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
  8 | rn_suffix='default'
  9 | 
 10 | echo
 11 | # long options arrive as the '-' pseudo-option; separate values are read via ${!OPTIND}
 12 | while getopts "$OPTSPEC" optchar; do
 13 |     case "${optchar}" in
 14 |         -)
 15 |             case "${OPTARG}" in
 16 |                 partition)
 17 |                     partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 18 |                     ;;
 19 |                 partition=*)
 20 |                     partition=${OPTARG#*=}
 21 |                     ;;
 22 |                 prefix)
 23 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 24 |                     ;;
 25 |                 prefix=*)
 26 |                     rn_prefix=${OPTARG#*=}
 27 |                     ;;
 28 |                 name)
 29 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 30 |                     ;;
 31 |                 name=*)
 32 |                     rn_suffix=${OPTARG#*=}
 33 |                     ;;
 34 |                 local)
 35 |                     local=true
 36 |                     echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 37 |                     ;;
 38 |                 *)
 39 |                     if [ "$OPTERR" = 1 ]; then
 40 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '--${OPTARG}' SPECIFIED \e[0m" >&2
 41 |                         exit 2
 42 |                     fi
 43 |                     ;;
 44 |             esac;;
 45 |         L)
 46 |             local=true
 47 |             echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 48 |             ;;
 49 |         N)
 50 |             rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 51 |             ;;
 52 |         h)
 53 |             echo "USAGE: $0 <OPTIONS...>" >&2
 54 |             echo "          [-h][-L, --local    using lrun instead of srun]" >&2
 55 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
 56 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
 57 |             echo "          [-h][-p, --partition=<srun partition>]" >&2
 58 |             echo
 59 |             exit 2
 60 |             ;;
 61 |         # lower case p
 62 |         p)
 63 |             partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 64 |             ;;
 65 |         *)
 66 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
 67 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
 68 |                 exit 2
 69 |             fi
 70 |             ;;
 71 |     esac
 72 | done
 73 | # the python entry point sits one directory above run_project/; stop if we cannot get there
 74 | cd .. || exit 1
 75 | 
 76 | local_cmd="lrun -n1"
 77 | run_cmd="srun -J ${rn_suffix} -p ${partition} --gres gpu:1"
 78 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
 79 | 
 80 | if [ "${local}" = true ]; then
 81 |     run_cmd="${local_cmd}"
 82 | else
 83 |     if [ -z "${partition}" ]; then
 84 |         echo -e "\e[41m [ERROR] PARTITION(--partition or -p) NOT SPECIFIED \e[0m"  >&2
 85 |         exit 2
 86 |     fi
 87 | fi
 88 | 
 89 | echo '________________________________________________________________________________'
 90 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
 91 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
 92 | echo           "${run_cmd}"
 93 | echo           "${py_cmd}"
 94 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
 95 | 
 96 | rl-run -rn "${rn_prefix}/${rn_suffix}" -c "${run_cmd} ${py_cmd}"
 97 | 
 98 | echo '________________________________________________________________________________'
 99 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
100 | echo          "CMD_LINE: sh $0 $*"
101 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
102 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/run_project/TEST.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash
 2 | # Launch test.py through rl-run under lrun (local only) to replay a trained agent.
 3 | OPTSPEC=":Nh-:"
 4 | alg='TEST LEARNING OUTCOME'
 5 | py_filename='test.py'
 6 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
 7 | rn_suffix='default'
 8 | 
 9 | echo
10 | # long options arrive as the '-' pseudo-option; separate values are read via ${!OPTIND}
11 | while getopts "$OPTSPEC" optchar; do
12 |     case "${optchar}" in
13 |         -)
14 |             case "${OPTARG}" in
15 |                 prefix)
16 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
17 |                     ;;
18 |                 prefix=*)
19 |                     rn_prefix=${OPTARG#*=}
20 |                     ;;
21 |                 name)
22 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
23 |                     ;;
24 |                 name=*)
25 |                     rn_suffix=${OPTARG#*=}
26 |                     ;;
27 |                 *)
28 |                     if [ "$OPTERR" = 1 ]; then
29 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '--${OPTARG}' SPECIFIED \e[0m" >&2
30 |                         exit 2
31 |                     fi
32 |                     ;;
33 |             esac;;
34 |         N)
35 |             rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
36 |             ;;
37 |         h)
38 |             echo "USAGE: $0 <OPTIONS...>" >&2
39 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
40 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
41 |             echo
42 |             exit 2
43 |             ;;
44 |         *)
45 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
46 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
47 |                 exit 2
48 |             fi
49 |             ;;
50 |     esac
51 | done
52 | # the python entry point sits one directory above run_project/; stop if we cannot get there
53 | cd .. || exit 1
54 | 
55 | run_cmd="lrun -n1" # note that test only runs under local env
56 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
57 | 
58 | echo '________________________________________________________________________________'
59 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
60 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
61 | echo           "${run_cmd}"
62 | echo           "${py_cmd}"
63 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
64 | 
65 | rl-run -rn "${rn_prefix}/${rn_suffix}" -c "${run_cmd} ${py_cmd}"
66 | 
67 | echo '________________________________________________________________________________'
68 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
69 | echo          "CMD_LINE: sh $0 $*"
70 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
71 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/run_project/atari_config.yaml:
--------------------------------------------------------------------------------
 1 | seed: 123
 2 | solver:
 3 |     env: PongNoFrameskip-v4
 4 |     double_dqn: True
 5 |     lr: 0.0001
 6 |     gamma: 0.99
 7 |     episodes: 4000
 8 |     # this specifies how many frames are stacked for one input
 9 |     frames_stack: 4
10 |     # this specifies how many frames an action lasts
11 |     frames_action: 4
12 |     # note: optimize_freq w.r.t timesteps
13 |     optimize_freq: 1
14 |     # note: update_target_freq w.r.t timesteps
15 |     update_target_freq: 1000
16 |     loss: smooth_l1_loss
17 |     optimizer: Adam
18 | greedy:
19 |     start: 1
20 |     end: 0.02
21 |     frames: 100000
22 | replay:
23 |     # note: capacity w.r.t states
24 |     capacity: 100000
25 |     init_size: default
26 |     batch_size: 32
27 | record:
28 |     save_freq: 100
29 |     save_path: ../checkpoint
30 |     load_path: /Users/zhe/Desktop
31 |     render: False
32 |     
33 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/run_project/gym_config.yaml:
--------------------------------------------------------------------------------
 1 | seed: 123
 2 | solver:
 3 |     env: CartPole-v1
 4 |     double_dqn: False
 5 |     lr: 0.00025
 6 |     gamma: 0.999
 7 |     episodes: 1000000
 8 |     # this specifies how many frames are stacked for one input
 9 |     frames_stack: 4
10 |     # this specifies how many frames an action lasts
11 |     frames_action: 1
12 |     # note: optimize_freq w.r.t timesteps
13 |     optimize_freq: 1
14 |     # note: update_target_freq w.r.t timesteps
15 |     update_target_freq: 2000
16 |     loss: smooth_l1_loss
17 |     optimizer: Adam
18 | greedy:
19 |     start: 0.9
20 |     end: 0.05
21 |     frames: 4000
22 | replay:
23 |     # note: capacity w.r.t states
24 |     capacity: 100000
25 |     init_size: 10000
26 |     batch_size: 128
27 | record:
28 |     save_freq: 10000
29 |     save_path: ../checkpoint
30 |     load_path: ../checkpoint
31 |     render: False
32 |     
33 | 
34 | 


--------------------------------------------------------------------------------
/examples/deep_q_learning/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import time
 4 | import numpy as np
 5 | import torch
 6 | import torchvision.transforms as T
 7 | from pytorl.agents import DQN_Agent
 8 | from pytorl.envs import make_atari_env
 9 | from pytorl.networks import Q_Network, init_network
10 | import pytorl.utils as utils
11 | import pytorl.lib as lib
12 | 
13 | os.environ.setdefault('run_name', 'default')
14 | 
15 | 
16 | def main():
17 |     ################################################################
18 |     # DEVICE
19 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20 |     print('current device: [%s]' % device, flush=True)
21 | 
22 |     ################################################################
23 |     # CONFIG
24 |     cfg_reader = utils.ConfigReader(default='run_project/atari_config.yaml')
25 |     config = cfg_reader.get_config()
26 |     seed, num_episodes = config.seed, config.solver.episodes
27 | 
28 |     ################################################################
29 |     # RECORDER
30 |     # tensorboard
31 |     tensorboard = utils.tensorboard_writer(logdir='..')
32 |     tensorboard.add_textfile('config', cfg_reader.config_path)
33 | 
34 |     ################################################################
35 |     # ATARI ENVIRONMENT
36 |     resize = T.Compose([T.ToPILImage(),
37 |                     T.Grayscale(1),
38 |                     T.Resize((84, 84), interpolation=3),
39 |                     T.ToTensor()])
40 |     frames_stack = config.solver.frames_stack
41 |     env = make_atari_env(config.solver.env, resize,
42 |                         render=config.record.render)
43 | 
44 |     env.set_episodic_init('FIRE')
45 |     env.set_frames_stack(frames_stack)
46 |     env.set_single_life(True)
47 |     env.set_frames_action(4)
48 |     num_actions = env.num_actions()
49 | 
50 |     ################################################################
51 |     # AGENT
52 |     q_net = Q_Network(input_size=(frames_stack, 84, 84),
53 |                       num_actions=num_actions)
54 |     init_network(q_net, config.record.load_path, 'q_net.pth', obj_name='q_network')
55 |     q_net = q_net.to(device)
56 |     agent = DQN_Agent(device=device, q_net=q_net)
57 |     agent.set_exploration(env.sample, lib.eps_greedy_func(config.greedy.end, config.greedy.end))
58 | 
59 |     ################################################################
60 |     # SEEDING
61 |     random.seed(seed)
62 |     np.random.seed(seed)
63 |     torch.cuda.manual_seed(seed)
64 |     torch.manual_seed(seed)
65 |     env.seed(seed)
66 | 
67 |     ################################################################
68 |     # TESTING
69 |     for _ in range(num_episodes):
70 |         env.reset()
71 |         # get initial state
72 |         done = False
73 |         while True:
74 |             action = agent.next_action(env.state)
75 |             next_observ, reward, done, _ = env.step(action)
76 |             time.sleep(0.02)
77 |             if done: break
78 | 
79 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
80 |               'episode [%s/%s], ep-reward [%s], frames [%s]' %
81 |               (env.global_episodes(), num_episodes, env.episodic_reward(),
82 |                env.global_frames()), flush=True)
83 |         # recording via tensorboard
84 |         tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     main()
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/README.md:
--------------------------------------------------------------------------------
 1 | ## Gorila DQN
 2 | 
 3 | This example project implements Gorila DQN, a distributed training strategy via parameter server architecture.  
 4 | 
 5 | Since this is a distributed program, it is sensitive to its environment and is not guaranteed to work on  
 6 | yours. I developed it under slurm and have set up a local run mode that generates dummy slurm  
 7 | environment variables so it can run without slurm. This should work, but it may still fail.  
 8 | 
 9 | Also, this distributed module has only been tested with the 'gloo' backend. It may fail with other backends.
10 | 
11 | This example's bash launch script and config are very different from the previous ones. You should check the  
12 | script's help (`-h`) before using it. Also check the config to see what you can change.
13 | 
14 | #### 1. Massively Parallel Methods for Deep Reinforcement Learning
15 | Source: https://arxiv.org/abs/1507.04296  
16 | 
17 | 
18 | #### (To do) Asynchronous Stochastic Gradient Descent with Delay Compensation
19 | Source: https://arxiv.org/abs/1609.08326
20 | 
21 | &nbsp;  
22 | 
23 | **Run Example:**  
24 | 
25 | - to be familiar with the launch script  
26 | 
27 | > ```bash
28 | > $ cd run_project/
29 | > $ sh [-h] <script filename> <options>
30 | > # you can always use sh <script filename> [-h] for options help
31 | > # USAGE: ATARI.sh <OPTIONS...>
32 | > #           [-h][-G, --gpu[=]<corresp. to srun gres gpu:>]
33 | > #           [-h][-L, --local    using lrun instead of srun]
34 | > #           [-h][-c, --cpus-per-task[=]<num cpus per task>]
35 | > #           [-h][-n, --ntasks[=]<total num tasks>]
36 | > #           [-h][    --ntasks-per-node[=]<num tasks per node>]
37 | > #           [-h][    --prefix[=]<run name prefix>]
38 | > #           [-h][-N, --name[=]<run name suffix>]
39 | > #           [-h][-p, --partition=<srun partition>]
40 | > ```
41 | 
42 | - if you are using slurm workload manager, you can try:  
43 | 
44 | > ```bash
45 | > # you can try this for a 8gpu, single node job setting
46 | > $ sh ATARI.sh -N rungpu8 -n 8 --gpu=8 -p <partition name>
47 | > # or if you wanna run it on multiple nodes (e.g. 16gpus on 2 nodes), try:
48 | > $ sh ATARI.sh -N rungpu16 -p <partition name> -n 16 --gpu=8 --ntasks-per-node=8
49 | > ```
50 | 
51 | - if you want to launch Gorila DQN without slurm, you may try:  
52 | 
53 | > ```bash
54 | > # may not work, but you can try things like
55 | > $ sh ATARI.sh -N localrun -n 8 --local
56 | > ```
57 | 
58 | &nbsp;  
59 | 
60 | **Test Learning Result:**
61 | 
62 | ```bash
63 | $ cd run_project/
64 | $ sh TEST.sh -N <run name>
65 | # for example, you can try sh TEST.sh -N test_agent
66 | ```
67 | 
68 | &nbsp;  
69 | 
70 | **Result Demo:**  
71 | 1. Gorila DQN (without smoothing)  
72 | ![pong-v4_dist4c_nonsmooth](../../.demo/pong-v4_dist4c_nonsmooth.png)  
73 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Gorila DQN with 1 server, 3 clients)*  
74 | 
75 | 2. Gorila DQN (with smoothing)  
76 | ![pong-v4_dist4c_smooth](../../.demo/pong-v4_dist4c_smooth.png)  
77 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Gorila DQN with 1 server, 3 clients)*  
78 | 
79 | 
80 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/client.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import time
  4 | import numpy as np
  5 | import torch
  6 | import torch.distributed as dist
  7 | from torch.nn.utils import vector_to_parameters
  8 | import torchvision.transforms as T
  9 | from pytorl.agents import GorilaDQN_ClientAgent
 10 | import pytorl.distributed as rl_dist
 11 | from pytorl.envs import make_atari_env
 12 | import pytorl.lib as lib
 13 | from pytorl.networks import Dueling_DQN, Q_Network
 14 | import pytorl.utils as utils
 15 | 
 16 | 
 17 | def param_client_proc(master_rank, worker_group):
 18 |     ################################################################
 19 |     # CONFIG
 20 |     rank, world_size = dist.get_rank(), dist.get_world_size()
 21 |     master_rank = rl_dist.get_master_rank()
 22 |     
 23 |     cfg_reader = utils.ConfigReader(default='run_project/config.yaml')
 24 |     config = cfg_reader.get_config()
 25 |     seed, num_episodes = config.seed, config.client.episodes
 26 |     update_target_freq = config.client.update_target_freq
 27 |     gradients_push_freq = config.client.gradients_push_freq
 28 |     delay_factor = config.client.delay_factor
 29 |     record_rank = config.record.record_rank
 30 |     specified_device = config.client.device
 31 |     assert record_rank != master_rank and record_rank <= world_size - 1
 32 |     
 33 |     ################################################################
 34 |     # DEVICE
 35 |     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 36 |     if '%s' % device == specified_device: 
 37 |         msg = 'using specified'
 38 |     elif specified_device == 'cuda': 
 39 |         msg = 'cuda not fuound, current'
 40 |     else: 
 41 |         msg = 'found cuda, but using specified'
 42 |         device = torch.device(specified_device)
 43 |         
 44 |     print('[rank %s] %s device: [%s]' % (rank, msg, device), flush=True)
 45 |     
 46 |     ################################################################
 47 |     # RECORDER
 48 |     # tensorboard
 49 |     if rank == record_rank:
 50 |         print('[rank %s] tensorboard started at specified record rank [%s]' % (rank, rank), flush=True)
 51 |         tensorboard = utils.tensorboard_writer(logdir='..')
 52 |         tensorboard.add_textfile('config', cfg_reader.config_path)
 53 |     else:
 54 |         tensorboard = None
 55 | 
 56 |     ################################################################
 57 |     # ATARI ENVIRONMENT
 58 |     resize = T.Compose(
 59 |         [T.ToPILImage(),
 60 |         T.Grayscale(1),
 61 |         T.Resize((84, 84), interpolation=3),
 62 |         T.ToTensor()]
 63 |     )
 64 |     frames_stack = config.solver.frames_stack
 65 |     env = make_atari_env(
 66 |         config.solver.env, 
 67 |         resize,
 68 |         render=config.record.render
 69 |     )
 70 | 
 71 |     env.set_episodic_init('FIRE')
 72 |     env.set_frames_stack(frames_stack)
 73 |     env.set_single_life(True)
 74 |     env.set_frames_action(config.client.frames_action)
 75 |     num_actions = env.num_actions()
 76 |     
 77 |     ################################################################
 78 |     # UTILITIES
 79 |     get_beta = lib.beta_priority_func(
 80 |         beta_start=config.replay.beta.start,
 81 |         beta_end=config.replay.beta.end,
 82 |         num_incres=config.replay.beta.frames,
 83 |         global_frames_func=env.global_frames
 84 |     )
 85 | 
 86 |     get_thres = lib.eps_greedy_func(
 87 |         eps_start=config.greedy.start,
 88 |         eps_end=config.greedy.end,
 89 |         num_decays=config.greedy.frames,
 90 |         global_frames_func=env.global_frames
 91 |     )
 92 |     
 93 |     ################################################################
 94 |     # SEEDING
 95 |     random.seed(seed + rank)
 96 |     np.random.seed(seed + rank)
 97 |     torch.cuda.manual_seed(seed)
 98 |     torch.manual_seed(seed)
 99 |     env.seed(seed + rank)
100 | 
101 |     ################################################################
102 |     # AGENT
103 |     if config.solver.dueling:
104 |         network = Dueling_DQN
105 |     else:
106 |         network = Q_Network
107 |     
108 |     q_net = network(input_size=(frames_stack, 84, 84),
109 |                       num_actions=num_actions).to(device)
110 | 
111 |     target_net = network(input_size=(frames_stack, 84, 84),
112 |                            num_actions=num_actions).to(device)
113 | 
114 |     loss_func = cfg_reader.get_loss_func(config.client.loss)
115 | 
116 |     agent = GorilaDQN_ClientAgent(
117 |         device = device,
118 |         q_net = target_net,
119 |         target_net = q_net,
120 |         loss_func = loss_func,
121 |      )
122 |     
123 |     agent.set_prioritized_replay(
124 |         capacity=config.replay.capacity, 
125 |         batch_size=config.replay.batch_size, 
126 |         init_size=config.replay.init_size, 
127 |         alpha=config.replay.alpha,
128 |         beta_func=get_beta, 
129 |     )
130 |     agent.set_exploration(get_sample=env.sample, get_thres=get_thres)
131 |     agent.set_gradient_scheme(
132 |         gamma=config.client.gamma,
133 |         gradient_freq=1
134 |     )
135 |     agent.set_tensorboard(tensorboard)
136 |     
137 |     ################################################################
138 |     # CLIENT
139 |     client = rl_dist.ParamClient(device)
140 |     client.set_recv(2)
141 |     client.set_info(agent.shard_len, agent.gradient_counter)
142 |     client.set_param_update(agent.q_net)
143 |     # q network initialization
144 |     overhead, params = client.recv_param()
145 |     vector_to_parameters(params, agent.q_net.parameters())
146 |     glb_updates, server = overhead
147 |     agent.update_target()
148 | 
149 |     ################################################################
150 |     # PRETRAIN
151 |     # setting up initial random observations and replays during this session
152 |     print('[rank %s] now about to setup randomized [%s] required initial experience replay...' % (
153 |               rank, agent.replay.init_size), flush=True)
154 |     while True:
155 |         env.reset()
156 |         curr_state, done = env.state().clone(), False
157 |         while len(agent.replay) < agent.replay.init_size and not done:
158 |             action = env.sample()
159 |             next_observ, reward, done, _ = env.step(action)
160 |             next_state = env.state().clone()
161 |             agent.replay.push(curr_state, action, next_state, reward)
162 |             curr_state = next_state
163 | 
164 |         if not done: break
165 |         # save final action into reply buffer
166 |         agent.replay.push(curr_state, action, None, reward)
167 | 
168 |     print('[rank %s]' % rank, time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
169 |           'prioritized experience replay initialization completed [%s/%s]' % (
170 |           len(agent.replay), agent.replay.init_size), flush=True)
171 | 
172 |     env.refresh()
173 |     # make a barrier to prevent global timeout problem
174 |     dist.barrier(worker_group)
175 |     
176 |     ################################################################
177 |     # TRAINING
178 |     last_target_update, warned = 0, False
179 |     
180 |     for _ in range(num_episodes):
181 |         env.reset()
182 |         # get initial state
183 |         agent.zero_grad_()
184 |         curr_state, done = env.state().clone(), False
185 |         overhead, params = client.recv_param()
186 |         vector_to_parameters(params, agent.q_net.parameters())
187 |         glb_updates, server = overhead
188 |         while True:
189 |             action = agent.next_action(env.state)
190 |             overhead, params = client.recv_param()     
191 |             vector_to_parameters(params, agent.q_net.parameters())
192 |             glb_updates, server = overhead
193 |             if not done:
194 |                 next_observ, reward, done, _ = env.step(action)
195 |                 next_state = env.state().clone()
196 |             else:
197 |                 next_state = None
198 |             agent.replay.push(curr_state, action, next_state, reward)
199 |             # gradient time delay
200 |             glb_avg_time = glb_updates / agent.num_clients
201 |             local_time = agent.gradient_counter() / gradients_push_freq
202 |             if glb_avg_time * (1 - delay_factor) <= local_time:
203 |                 warned = False
204 |                 agent.backward()
205 |                 # push gradient
206 |                 if agent.gradient_counter() % gradients_push_freq == 0:
207 |                     client.isend_shard(agent.gradient)
208 |                     agent.zero_grad_()
209 |             else:
210 |                 if not warned:
211 |                     warned = True
212 |                     print('[rank %s]' % rank, time.strftime('[%Y-%m-%d-%H:%M:%S'), 
213 |                       '%s]:' % os.environ['run_name'], 'server average '
214 |                       'time [%.1f], local time [%.1f],' % (glb_avg_time, local_time), 
215 |                       'skip gradients due to delay timed out ...', flush=True)
216 |                 agent.zero_grad_()
217 |                 agent.gradient_counter('add')
218 |             # update target network
219 |             if glb_updates - last_target_update >= update_target_freq: 
220 |                 overhead, params = client.recv_param()
221 |                 vector_to_parameters(params, agent.target_net.parameters())
222 |                 last_target_update = glb_updates
223 |             curr_state = next_state
224 |             if done: break
225 |                 
226 |         print('[rank %s]' % rank, time.strftime('[%Y-%m-%d-%H:%M:%S'), 
227 |               '%s]:' % os.environ['run_name'], 'episode [%s/%s], ep-reward [%s], eps [%.2f], '
228 |               'beta [%.2f], timesteps [%s], frames [%s], global_updates [%s], server [%s]' %
229 |               (env.global_episodes(), num_episodes, env.episodic_reward(), get_thres(), get_beta(),
230 |                agent.gradient_counter(), env.global_frames(), glb_updates, server), flush=True)
231 |         
232 |         if tensorboard is not None:
233 |         # recording via tensorboard
234 |             tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
235 |             tensorboard.add_scalar('episode/thres', get_thres(), env.global_episodes())  
236 | 
237 |             
238 |             


--------------------------------------------------------------------------------
/examples/gorila_dqn/launcher.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pytorl.distributed as rl_dist
 3 | import torch.distributed as dist
 4 | from client import param_client_proc
 5 | from server import param_server_proc
 6 | 
 7 | 
 8 | os.environ.setdefault('run_name', 'default')
 9 | 
10 | 
def main():
    """Set up the distributed topology and hand this rank its role.

    ``rl_dist.slurm_param_server_arch`` yields this process's rank, the world
    size, the master (parameter-server) rank and the list of worker ranks.
    The rank equal to the master rank runs ``param_server_proc``; every other
    rank runs ``param_client_proc``.
    """
    topology = rl_dist.slurm_param_server_arch(port=23228)
    this_rank, _, server_rank, worker_ranks = topology
    # the worker group is created on every rank, before the role split
    workers = dist.new_group(ranks=worker_ranks)

    if this_rank != server_rank:
        param_client_proc(server_rank, workers)
        return

    print('master service running at rank [%s]' % this_rank, flush=True)
    param_server_proc(server_rank, worker_ranks)
21 | 
22 | if __name__ == '__main__':
23 |     main()
24 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/run_project/ATARI.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/bash 
  2 | OPTSPEC=":GLNchnp-:"
  3 | 
  4 | alg='ATARI GORILA DQN' # name of algorithm
  5 | py_filename='launcher.py'
  6 | local=false
  7 | rn_prefix='gorila' # a prefix of the run name to help manage experiment dir
  8 | rn_suffix='default'
  9 | default_num_tasks=4
 10 | default_cpus_per_task=1
 11 | 
 12 | echo
 13 | 
 14 | while getopts "$OPTSPEC" optchar; do
 15 |     case "${optchar}" in
 16 |         -)
 17 |             case "${OPTARG}" in
 18 |                 cpus-per-task)
 19 |                     cpus="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 20 |                     ;;
 21 |                 cpus-per-task=*)
 22 |                     cpus=${OPTARG#*=}
 23 |                     ;;
 24 |                 gpu)
 25 |                     gpus="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 26 |                     ;;
 27 |                 gpu=*)
 28 |                     gpus=${OPTARG#*=}
 29 |                     ;;
 30 |                 partition)
 31 |                     partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 32 |                     ;;
 33 |                 partition=*)
 34 |                     partition=${OPTARG#*=}
 35 |                     ;;
 36 |                 prefix)
 37 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 38 |                     ;;
 39 |                 prefix=*)
 40 |                     rn_prefix=${OPTARG#*=}
 41 |                     ;;
 42 |                 name)
 43 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 44 |                     ;;
 45 |                 name=*)
 46 |                     rn_suffix=${OPTARG#*=}
 47 |                     ;;
 48 |                 ntasks)
 49 |                     tasks="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 50 |                     ;;
 51 |                 ntasks=*)
 52 |                     tasks=${OPTARG#*=}
 53 |                     ;;
 54 |                 ntasks-per-node)
 55 |                     ntasks_per_node="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 56 |                     ;;
 57 |                 ntasks-per-node=*)
 58 |                     ntasks_per_node=${OPTARG#*=}
 59 |                     ;;
 60 |                 local)
 61 |                     local=true
 62 |                     echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 63 |                     ;;
 64 |                 *)
 65 |                     if [ "$OPTERR" = 1 ]; then
 66 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
 67 |                         exit 2
 68 |                     fi
 69 |                     ;;
 70 |             esac;;
 71 |         G)
 72 |             gpus="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 73 |             ;;
 74 |         L)
 75 |             local=true
 76 |             echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 77 |             ;;
 78 |         N)
 79 |             rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 80 |             ;;
 81 |         c)
 82 |             cpus="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 83 |             ;;
 84 |         h)
 85 |             echo "USAGE: $0 <OPTIONS...>" >&2
 86 |             echo "          [-h][-G, --gpu[=]<corresp. to srun gres gpu:>]" >&2
 87 |             echo "          [-h][-L, --local    using lrun instead of srun]" >&2
 88 |             echo "          [-h][-c, --cpus-per-task[=]<num cpus per task>]" >&2
 89 |             echo "          [-h][-n, --ntasks[=]<total num tasks>]" >&2
 90 |             echo "          [-h][    --ntasks-per-node[=]<num tasks per node>]" >&2
 91 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
 92 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
 93 |             echo "          [-h][-p, --partition=<srun partition>]" >&2
 94 |             echo
 95 |             exit 2
 96 |             ;;
 97 |         n)
 98 |             ntasks="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
 99 |             ;;
100 |         # lower case p
101 |         p)
102 |             partition="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
103 |             ;;
104 |         *)
105 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
106 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
107 |                 exit 2
108 |             fi
109 |             ;;
110 |     esac
111 | done
112 | 
113 | cd ..
114 | 
115 | if [ -z "${tasks}" ]; then
116 |     tasks=${default_num_tasks}
117 | fi
118 | 
119 | if [ -z "${ntasks_per_node}" ]; then
120 |     ntasks_per_node=${tasks}
121 | fi
122 | 
123 | if [ -z "${cpus}" ]; then
124 |     cpus=${default_cpus_per_task}
125 | fi
126 | 
127 | if [ -z "${gpus}" ]; then
128 |     gpus=${ntasks_per_node}
129 | fi
130 | 
131 | local_cmd="lrun -n${tasks}"
132 | 
133 | run_cmd="srun -J ${rn_suffix} -p ${partition} -n${tasks} --gres gpu:${gpus} --ntasks-per-node ${ntasks_per_node}"
134 |     
135 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
136 | 
137 | if [ ${local} == true ]; then
138 |     run_cmd=${local_cmd}
139 | else
140 |     if [ -z "${partition}" ]; then
141 |         echo -e "\e[41m [ERROR] PARTITION(--partition or -p) NOT SPECIFIED \e[0m"  >&2
142 |         exit 2
143 |     fi
144 | fi
145 | 
146 | echo -e "\e[46m [ntasks] USING DEFAULT VALUE [${tasks}] (AS SPECIFIED IN THIS SCRIPT)\e[0m"
147 | if [ ${local} == false ]; then
148 |     echo -e "\e[46m [ntasks_per_node] USING DEFAULT VALUE [${ntasks_per_node}] (EQUALS TO [ntasks]) \e[0m"
149 |     echo -e "\e[46m [cpus_per_task] USING DEFAULT VALUE [${cpus}] (AS SPECIFIED IN THIS SCRIPT) \e[0m"
150 |     echo -e "\e[46m [gpu] USING DEFAULT VALUE [${gpus}] (EQUALS TO [ntasks_per_node]) \e[0m"
151 | fi
152 | 
153 | echo '________________________________________________________________________________'
154 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
155 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
156 | echo           "${run_cmd}"
157 | echo           "${py_cmd}"
158 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
159 | 
160 | rl-run -rn ${rn_prefix}/${rn_suffix} -c "${run_cmd} ${py_cmd}"
161 | 
162 | echo '________________________________________________________________________________'
163 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
164 | echo          "CMD_LINE: sh $0 $@"
165 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
166 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
167 | 
168 | 
169 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/run_project/TEST.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash 
 2 | OPTSPEC=":Nh-:"
 3 | 
 4 | alg='TEST LEARNING OUTCOME'
 5 | py_filename='test.py'
 6 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
 7 | rn_suffix='default'
 8 | 
 9 | echo
10 | 
11 | while getopts "$OPTSPEC" optchar; do
12 |     case "${optchar}" in
13 |         -)
14 |             case "${OPTARG}" in
15 |                 prefix)
16 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
17 |                     ;;
18 |                 prefix=*)
19 |                     rn_prefix=${OPTARG#*=}
20 |                     ;;
21 |                 name)
22 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
23 |                     ;;
24 |                 name=*)
25 |                     rn_suffix=${OPTARG#*=}
26 |                     ;;
27 |                 *)
28 |                     if [ "$OPTERR" = 1 ]; then
29 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
30 |                         exit 2
31 |                     fi
32 |                     ;;
33 |             esac;;
34 |         N)
35 |             rn_suffix="${!OPTIND}"; OPTIND=$(( $OPTIND + 1 ))
36 |             ;;
37 |         h)
38 |             echo "USAGE: $0 <OPTIONS...>" >&2
39 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
40 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
41 |             echo
42 |             exit 2
43 |             ;;
44 |         *)
45 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
46 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
47 |                 exit 2
48 |             fi
49 |             ;;
50 |     esac
51 | done
52 | 
53 | cd ..
54 | 
55 | run_cmd="lrun -n1" # note that test only runs under local env
56 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
57 | 
58 | echo '________________________________________________________________________________'
59 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
60 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
61 | echo           "${run_cmd}"
62 | echo           "${py_cmd}"
63 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
64 | 
65 | rl-run -rn ${rn_prefix}/${rn_suffix} -c "${run_cmd} ${py_cmd}"
66 | 
67 | echo '________________________________________________________________________________'
68 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
69 | echo          "CMD_LINE: sh $0 $@"
70 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
71 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/run_project/config.yaml:
--------------------------------------------------------------------------------
 1 | seed: 123
 2 | solver:
 3 |     env: PongNoFrameskip-v4
 4 |     # this specifies how many frames are stacked for one input
 5 |     # also specifies the number of network input channels
 6 |     frames_stack: 4
 7 |     double_dqn: True
 8 |     dueling: True
 9 | server:
10 |     num_threads: 8
11 |     lr: 0.0001
12 |     optimizer: Adam
13 |     shard_factor: all
14 |     device: cuda
15 | client:
16 |     gamma: 0.99
17 |     episodes: 4000
18 |     # this specifies how many frames an action lasts
19 |     frames_action: 4
20 |     # note: gradient push freq w.r.t local timesteps
21 |     gradients_push_freq: 1
22 |     # note: update_target_freq w.r.t global timesteps
23 |     update_target_freq: 50
24 |     delay_factor: 0.1
25 |     loss: smooth_l1_loss
26 |     device: cuda
27 | greedy:
28 |     start: 1
29 |     end: 0.02
30 |     frames: 100000
31 | replay:
32 |     # note: capacity w.r.t states
33 |     capacity: 1000000
34 |     init_size: 10000
35 |     batch_size: 32
36 |     alpha: 0.6
37 |     beta:
38 |         start: 0.4
39 |         end: 1
40 |         frames: 100000
41 | record:
42 |     record_rank: 1
43 |     save_freq: 10000
44 |     save_path: ../checkpoint
45 |     load_path: ../checkpoint
46 |     render: False
47 |     
48 | 


--------------------------------------------------------------------------------
/examples/gorila_dqn/server.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import threading
 4 | from threading import Lock
 5 | import time
 6 | import numpy as np
 7 | import torch
 8 | import torch.distributed as dist
 9 | import torchvision.transforms as T
10 | from pytorl.agents import GorilaDQN_ServerAgent
11 | import pytorl.distributed as rl_dist
12 | from pytorl.envs import make_atari_env
13 | import pytorl.lib as lib
14 | from pytorl.networks import Dueling_DQN, Q_Network
15 | import pytorl.utils as utils
16 | 
17 | 
def param_server_proc(master_rank, worker_list):
    """Run the parameter-server side of the Gorila DQN example.

    Reads ``run_project/config.yaml``, builds the server-side Q-network and
    ``GorilaDQN_ServerAgent``, then creates ``config.server.num_threads``
    ``ParamServer`` objects.  All but the last are launched with ``start()``;
    the last one is driven with ``run()`` from the calling thread.

    Args:
        master_rank: Rank of the parameter server.  The value passed in is
            replaced by ``rl_dist.get_master_rank()`` before it is used.
        worker_list: Ranks of the client processes (not used in this body).
    """
    ################################################################
    # CONFIG & SETTINGS
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    master_rank = rl_dist.get_master_rank()

    cfg_reader = utils.ConfigReader(default='run_project/config.yaml')
    config = cfg_reader.get_config()
    seed = config.seed
    frames_stack = config.solver.frames_stack
    save_freq = config.record.save_freq
    save_path = config.record.save_path
    num_servers = config.server.num_threads
    shard_factor = config.server.shard_factor
    record_rank = config.record.record_rank
    specified_device = config.server.device
    # the recording rank must be an existing rank other than the master
    assert record_rank != master_rank
    assert record_rank <= world_size - 1

    # the environment is only queried for the size of its action space
    env = make_atari_env(config.solver.env, T.Compose([]), render=False)
    num_actions = env.num_actions()

    ################################################################
    # DEVICE
    detected = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if str(detected) == specified_device:
        device, msg = detected, 'using specified'
    elif specified_device == 'cuda':
        # cuda was requested but is unavailable: stay on the detected device
        device, msg = detected, 'cuda not fuound, current'
    else:
        # cuda is available but the config asks for something else
        msg = 'found cuda, but using specified'
        device = torch.device(specified_device)

    print('[master rank %s] %s device: [%s]' % (rank, msg, device), flush=True)

    ################################################################
    # SEEDING
    # python / numpy seeds are offset by rank, torch seeds are not
    rank_seed = seed + rank
    random.seed(rank_seed)
    np.random.seed(rank_seed)
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

    ################################################################
    # AGENT
    network = Dueling_DQN if config.solver.dueling else Q_Network
    q_net = network(
        input_size=(frames_stack, 84, 84),
        num_actions=num_actions,
    ).to(device)

    optimizer_func = cfg_reader.get_optimizer_func(config.server.optimizer)

    agent = GorilaDQN_ServerAgent(
        device=device,
        q_net=q_net,
        optimizer_func=optimizer_func,
        shard_factor=shard_factor,
    )

    agent.reset()
    agent.set_optimize_scheme(lr=config.server.lr, optimize_freq=1)
    agent.set_checkpoint(save_freq, save_path)

    ################################################################
    # SERVICE
    server_lock = Lock()
    servers = []
    for idx in range(num_servers):
        # every server object receives the same lock instance
        worker = rl_dist.ParamServer(device, idx, server_lock)
        servers.append(worker)
        worker.set_listen(4, agent.optimize_counter)
        worker.set_param_update(agent.q_net, agent.optimize)

    # launch all but the last server, then drive the last one from here
    for worker in servers[:num_servers - 1]:
        worker.start()
    print('server current running threads: [%s]' % threading.active_count(), flush=True)
    servers[num_servers - 1].run()
95 |     
96 |     
97 |         


--------------------------------------------------------------------------------
/examples/gorila_dqn/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import time
 4 | import numpy as np
 5 | import torch
 6 | import torchvision.transforms as T
 7 | from pytorl.agents import DQN_Agent
 8 | from pytorl.envs import make_atari_env
 9 | from pytorl.networks import Q_Network, init_network
10 | import pytorl.utils as utils
11 | import pytorl.lib as lib
12 | 
13 | os.environ.setdefault('run_name', 'default')
14 | 
15 | 
def main():
    """Replay a trained Gorila DQN checkpoint on the configured Atari game.

    Reads ``run_project/config.yaml``, builds the environment, loads
    ``q_net.pth`` from ``config.record.load_path`` into a freshly built
    network and plays ``config.client.episodes`` episodes with exploration
    fixed at ``config.greedy.end``.  Each episode's reward is printed and
    written to tensorboard.

    The network class follows ``config.solver.dueling``, the same switch the
    parameter server uses when it builds the network that gets trained, so
    the checkpoint is loaded into a matching architecture.
    """
    ################################################################
    # DEVICE
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('current device: [%s]' % device, flush=True)

    ################################################################
    # CONFIG
    cfg_reader = utils.ConfigReader(default='run_project/config.yaml')
    config = cfg_reader.get_config()
    seed, num_episodes = config.seed, config.client.episodes

    ################################################################
    # RECORDER
    # tensorboard
    tensorboard = utils.tensorboard_writer(logdir='..')
    tensorboard.add_textfile('config', cfg_reader.config_path)

    ################################################################
    # ATARI ENVIRONMENT
    resize = T.Compose([T.ToPILImage(),
                    T.Grayscale(1),
                    T.Resize((84, 84), interpolation=3),
                    T.ToTensor()])
    frames_stack = config.solver.frames_stack
    env = make_atari_env(config.solver.env, resize,
                        render=config.record.render)

    env.set_episodic_init('FIRE')
    env.set_frames_stack(frames_stack)
    env.set_single_life(True)
    env.set_frames_action(4)
    num_actions = env.num_actions()

    ################################################################
    # AGENT
    if config.solver.dueling:
        # imported locally: the module header only pulls in Q_Network
        from pytorl.networks import Dueling_DQN
        network = Dueling_DQN
    else:
        network = Q_Network

    q_net = network(input_size=(frames_stack, 84, 84),
                    num_actions=num_actions)
    init_network(q_net, config.record.load_path, 'q_net.pth', obj_name='q_network')
    q_net = q_net.to(device)
    agent = DQN_Agent(device=device, q_net=q_net)
    # both schedule arguments are config.greedy.end
    agent.set_exploration(env.sample, lib.eps_greedy_func(config.greedy.end, config.greedy.end))

    ################################################################
    # SEEDING
    random.seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)
    env.seed(seed)

    ################################################################
    # TESTING
    for _ in range(num_episodes):
        env.reset()
        done = False
        while not done:
            action = agent.next_action(env.state)
            _, _, done, _ = env.step(action)
            # short pause between steps
            time.sleep(0.02)

        print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
              'episode [%s/%s], ep-reward [%s], frames [%s]' %
              (env.global_episodes(), num_episodes, env.episodic_reward(),
               env.global_frames()), flush=True)
        # recording via tensorboard
        tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     main()
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/README.md:
--------------------------------------------------------------------------------
 1 | ## Prioritized & Dueling DQN
 2 | 
 3 | This example project implements DQN with prioritized replay and dueling DQN.
 4 | Check the config to see what you can change and how to switch between different versions of deep q-learning.  
 5 | 
 6 | #### 1. Prioritized Experience Replay
 7 | Source: https://arxiv.org/abs/1511.05952
 8 | 
 9 | 
10 | #### 2. Dueling Network Architectures for Deep Reinforcement Learning
11 | Source: https://arxiv.org/abs/1511.06581  
12 | 
13 | &nbsp;  
14 | 
15 | **Run Example:**
16 | 
17 | ```bash
18 | $ cd run_project/
19 | $ sh [-h] <script filename> <options>
20 | # for example, you can try sh ATARI.sh -N testrun --local
21 | # you can always use sh <script filename> [-h] for options help
22 | ```
23 | 
24 | &nbsp;  
25 | 
26 | **Test Learning Result:**
27 | 
28 | ```bash
29 | $ cd run_project/
30 | $ sh TEST.sh -N <run name>
31 | # for example, you can try sh TEST.sh -N test_agent
32 | ```
33 | 
34 | &nbsp;  
35 | 
36 | **Result Demo:**  
37 | 1. Prioritized DQN  
38 | ![pong-v4_prioritized](../../.demo/pong-v4_prioritized.png)  
39 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Prioritized DQN)*  
40 | 
41 | 2. Dueling DQN  
42 | ![pong-v4_dueling](../../.demo/pong-v4_dueling.png)  
43 | *(reward/episode while training ale atari PongNoFrameskip-v4 via Dueling DQN)*   
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/learn.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import time
  4 | import numpy as np
  5 | import torch
  6 | import torchvision.transforms as T
  7 | from pytorl.envs import make_atari_env
  8 | from pytorl.agents import PrioritizedDQN_Agent
  9 | from pytorl.networks import Dueling_DQN, Q_Network
 10 | import pytorl.utils as utils
 11 | import pytorl.lib as lib
 12 | 
 13 | os.environ.setdefault('run_name', 'default')
 14 | 
 15 | 
 16 | def main():
 17 |     ################################################################
 18 |     # DEVICE
 19 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 20 |     print('current device: [%s]' % device, flush=True)
 21 | 
 22 |     ################################################################
 23 |     # CONFIG
 24 |     cfg_reader = utils.ConfigReader(default='run_project/config.yaml')
 25 |     config = cfg_reader.get_config()
 26 |     seed, num_episodes = config.seed, config.solver.episodes
 27 | 
 28 |     ################################################################
 29 |     # RECORDER
 30 |     # tensorboard
 31 |     tensorboard = utils.tensorboard_writer(logdir='..')
 32 |     tensorboard.add_textfile('config', cfg_reader.config_path)
 33 | 
 34 |     ################################################################
 35 |     # ATARI ENVIRONMENT
 36 |     resize = T.Compose(
 37 |         [T.ToPILImage(),
 38 |         T.Grayscale(1),
 39 |         T.Resize((84, 84), interpolation=3),
 40 |         T.ToTensor()]
 41 |     )
 42 |     frames_stack = config.solver.frames_stack
 43 |     env = make_atari_env(
 44 |         config.solver.env, 
 45 |         resize,
 46 |         render=config.record.render
 47 |     )
 48 | 
 49 |     env.set_episodic_init('FIRE')
 50 |     env.set_frames_stack(frames_stack)
 51 |     env.set_single_life(True)
 52 |     env.set_frames_action(config.solver.frames_action)
 53 |     num_actions = env.num_actions()
 54 | 
 55 |     ################################################################
 56 |     # UTILITIES
 57 |     get_beta = lib.beta_priority_func(
 58 |         beta_start=config.replay.beta.start,
 59 |         beta_end=config.replay.beta.end,
 60 |         num_incres=config.replay.beta.frames,
 61 |         global_frames_func=env.global_frames
 62 |     )
 63 | 
 64 |     get_thres = lib.eps_greedy_func(
 65 |         eps_start=config.greedy.start,
 66 |         eps_end=config.greedy.end,
 67 |         num_decays=config.greedy.frames,
 68 |         global_frames_func=env.global_frames
 69 |     )
 70 | 
 71 |     ################################################################
 72 |     # AGENT
 73 |     if config.solver.dueling:
 74 |         network = Dueling_DQN
 75 |     else:
 76 |         network = Q_Network
 77 |     
 78 |     q_net = network(input_size=(frames_stack, 84, 84),
 79 |                       num_actions=num_actions).to(device)
 80 | 
 81 |     target_net = network(input_size=(frames_stack, 84, 84),
 82 |                            num_actions=num_actions).to(device)
 83 | 
 84 |     loss_func = cfg_reader.get_loss_func(config.solver.loss)
 85 |     optimizer_func = cfg_reader.get_optimizer_func(config.solver.optimizer)
 86 |     
 87 |     agent = PrioritizedDQN_Agent(
 88 |         device = device,
 89 |         q_net = q_net,
 90 |         target_net = target_net,
 91 |         loss_func = loss_func,
 92 |         optimizer_func = optimizer_func,
 93 |      )
 94 |     
 95 |     agent.set_prioritized_replay(
 96 |         capacity=config.replay.capacity, 
 97 |         batch_size=config.replay.batch_size, 
 98 |         init_size=config.replay.init_size, 
 99 |         alpha=config.replay.alpha,
100 |         beta_func=get_beta, 
101 |         )
102 |     agent.reset()
103 |     agent.set_exploration(get_sample=env.sample, get_thres=get_thres)
104 |     agent.set_tensorboard(tensorboard)
105 |     agent.set_optimize_scheme(
106 |         lr=config.solver.lr,
107 |         gamma=config.solver.gamma,
108 |         optimize_freq=config.solver.optimize_freq,
109 |         update_target_freq=config.solver.update_target_freq
110 |     )
111 | 
112 |     ################################################################
113 |     # SEEDING
114 |     random.seed(seed)
115 |     np.random.seed(seed)
116 |     torch.cuda.manual_seed(seed)
117 |     torch.manual_seed(seed)
118 |     env.seed(seed)
119 | 
120 |     ################################################################
121 |     # PRETRAIN
122 |     # setting up initial random observations and replays during this session
123 |     print('now about to setup randomized [%s] required initial experience replay...' %
124 |               agent.replay.init_size, flush=True)
125 |     while True:
126 |         env.reset()
127 |         curr_state, done = env.state().clone(), False
128 |         while len(agent.replay) < agent.replay.init_size and not done:
129 |             action = env.sample()
130 |             next_observ, reward, done, _ = env.step(action)
131 |             next_state = env.state().clone()
132 |             agent.replay.push(curr_state, action, next_state, reward)
133 |             curr_state = next_state
134 | 
135 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
136 |               'initializing prioritized experience replay progressing [%s/%s]' % (
137 |               len(agent.replay), agent.replay.init_size), flush=True)
138 |         if not done: break
139 |         # save final action into reply buffer
140 |         agent.replay.push(curr_state, action, None, reward)
141 | 
142 |     print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
143 |           'prioritized experience replay initialization completed [%s/%s]' % (
144 |           len(agent.replay), agent.replay.init_size), flush=True)
145 | 
146 |     env.refresh()
147 | 
148 |     ################################################################
149 |     # TRAINING
150 |     for _ in range(num_episodes):
151 |         env.reset()
152 |         # get initial state
153 |         curr_state, done = env.state().clone(), False
154 |         while True:
155 |             action = agent.next_action(env.state)
156 |             if not done:
157 |                 next_observ, reward, done, _ = env.step(action)
158 |                 next_state = env.state().clone()
159 |             else:
160 |                 next_state = None
161 |             agent.replay.push(curr_state, action, next_state, reward)
162 |             curr_state = next_state
163 |             agent.optimize()
164 |             if done: break
165 | 
166 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
167 |               'episode [%s/%s], ep-reward [%s], eps [%.2f], '
168 |               'beta [%.2f], timesteps [%s], frames [%s]' %
169 |               (env.global_episodes(), num_episodes, env.episodic_reward(), 
170 |                get_thres(), get_beta(), agent.optimize_counter(), env.global_frames()), flush=True)
171 |         # recording via tensorboard
172 |         tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
173 |         tensorboard.add_scalar('episode/thres', get_thres(), env.global_episodes())
174 | 
175 |         if env.global_episodes() % config.record.save_freq == 0:
176 |             agent.save_pth(agent.q_net, config.record.save_path,
177 |                            filename='q_net.pth', obj_name='q_network')
178 |             agent.save_pth(agent.target_net, config.record.save_path,
179 |                            filename='target_net.pth', obj_name='target_network')
180 | 
181 | 
182 | if __name__ == '__main__':
183 |     main()
184 | 
185 | 
186 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/run_project/ATARI.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/bash 
  2 | # Launches learn.py for the ATARI prioritized DQN example through `rl-run`,
  3 | # wrapped in `srun` (needs a partition) or, with -L/--local, in `lrun`.
  4 | #
  5 | # Long options are emulated with the getopts "-:" entry: "--foo" shows up as
  6 | # optchar "-" with OPTARG "foo". Values given as a separate word are read
  7 | # through ${!OPTIND}, after which OPTIND is advanced by hand.
  8 | OPTSPEC=":LNhp-:"
  9 | 
 10 | alg='ATARI PRIORITIZED DQN' # name of algorithm
 11 | py_filename='learn.py'
 12 | local=false
 13 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
 14 | rn_suffix='default'
 15 | 
 16 | echo
 17 | 
 18 | while getopts "$OPTSPEC" optchar; do
 19 |     case "${optchar}" in
 20 |         -)
 21 |             case "${OPTARG}" in
 22 |                 partition)
 23 |                     partition="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
 24 |                     ;;
 25 |                 partition=*)
 26 |                     partition=${OPTARG#*=}
 27 |                     ;;
 28 |                 prefix)
 29 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
 30 |                     ;;
 31 |                 prefix=*)
 32 |                     rn_prefix=${OPTARG#*=}
 33 |                     ;;
 34 |                 name)
 35 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
 36 |                     ;;
 37 |                 name=*)
 38 |                     rn_suffix=${OPTARG#*=}
 39 |                     ;;
 40 |                 local)
 41 |                     local=true
 42 |                     echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 43 |                     ;;
 44 |                 *)
 45 |                     if [ "$OPTERR" = 1 ]; then
 46 |                         # OPTARG holds the long option without its leading "--"
 47 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '--${OPTARG}' SPECIFIED \e[0m" >&2
 48 |                         exit 2
 49 |                     fi
 50 |                     ;;
 51 |             esac;;
 52 |         L)
 53 |             local=true
 54 |             echo -e "\e[46m [lrun] LOCAL RUN (NON SRUN) MODE SPECIFIED \e[0m"
 55 |             ;;
 56 |         N)
 57 |             rn_suffix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
 58 |             ;;
 59 |         h)
 60 |             echo "USAGE: $0 <OPTIONS...>" >&2
 61 |             echo "          [-h][-L, --local    using lrun instead of srun]" >&2
 62 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
 63 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
 64 |             echo "          [-h][-p, --partition=<srun partition>]" >&2
 65 |             echo
 66 |             exit 2
 67 |             ;;
 68 |         # lower case p
 69 |         p)
 70 |             partition="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
 71 |             ;;
 72 |         *)
 73 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
 74 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
 75 |                 exit 2
 76 |             fi
 77 |             ;;
 78 |     esac
 79 | done
 80 | 
 81 | # the python entry point and ../log.txt are addressed relative to the parent dir;
 82 | # stop here instead of launching from the wrong place if the cd fails
 83 | cd .. || exit 1
 84 | 
 85 | local_cmd="lrun -n1"
 86 | run_cmd="srun -J ${rn_suffix} -p ${partition} --gres gpu:1"
 87 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
 88 | 
 89 | if [ "${local}" = true ]; then
 90 |     run_cmd=${local_cmd}
 91 | else
 92 |     # srun cannot be built without a partition
 93 |     if [ -z "${partition}" ]; then
 94 |         echo -e "\e[41m [ERROR] PARTITION(--partition or -p) NOT SPECIFIED \e[0m"  >&2
 95 |         exit 2
 96 |     fi
 97 | fi
 98 | 
 99 | echo '________________________________________________________________________________'
100 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
101 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
102 | echo           "${run_cmd}"
103 | echo           "${py_cmd}"
104 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
105 | 
106 | # quoted so that a run name containing spaces stays a single argument
107 | rl-run -rn "${rn_prefix}/${rn_suffix}" -c "${run_cmd} ${py_cmd}"
108 | 
109 | echo '________________________________________________________________________________'
110 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
111 | echo          "CMD_LINE: sh $0 $*"
112 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
113 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/run_project/TEST.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash 
 2 | # Launches test.py through `rl-run`, always wrapped in `lrun` (local only).
 3 | #
 4 | # Long options are emulated with the getopts "-:" entry: "--foo" shows up as
 5 | # optchar "-" with OPTARG "foo". Values given as a separate word are read
 6 | # through ${!OPTIND}, after which OPTIND is advanced by hand.
 7 | OPTSPEC=":Nh-:"
 8 | 
 9 | alg='TEST LEARNING OUTCOME'
10 | py_filename='test.py'
11 | rn_prefix='torl' # a prefix of the run name to help manage experiment dir
12 | rn_suffix='default'
13 | 
14 | echo
15 | 
16 | while getopts "$OPTSPEC" optchar; do
17 |     case "${optchar}" in
18 |         -)
19 |             case "${OPTARG}" in
20 |                 prefix)
21 |                     rn_prefix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
22 |                     ;;
23 |                 prefix=*)
24 |                     rn_prefix=${OPTARG#*=}
25 |                     ;;
26 |                 name)
27 |                     rn_suffix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
28 |                     ;;
29 |                 name=*)
30 |                     rn_suffix=${OPTARG#*=}
31 |                     ;;
32 |                 *)
33 |                     if [ "$OPTERR" = 1 ]; then
34 |                         # OPTARG holds the long option without its leading "--"
35 |                         echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '--${OPTARG}' SPECIFIED \e[0m" >&2
36 |                         exit 2
37 |                     fi
38 |                     ;;
39 |             esac;;
40 |         N)
41 |             rn_suffix="${!OPTIND}"; OPTIND=$(( OPTIND + 1 ))
42 |             ;;
43 |         h)
44 |             echo "USAGE: $0 <OPTIONS...>" >&2
45 |             echo "          [-h][    --prefix[=]<run name prefix>]" >&2
46 |             echo "          [-h][-N, --name[=]<run name suffix>]" >&2
47 |             echo
48 |             exit 2
49 |             ;;
50 |         *)
51 |             if [ "$OPTERR" != 1 ] || [ "${OPTSPEC:0:1}" = ":" ]; then
52 |                 echo -e "\e[41m [ERROR] UNKNOWN ARGUMENT '-${OPTARG}' SPECIFIED \e[0m" >&2
53 |                 exit 2
54 |             fi
55 |             ;;
56 |     esac
57 | done
58 | 
59 | # the python entry point and ../log.txt are addressed relative to the parent dir;
60 | # stop here instead of launching from the wrong place if the cd fails
61 | cd .. || exit 1
62 | 
63 | run_cmd="lrun -n1" # note that test only runs under local env
64 | py_cmd="python ${py_filename} 2>&1 | tee -a ../log.txt"
65 | 
66 | echo '________________________________________________________________________________'
67 | echo -e "\e[42m STARTING \e[0m\e[43m ${alg} \e[0m"
68 | echo           "rl-run -rn ${rn_prefix}/${rn_suffix} -c"
69 | echo           "${run_cmd}"
70 | echo           "${py_cmd}"
71 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
72 | 
73 | # quoted so that a run name containing spaces stays a single argument
74 | rl-run -rn "${rn_prefix}/${rn_suffix}" -c "${run_cmd} ${py_cmd}"
75 | 
76 | echo '________________________________________________________________________________'
77 | echo          "RUN_NAME: ${rn_prefix}/${rn_suffix}"
78 | echo          "CMD_LINE: sh $0 $*"
79 | echo -e "\e[44m FINISHED \e[0m\e[43m ${alg} \e[0m"
80 | echo '‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾'
72 | 
73 | 
74 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/run_project/config.yaml:
--------------------------------------------------------------------------------
 1 | seed: 123
 2 | solver:
 3 |     env: PongNoFrameskip-v4
 4 |     double_dqn: True
 5 |     dueling: True
 6 |     lr: 0.0001
 7 |     gamma: 0.99
 8 |     episodes: 4000
 9 |     # this specifies how many frames are stacked for one input
10 |     frames_stack: 4
11 |     # this specifies how many frames an action lasts
12 |     frames_action: 4
13 |     # note: optimize_freq w.r.t timesteps
14 |     optimize_freq: 1
15 |     # note: update_target_freq w.r.t timesteps
16 |     update_target_freq: 1000
17 |     loss: smooth_l1_loss
18 |     optimizer: Adam
19 | greedy:
20 |     start: 1
21 |     end: 0.02
22 |     frames: 100000
23 | replay:
24 |     # note: capacity w.r.t states
25 |     capacity: 500000
26 |     init_size: default 
27 |     batch_size: 32
28 |     alpha: 0
29 |     beta:
30 |         start: 0.4
31 |         end: 1
32 |         frames: 100000
33 | record:
34 |     save_freq: 100
35 |     save_path: ../checkpoint
36 |     load_path: /Users/zhe/Desktop
37 |     render: False
38 |     
39 | 


--------------------------------------------------------------------------------
/examples/prioritized_dqn/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import random
 3 | import time
 4 | import numpy as np
 5 | import torch
 6 | import torchvision.transforms as T
 7 | from pytorl.agents import DQN_Agent
 8 | from pytorl.envs import make_atari_env
 9 | from pytorl.networks import Q_Network, init_network
10 | import pytorl.utils as utils
11 | import pytorl.lib as lib
12 | 
13 | os.environ.setdefault('run_name', 'default')
14 | 
15 | 
16 | def main():
17 |     ################################################################
18 |     # DEVICE
19 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20 |     print('current device: [%s]' % device, flush=True)
21 | 
22 |     ################################################################
23 |     # CONFIG
24 |     cfg_reader = utils.ConfigReader(default='run_project/config.yaml')
25 |     config = cfg_reader.get_config()
26 |     seed, num_episodes = config.seed, config.solver.episodes
27 | 
28 |     ################################################################
29 |     # RECORDER
30 |     # tensorboard
31 |     tensorboard = utils.tensorboard_writer(logdir='..')
32 |     tensorboard.add_textfile('config', cfg_reader.config_path)
33 | 
34 |     ################################################################
35 |     # ATARI ENVIRONMENT
36 |     resize = T.Compose([T.ToPILImage(),
37 |                     T.Grayscale(1),
38 |                     T.Resize((84, 84), interpolation=3),
39 |                     T.ToTensor()])
40 |     frames_stack = config.solver.frames_stack
41 |     env = make_atari_env(config.solver.env, resize,
42 |                         render=config.record.render)
43 | 
44 |     env.set_episodic_init('FIRE')
45 |     env.set_frames_stack(frames_stack)
46 |     env.set_single_life(True)
47 |     env.set_frames_action(4)
48 |     num_actions = env.num_actions()
49 | 
50 |     ################################################################
51 |     # AGENT
52 |     q_net = Q_Network(input_size=(frames_stack, 84, 84),
53 |                       num_actions=num_actions)
54 |     init_network(q_net, config.record.load_path, 'q_net.pth', obj_name='q_network')
55 |     q_net = q_net.to(device)
56 |     agent = DQN_Agent(device=device, q_net=q_net)
57 |     agent.set_exploration(env.sample, lib.eps_greedy_func(config.greedy.end, config.greedy.end))
58 | 
59 |     ################################################################
60 |     # SEEDING
61 |     random.seed(seed)
62 |     np.random.seed(seed)
63 |     torch.cuda.manual_seed(seed)
64 |     torch.manual_seed(seed)
65 |     env.seed(seed)
66 | 
67 |     ################################################################
68 |     # TESTING
69 |     for _ in range(num_episodes):
70 |         env.reset()
71 |         # get initial state
72 |         done = False
73 |         while True:
74 |             action = agent.next_action(env.state)
75 |             next_observ, reward, done, _ = env.step(action)
76 |             time.sleep(0.02)
77 |             if done: break
78 | 
79 |         print(time.strftime('[%Y-%m-%d-%H:%M:%S'), '%s]:' % os.environ['run_name'],
80 |               'episode [%s/%s], ep-reward [%s], frames [%s]' %
81 |               (env.global_episodes(), num_episodes, env.episodic_reward(),
82 |                env.global_frames()), flush=True)
83 |         # recording via tensorboard
84 |         tensorboard.add_scalar('episode/reward', env.episodic_reward(), env.global_episodes())
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     main()
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/pytorl/README.md:
--------------------------------------------------------------------------------
 1 | # [ pytorl ] Components Description
 2 | ### [PROJECT CURRENTLY UNDER DEVELOPMENT]
 3 | 
 4 | &nbsp;  
 5 | 
 6 | #### 1. agents:
 7 | <pre>This module contains implementations of RL agents. The goal is to keep all
 8 | sophisticated choice-making, optimization, and utilities such as the DQN replay buffer
 9 | completely inside the agent's scope. Then we can use the agent easily as a black box.</pre>
10 | 
11 | #### 2. distributed:
12 | <pre>This module contains implementations of distributed initialization and commu-
13 | nication methods. The goal is to provide distributed support for pytorl.</pre>
14 | 
15 | #### 3. envs:
16 | <pre>This module contains implementations of RL environments. Currently it contains
17 | gym classic control and atari environments. The goal for this module is to regulate 
18 | and provide a general interface for RL environments so that we can DIY our 
19 | own learning environment and use it without changing training files. </pre>
20 | 
21 | #### 4. lib:
22 | <pre>This module contains implementations of reinforcement learning algorithm 
23 | related support. </pre>
24 | 
25 | #### 5. networks:
26 | <pre>This module contains implementations of deep reinforcement learning neural 
27 | network files. </pre>
28 | 
29 | #### 6. settings:
30 | <pre>This module contains utilities for command line fast entry (rl-run and lrun), 
31 | and pytorl package config.</pre>
32 | 
33 | #### 7. utils:
34 | <pre>This module contains implementations of non-RL related general support like 
35 | config reader and tensorboard recorder setup. The goal is to provide useful 
36 | and convenient general tools.</pre>
37 | 
38 | 
39 | 


--------------------------------------------------------------------------------
/pytorl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kareido/pytorl/2f2f5258425166b8bfbde985a229fecdef3752d9/pytorl/__init__.py


--------------------------------------------------------------------------------
/pytorl/agents/DQN.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import numpy as np
  3 | import torch
  4 | import torch.nn as nn
  5 | from torch.nn.utils import clip_grad_value_
  6 | from pytorl.lib import PrioritizedReplay
  7 | from pytorl.utils import Setting
  8 | from ._base_agent import Agent
  9 | 
 10 | 
 11 | class DQN_Agent(Agent):
 12 |     def __init__(self, 
 13 |          device, 
 14 |          q_net, 
 15 |          target_net=None, 
 16 |          loss_func=None, 
 17 |          optimizer_func=None,  
 18 |          replay=None, 
 19 |         ):
 20 |         super(DQN_Agent, self).__init__()
 21 |         self.device = device
 22 |         self.q_net = q_net
 23 |         self.target_net = target_net
 24 |         self.loss = loss_func
 25 |         self._get_optimizer = optimizer_func
 26 |         self.replay = replay
 27 |         # attributes for optimization
 28 |         self.batch_size = None
 29 |         self.lr = .0001
 30 |         self.gamma = .99
 31 |         self.optimize_freq = 1
 32 |         self.update_target_freq = 1
 33 |         # attributes for action selection
 34 |         self.get_sample = None
 35 |         self.get_thres = lambda: 0
 36 |         
 37 |         
 38 |     @Setting
 39 |     def set_exploration(self,
 40 |                         get_sample=None, 
 41 |                         get_thres=lambda: 0,  
 42 |                         ):
 43 |         self.get_sample = get_sample
 44 |         self.get_thres = get_thres
 45 |     
 46 |     
 47 |     @Setting
 48 |     def set_optimize_scheme(self,
 49 |             lr=.0001, gamma=.99, 
 50 |             optimize_freq=1, 
 51 |             update_target_freq=1, 
 52 |         ):
 53 |         # set attributes
 54 |         self.batch_size = self.replay.batch_size
 55 |         self.lr = lr
 56 |         self.gamma = gamma
 57 |         self.optimize_freq = optimize_freq
 58 |         self.update_target_freq = update_target_freq
 59 |         self.optimizer = self._get_optimizer(
 60 |             self.q_net.parameters(), 
 61 |             lr=self.lr
 62 |         )
 63 |     
 64 |     
 65 |     def reset(self):
 66 |         if self.replay: self.replay.clear()
 67 |         if self.target_net: self.set_device()
 68 |         self.optimize_counter('set', 0)
 69 |         self.optimize_timer('set', 0)
 70 |         for name, params in self.q_net.named_parameters():
 71 |             if 'bias' in name:
 72 |                 # to avoid 'Fan in and fan out can not be computed for tensor with fewer 
 73 |                 # than 2 dimensions' problem
 74 |                 nn.init.zeros_(params)
 75 |             else:
 76 |                 nn.init.kaiming_normal_(params)
 77 |         if self.target_net: self.update_target()
 78 |         self.q_net.train(True)
 79 |         if self.target_net: self.target_net.train(False)
 80 |     
 81 |     
 82 |     def update_target(self):
 83 |         self.target_net.load_state_dict(self.q_net.state_dict())
 84 |         
 85 |         
 86 |     def set_device(self):
 87 |         self.q_net = self.q_net.to(self.device)
 88 |         if self.target_net: self.target_net = self.target_net.to(self.device)
 89 |         
 90 |     
 91 |     def next_action(self, get_state):
 92 |         if not hasattr(get_state, '__call__'): 
 93 |             curr_state = lambda: get_state
 94 |         else: 
 95 |             curr_state = get_state
 96 |         sample_val = random.random()
 97 |         if sample_val >= self.get_thres():
 98 |             with torch.no_grad():
 99 |                 curr_q_val = self.q_net(get_state().to(self.device))
100 |             return curr_q_val.argmax(1).item()
101 |         else:
102 |             return self.get_sample()
103 |     
104 |     
105 |     """ should check if non_final_next is None when call this method"""
106 |     def _non_final_targeted_q_values(self, non_final_next):
107 |         return self.target_net(non_final_next).max(1)[0].detach()
108 |     
109 |     
110 |     def _record(self, rewards, q_net_loss, predicted_q_values, expected_q_values, counter=None):
111 |         if self._tensorboard is not None:
112 |             if counter is None: counter = self.optimize_counter
113 |             reward_mean = rewards.mean().item()
114 |             predicted_q_values_mean = predicted_q_values.mean().item()
115 |             expected_q_values_mean = expected_q_values.mean().item()
116 | 
117 |             self._tensorboard.add_scalar('timestep/replay_reward-mean', 
118 |                                    reward_mean, counter())
119 |             self._tensorboard.add_scalar('timestep/loss', q_net_loss, counter())
120 |             self._tensorboard.add_scalar('timestep/predicted_q_values-mean', 
121 |                                    predicted_q_values_mean, counter())
122 |             self._tensorboard.add_scalar('timestep/expected_q_values-mean', 
123 |                                    expected_q_values_mean, counter())    
124 |     
125 |     
126 |     def optimize(self):
127 |         self.optimize_timer('add')
128 |         if self.optimize_timer() % self.optimize_freq != 0: return
129 |         self.optimize_counter('add')
130 |         sample_exp = self.replay.sample()
131 |         batch = self.replay.form_obj(*zip(*sample_exp))
132 |         curr_states = torch.cat(batch.curr_state).to(self.device)
133 |         actions = torch.tensor(batch.action).to(self.device).view(-1, 1)
134 |         rewards = torch.tensor(batch.reward).to(self.device)
135 |         non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), 
136 |                                       device=self.device, dtype=torch.uint8)
137 |         non_final_next = torch.cat(
138 |                             [s for s in batch.next_state if s is not None]).to(self.device)
139 |         predicted_q_values = self.q_net(curr_states).gather(1, actions)
140 |         targeted_q_values = torch.zeros(rewards.shape[0], device=self.device)
141 |         # compute Q values via stationary target network, this 'try' is to avoid the situation 
142 |         # when all next states are None
143 |         try:
144 |             targeted_q_values[non_final_mask] = self._non_final_targeted_q_values(non_final_next)
145 |         except TypeError: print('encountered a case where all next states are None', flush=True)
146 |         # compute the expected Q values
147 |         expected_q_values = (targeted_q_values * self.gamma) + rewards
148 |         # compute loss
149 |         q_net_loss = self.loss(predicted_q_values, expected_q_values.unsqueeze(1))
150 |         # optimize the model
151 |         self.optimizer.zero_grad()
152 |         q_net_loss.backward()
153 |         clip_grad_value_(self.q_net.parameters(), 1)
154 |         self.optimizer.step()
155 |         # update target network
156 |         if self.optimize_counter() % self.update_target_freq == 0:
157 |             self.update_target()
158 |         # tensorboard recording
159 |         self._record(rewards, q_net_loss, predicted_q_values, expected_q_values)
160 |         
161 |         
162 |             
163 | class DoubleDQN_Agent(DQN_Agent):
164 |     def __init__(self, 
165 |          device, 
166 |          q_net, 
167 |          target_net=None, 
168 |          loss_func=None, 
169 |          optimizer_func=None,  
170 |          replay=None, 
171 |         ):
172 |         super(DoubleDQN_Agent, self).__init__(
173 |             device, q_net, 
174 |             target_net=target_net, 
175 |             loss_func=loss_func, 
176 |             optimizer_func=optimizer_func,  
177 |             replay=replay, 
178 |         )
179 |         
180 |     
181 |     """ should check if non_final_next is None when call this method"""
182 |     def _non_final_targeted_q_values(self, non_final_next):
183 |         # must view it to match the shape
184 |         next_actions = self.q_net(non_final_next).max(1)[1].view(-1, 1)
185 |         # must squeeze it to make it a batch of scalar values
186 |         return self.target_net(non_final_next).gather(1, next_actions).squeeze()
187 |     
188 |     
189 |     
190 | class PrioritizedDQN_Agent(DQN_Agent):
191 |     def __init__(self, 
192 |          device, 
193 |          q_net, 
194 |          target_net=None, 
195 |          loss_func=None, 
196 |          optimizer_func=None, 
197 |          double_dqn=True, 
198 |         ):
199 |         super(PrioritizedDQN_Agent, self).__init__(
200 |             device, q_net, 
201 |             target_net=target_net, 
202 |             loss_func=loss_func, 
203 |             optimizer_func=optimizer_func,  
204 |         )
205 |         self.replay = None
206 |         if double_dqn:
207 |             self._non_final_targeted_q_values = self._double_dqn_q_values
208 |         else:
209 |             self._non_final_targeted_q_values = self._natural_dqn_q_values
210 |         
211 |     
212 |     @Setting
213 |     def set_prioritized_replay(self, capacity=None, batch_size=32, 
214 |                                init_size=None, alpha=1, beta_func=lambda: 1, eps=1e-6):
215 |         self.replay = PrioritizedReplay(
216 |             capacity=capacity, 
217 |             batch_size=batch_size, 
218 |             init_size=init_size, 
219 |             alpha=alpha,
220 |             beta_func=beta_func, 
221 |             eps=eps, 
222 |         )
223 |         
224 |     
225 |     """ should check if non_final_next is None when call this method"""
226 |     def _double_dqn_q_values(self, non_final_next):
227 |         # must view it to match the shape
228 |         next_actions = self.q_net(non_final_next).max(1)[1].view(-1, 1)
229 |         # must squeeze it to make it a batch of scalar values
230 |         return self.target_net(non_final_next).gather(1, next_actions).squeeze()
231 |     
232 |     
233 |     """ should check if non_final_next is None when call this method"""    
234 |     def _natural_dqn_q_values(self, non_final_next):
235 |         return self.target_net(non_final_next).max(1)[0].detach()
236 |        
237 |     
238 |     def optimize(self):
239 |         self.optimize_timer('add')
240 |         if self.optimize_timer() % self.optimize_freq != 0: return
241 |         self.optimize_counter('add')
242 |         sample_exp = self.replay.sample()
243 |         batch = self.replay.form_obj(*zip(*sample_exp))
244 |         curr_states = torch.cat(batch.curr_state).to(self.device)
245 |         actions = torch.tensor(batch.action).to(self.device).view(-1, 1)
246 |         rewards = torch.tensor(batch.reward).to(self.device)
247 |         weights = torch.tensor(batch.weight).to(self.device)
248 |         indices = torch.tensor(batch.index).to(self.device)
249 |         non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), 
250 |                                       device=self.device, dtype=torch.uint8)
251 |         non_final_next = torch.cat(
252 |                             [s for s in batch.next_state if s is not None]).to(self.device)
253 |         predicted_q_values = self.q_net(curr_states).gather(1, actions)
254 |         targeted_q_values = torch.zeros(rewards.shape[0], device=self.device)
255 |         # compute Q values via stationary target network, this 'try' is to avoid the situation 
256 |         # when all next states are None
257 |         try:
258 |             targeted_q_values[non_final_mask] = self._non_final_targeted_q_values(non_final_next)
259 |         except TypeError: print('encountered a case where all next states are None', flush=True)
260 |         # compute the expected Q values
261 |         expected_q_values = (targeted_q_values * self.gamma) + rewards
262 |         # compute temporal difference error
263 |         td_error = predicted_q_values - expected_q_values.unsqueeze(1)
264 |         new_priorities = (torch.abs(td_error.squeeze()) + self.replay.eps).tolist()
265 |         self.replay.update_priorities(indices, new_priorities)
266 |         # compute loss
267 |         q_net_loss = self.loss(predicted_q_values, expected_q_values.unsqueeze(1), reduction='none')
268 |         q_net_loss = torch.dot(weights, q_net_loss.squeeze())
269 |         # optimize the model
270 |         self.optimizer.zero_grad()
271 |         q_net_loss.backward()
272 |         clip_grad_value_(self.q_net.parameters(), 1)
273 |         self.optimizer.step()
274 |         # update target network
275 |         if self.optimize_counter() % self.update_target_freq == 0:
276 |             self.update_target()
277 |         # tensorboard recording
278 |         self._record(rewards, q_net_loss, predicted_q_values, expected_q_values)
279 |     
280 |     
281 |     


--------------------------------------------------------------------------------
/pytorl/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from .DQN import DQN_Agent
2 | from .DQN import DoubleDQN_Agent
3 | from .DQN import PrioritizedDQN_Agent
4 | from .dist_DQN import GorilaDQN_ClientAgent
5 | from .dist_DQN import GorilaDQN_ServerAgent


--------------------------------------------------------------------------------
/pytorl/agents/_base_agent.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from pytorl.networks import io
 4 | from pytorl.utils import Setting
 5 | 
 6 | 
 7 | """
 8 | this base agent contains:
 9 |     1) save/load functionality
10 |     2) tensorboard utilities
11 | """
12 | 
13 | 
14 | class Agent:
15 |     def __init__(self):
16 |         self._optimize_counter = 0
17 |         self._optimize_timer = 0
18 |         self._tensorboard = None
19 |         
20 |     def save_pth(self, obj, path, filename=None, obj_name=None):
21 |         io.save_pth(obj, path, filename=filename, obj_name=obj_name)
22 |         
23 |     def load_pth(self, path, filename=None, obj_name=None):
24 |         io.load_pth(path, filename=filename, obj_name=obj_name)
25 |         
26 |     def optimize_counter(self, pattern=None, num=1):
27 |         assert type(num) == int and num >= 0
28 |         if pattern == 'add':
29 |             self._optimize_counter += num
30 |         elif pattern == 'set':
31 |             self._optimize_counter = num           
32 |         return self._optimize_counter 
33 | 
34 |     def optimize_timer(self, pattern=None, num=1):
35 |         assert type(num) == int and num >= 0
36 |         if pattern == 'add':
37 |             self._optimize_timer += num
38 |         elif pattern == 'set':
39 |             self._optimize_timer = num           
40 |         return self._optimize_timer 
41 |         
42 |     @Setting.only_once
43 |     def set_tensorboard(self, obj=None):
44 |         """
45 |         [!]WARNING: should check the legitimacy of num by yourself
46 |         """
47 |         if obj is not None:
48 |             self._tensorboard = obj
49 |         return self._tensorboard


--------------------------------------------------------------------------------
/pytorl/agents/dist_DQN.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import random
  3 | import numpy as np
  4 | import torch
  5 | import torch.autograd as autograd
  6 | import torch.distributed as dist
  7 | import torch.nn as nn
  8 | from torch.nn.utils import parameters_to_vector, vector_to_parameters
  9 | from pytorl.distributed import get_master_rank
 10 | from pytorl.utils import Setting
 11 | from .DQN import PrioritizedDQN_Agent
 12 | 
 13 | 
 14 | class _GorilaDQN_BaseAgent(PrioritizedDQN_Agent):
 15 |     def __init__(
 16 |         self, 
 17 |         device, 
 18 |         q_net,
 19 |         target_net=None, 
 20 |         loss_func=None, 
 21 |         optimizer_func=None, 
 22 |         double_dqn=True, 
 23 |         comm='cpu'
 24 |     ):
 25 |         super(_GorilaDQN_BaseAgent, self).__init__(
 26 |              device, 
 27 |              q_net, 
 28 |              target_net=target_net, 
 29 |              loss_func=loss_func, 
 30 |              optimizer_func=optimizer_func, 
 31 |              double_dqn=double_dqn, 
 32 |         )
 33 |         self.replay = None
 34 |         self.comm = comm
 35 |         if double_dqn:
 36 |             self._non_final_targeted_q_values = self._double_dqn_q_values
 37 |         else:
 38 |             self._non_final_targeted_q_values = self._natural_dqn_q_values
 39 |         
 40 |         # distributed env
 41 |         self.rank, self.world_size = dist.get_rank(), dist.get_world_size()
 42 |         self.master_rank = get_master_rank()
 43 |         self.num_clients = self.world_size - 1
 44 |         
 45 |         self._gradient_counter = 0
 46 |         self._gradient_timer = 0
 47 |         self.param_vector = parameters_to_vector(self.q_net.parameters()).zero_().detach()
 48 |         self.param_len = len(self.param_vector)
 49 |         
 50 |         
 51 |     def gradient_counter(self, pattern=None, num=1):
 52 |         assert type(num) == int and num >= 0
 53 |         if pattern == 'add':
 54 |             self._gradient_counter += num
 55 |         elif pattern == 'set':
 56 |             self._gradient_counter = num           
 57 |         return self._gradient_counter 
 58 | 
 59 |     
 60 |     def gradient_timer(self, pattern=None, num=1):
 61 |         assert type(num) == int and num >= 0
 62 |         if pattern == 'add':
 63 |             self._gradient_timer += num
 64 |         elif pattern == 'set':
 65 |             self._gradient_timer = num           
 66 |         return self._gradient_timer 
 67 |     
 68 |     
 69 |     def reset(self):
 70 |         super().reset()
 71 |         self.gradient_counter('set', 0)
 72 |         self.gradient_timer('set', 0)
 73 |         
 74 |         
 75 |         
 76 | class GorilaDQN_ServerAgent(_GorilaDQN_BaseAgent):
 77 |     """
 78 |     Args:
 79 |         comm: device used for communication
 80 |         shard_factor:
 81 |             default: the whole q network is devided into num_clients parts and each client is 
 82 |                 responsible for the gradient of its own part
 83 |             a float number e.g. 0.75: specifies the percentage of each part w.r.t. to the full 
 84 |                 gradient size, if it is smaller than default, using default size, if 
 85 |                 
 86 |             all: each client gives whole gradient
 87 |     """
 88 |     def __init__(
 89 |         self, 
 90 |         device, 
 91 |         q_net, 
 92 |         optimizer_func=None, 
 93 |         comm='cpu',
 94 |         shard_factor='default'
 95 |         ):
 96 |         super(GorilaDQN_ServerAgent, self).__init__(
 97 |             device, 
 98 |             q_net, 
 99 |             optimizer_func=optimizer_func, 
100 |             comm=comm, 
101 |         )
102 |         
103 |         if self.rank != self.master_rank: raise RuntimeError('server agent not running on server')
104 |         # initialize gradient list (ortherwise, grads are all None)
105 |         autograd.backward(self.q_net.parameters(), self.q_net.parameters())
106 |         self.grad_list = [item.grad for item in self.q_net.parameters()]
107 |         self.q_net.zero_grad()
108 |         
109 |         self.save = lambda: None
110 |         
111 |         # get rank to shard mapping
112 |         non_master_rank = list(range(self.rank)) + list(range(self.rank, self.world_size))
113 |         
114 |         self.rank_to_shard = [None] * self.world_size
115 |         for rank in non_master_rank:
116 |             if rank < self.master_rank: 
117 |                 self.rank_to_shard[rank] = rank
118 |             else: 
119 |                 self.rank_to_shard[rank] = rank - 1
120 |         self.rank_to_shard[self.rank] = -1
121 |         
122 |         # setup random shards
123 |         self.param_perm = torch.randperm(self.param_len).to(self.comm)    
124 |         self.shard_mask = [None] * self.world_size
125 |         self.shard_len = [None] * self.world_size
126 |         self.master_shard = torch.ones(self.param_len, dtype=torch.uint8, device=self.comm)
127 |         self.shard_mask[self.rank] = self.master_shard
128 |         self.shard_len[self.rank] = self.param_len
129 |         # preprocess shard_factor
130 |         _excess_nums = 0
131 |         if isinstance(shard_factor, (int, float)):
132 |             _default_factor = self.num_clients / self.param_len
133 |             if shard_factor >= 1: shard_factor = 'all'
134 |             elif shard_factor <= _default_factor: shard_factor = 'default'
135 |             else: 
136 |                 _excess_nums = int((shard_factor - _default_factor + .5) * self.param_len)
137 |                 shard_factor = 'specified'
138 |                 
139 |         for rank in non_master_rank:
140 |             if shard_factor == 'all':
141 |                 self.shard_mask[rank] = torch.ones(self.param_len, dtype=torch.uint8, device=self.comm)
142 |                 self.shard_len[rank] = self.param_len
143 |             else:
144 |                 assert shard_factor in {'default', 'specified'}
145 |                 self.shard_mask[rank] = self.param_perm % self.num_clients == self.rank_to_shard[rank]
146 |                 if _excess_nums >= 1: 
147 |                     curr_mask, curr_len = self.shard_mask[rank], len(self.shard_mask[rank])
148 |                     curr_mask[torch.randperm(curr_len)[1 - curr_mask][:_excess_nums]] = 1
149 |                 self.shard_len[rank] = self.shard_mask[rank].sum().item()
150 |                 
151 |         # send to clients
152 |         dist.scatter(self.master_shard, scatter_list=self.shard_mask, src=self.master_rank)
153 |         
154 |         for rank in non_master_rank:
155 |             self.shard_mask[rank] = self.shard_mask[rank].to(device)
156 |     
157 |     
158 |     @Setting
159 |     def set_optimize_scheme(self, lr=.0001, optimize_freq=1):
160 |         self.lr = lr
161 |         assert optimize_freq >= 1
162 |         self.optimize_freq = optimize_freq
163 |         self.optimizer = self._get_optimizer(
164 |             self.q_net.parameters(), 
165 |             lr=self.lr
166 |         )
167 |         self.zero_grad_()
168 |         
169 |         
170 |     @Setting
171 |     def set_checkpoint(self, save_freq, save_path):
172 |         self.save_freq = save_freq
173 |         self.save_path = save_path
174 |         def _save():
175 |             if self.optimize_counter() % self.save_freq == 0:
176 |                 self.save_pth(self.q_net, self.save_path,
177 |                     filename='q_net.pth', obj_name='q_network')
178 |         self.save = _save
179 |     
180 |     
181 |     def optimize(self, rank, grad_shard):
182 |         self.optimize_timer('add')
183 |         self.param_vector[self.shard_mask[rank]] = grad_shard
184 |         if self.optimize_timer() % self.optimize_freq != 0: return
185 |         vector_to_parameters(self.param_vector, self.grad_list)
186 |         self.optimize_counter('add')
187 |         self.optimizer.step()
188 |         self.zero_grad_()
189 |         self.save()
190 | 
191 |         
192 |     def zero_grad_(self):
193 |         self.param_vector = self.param_vector.zero_()
194 |         self.optimizer.zero_grad()
195 |     
196 |     
197 |                     
class GorilaDQN_ClientAgent(_GorilaDQN_BaseAgent):
    """
    Client side of the Gorila DQN agent: it computes the q network loss on replayed 
    experience and accumulates the part of the gradient selected by the shard mask received 
    from the server.

    Args:
        device: device used for the computation
        q_net, target_net, loss_func, double_dqn: forwarded to the base agent
        momentum: weight of the newest batch in the running mean/std of the loss
        several: how many running stds above the running mean a sample loss may be before it 
            is left out of the batch loss
        comm: device used for communication
    """
    def __init__(
        self, 
        device, 
        q_net,
        target_net, 
        loss_func, 
        double_dqn=True, 
        momentum=0.1, 
        several=6, 
        comm='cpu',
    ):
        super(GorilaDQN_ClientAgent, self).__init__(
            device, 
            q_net, 
            target_net, 
            loss_func=loss_func, 
            double_dqn=double_dqn, 
            comm=comm, 
        )
        if self.rank == self.master_rank: raise RuntimeError('client agent not running on client')
            
        self.gradient = None
        self.loss_running_mean = 0.
        self.loss_running_std = 1.
        self.momentum = momentum
        self.several = several
        # receive this client's shard mask from the server
        self.shard_mask = torch.zeros(self.param_len, dtype=torch.uint8, device=self.comm)
        dist.scatter(self.shard_mask, [], src=self.master_rank)
        self.shard_mask = self.shard_mask.to(device)
        self.shard_len = self.shard_mask.sum().item()
        
    
    def reset(self):
        """Reset the parent agent and the running loss statistics."""
        super().reset()
        self.loss_running_mean = 0.
        self.loss_running_std = 1.
    
    
    @Setting
    def set_gradient_scheme(
            self,
            gamma=.99, 
            gradient_freq=1, 
        ):
        """
        Args:
            gamma: factor applied to the target q values when forming the expected q values
            gradient_freq: a gradient is computed every gradient_freq calls of backward()
        """
        # set attributes
        self.batch_size = self.replay.batch_size
        self.gamma = gamma
        assert gradient_freq >= 1
        self.gradient_freq = gradient_freq
        
    
    def _update_loss_running_mean(self, loss):
        """Blend the mean of `loss` into the running mean and return the new running mean."""
        new_mean = loss.mean().detach()
        self.loss_running_mean = \
            (1 - self.momentum) * self.loss_running_mean + self.momentum * new_mean
        return self.loss_running_mean
    
    
    def _update_loss_running_std(self, loss):
        """Blend the std of `loss` into the running std and return the new running std."""
        new_std = loss.std().detach()
        self.loss_running_std = (1 - self.momentum) * self.loss_running_std + self.momentum * new_std
        return self.loss_running_std
    
    
    def _loss_upper_bound(self):
        """Largest per-sample loss that is still kept in the batch loss."""
        return self.loss_running_mean + self.several * self.loss_running_std
    
    
    def update_target(self):
        """Copy the q network weights into the target network."""
        self.target_net.load_state_dict(self.q_net.state_dict())
        
        
    def backward(self):
        """
        Every gradient_freq calls: sample a batch from the replay, compute the q network 
        loss and add this client's shard of its gradient to the accumulated gradient.

        Returns:
            the accumulated gradient shard, or None when this call is skipped
        """
        self.gradient_timer('add')
        if self.gradient_timer() % self.gradient_freq != 0: return
        self.gradient_counter('add')
        sample_exp = self.replay.sample()
        batch = self.replay.form_obj(*zip(*sample_exp))
        curr_states = torch.cat(batch.curr_state).to(self.device)
        actions = torch.tensor(batch.action).to(self.device).view(-1, 1)
        rewards = torch.tensor(batch.reward).to(self.device)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), 
            device=self.device, dtype=torch.uint8)
        non_final_states = [s for s in batch.next_state if s is not None]
        predicted_q_values = self.q_net(curr_states).gather(1, actions)
        targeted_q_values = torch.zeros(rewards.shape[0], device=self.device)
        # compute Q values via stationary target network; when all next states are None 
        # there is nothing to concatenate and the targets simply stay zero
        if non_final_states:
            non_final_next = torch.cat(non_final_states).to(self.device)
            targeted_q_values[non_final_mask] = self._non_final_targeted_q_values(non_final_next)
        else:
            print('encountered a case where all next states are None', flush=True)
        # compute the expected Q values
        expected_q_values = (targeted_q_values * self.gamma) + rewards
        # compute loss, leaving out the samples whose loss exceeds the running upper bound
        q_net_loss = self.loss(predicted_q_values, expected_q_values.unsqueeze(1), reduction='none')
        self._update_loss_running_mean(q_net_loss)
        self._update_loss_running_std(q_net_loss)
        q_net_loss = q_net_loss[q_net_loss <= self._loss_upper_bound()].mean()
        # optimize the model
        grad = autograd.grad(q_net_loss, self.q_net.parameters())
        delta_grad = parameters_to_vector(grad)[self.shard_mask]
        assert self.gradient is not None, 'should call zero_grad_() before backward'
        self.gradient.add_(delta_grad)
        self._record(rewards, q_net_loss, predicted_q_values, expected_q_values, self.gradient_counter)
        return self.gradient
    
    
    def zero_grad_(self):
        """Zero the network grads and start a fresh accumulated gradient shard."""
        self.q_net.zero_grad()
        self.gradient = self.param_vector[self.shard_mask].clone()
310 |         
311 |         
312 |         


--------------------------------------------------------------------------------
/pytorl/distributed/__init__.py:
--------------------------------------------------------------------------------
1 | from .async_ops import *
2 | from .initialize import *
3 | from .param_server import *
4 | from .sync_ops import *


--------------------------------------------------------------------------------
/pytorl/distributed/_slurm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | def get_world_size():
 5 |     return int(os.environ['SLURM_NTASKS'])
 6 | 
 7 | 
 8 | def get_rank():
 9 |     return int(os.environ['SLURM_PROCID'])
10 | 
11 | 
def get_jobid():
    """Return the integer value of the SLURM_JOBID environment variable."""
    jobid = os.environ['SLURM_JOBID']
    return int(jobid)
14 | 
15 | 
def get_backend():
    """Return the DISTRIBUTED_BACKEND environment variable, or None when it is not set."""
    return os.environ.get('DISTRIBUTED_BACKEND')
18 | 
19 | 
def get_nodelist():
    """Return the raw SLURM_NODELIST environment variable."""
    nodelist = os.environ['SLURM_NODELIST']
    return nodelist
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------
/pytorl/distributed/async_ops.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.distributed as dist
 3 | from torch.nn.utils import parameters_to_vector, vector_to_parameters
 4 | 
 5 | 
 6 | def isend_list(tensor_list, dst, group=None, tag=0):
 7 |     if group is None: 
 8 |         if dist.get_world_size() == 1: return
 9 |         _isend = lambda tensor: dist.send(tensor, dst, tag=tag)
10 |     else:
11 |         if dist.get_world_size(group) == 1: return
12 |         _isend = lambda tensor: dist.isend(tensor, dst, group, tag)
13 |     tensor = parameters_to_vector(tensor_list)
14 |     if dist.get_backend() == 'gloo': tensor = tensor.cpu()
15 |     return _isend(tensor)
16 |     
17 |     
18 |     


--------------------------------------------------------------------------------
/pytorl/distributed/initialize.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torch.distributed as dist
  4 | from ._slurm import *
  5 | 
  6 | 
  7 | """
  8 | Since this part is very environment-sensitive, you may wanna modify this module by yourself to fit your
  9 | specific distributed setup
 10 | """
 11 | 
 12 | 
 13 | def _get_slurm_addr():
 14 |     """
 15 |     this is related to my current slurm env, you can ignore it
 16 |     """
 17 |     node_list = get_nodelist()
 18 |     if '[' in node_list:
 19 |         beg = node_list.find('[')
 20 |         pos1 = node_list.find('-', beg)
 21 |         if pos1 < 0: pos1 = 1000
 22 |         pos2 = node_list.find(',', beg)
 23 |         if pos2 < 0: pos2 = 1000
 24 |         node_list = node_list[:min(pos1, pos2)].replace('[', '')
 25 |     addr = node_list[8:].replace('-', '.')
 26 |     
 27 |     return addr
 28 | 
 29 | 
 30 | 
 31 | def slurm_data_parallel_arch(port=23032, backend='nccl'):
 32 |     os.environ['DISTRIBUTED_BACKEND'] = backend
 33 | 
 34 |     rank, world_size = get_rank(), get_world_size()
 35 |     num_gpus = torch.cuda.device_count()
 36 |     if num_gpus > 0: 
 37 |         gpu_id = rank % num_gpus
 38 |         torch.cuda.set_device(gpu_id)
 39 | 
 40 |     if world_size == 1:
 41 |         rank, world_size = 0, 1
 42 |     else:
 43 |         os.environ['MASTER_PORT'] = str(port)
 44 |         os.environ['MASTER_ADDR'] = _get_slurm_addr()
 45 |         os.environ['WORLD_SIZE'] = str(world_size)
 46 |         os.environ['RANK'] = str(rank)
 47 |         
 48 |         dist.init_process_group(backend=backend)
 49 | 
 50 |     return rank, world_size
 51 | 
 52 | 
 53 | def data_parallel_arch(port=23030, backend='nccl'):
 54 |     """
 55 |     a DIY way to start data parallel distributed arch
 56 |     """
 57 |     os.environ['DISTRIBUTED_BACKEND'] = backend
 58 |     
 59 |     checklist = {'MASTER_PORT', 'MASTER_ADDR', 'WORLD_SIZE', 'RANK'}
 60 |     for env_var in checklist:
 61 |         assert env_var in os.environ, 'error: %s not set yet' % env_var
 62 |     dist.init_process_group(backend=backend)
 63 |         
 64 |     rank, world_size = dist.get_rank(), dist.get_world_size()
 65 |     num_gpus = torch.cuda.device_count()
 66 |     if num_gpus > 0: 
 67 |         gpu_id = rank % num_gpus
 68 |         torch.cuda.set_device(gpu_id)
 69 | 
 70 |     return rank, world_size, master_rank, worker_list
 71 | 
 72 | 
 73 | def slurm_param_server_arch(port=23029, backend='gloo', master_rank=0):
 74 |     """
 75 |     this function inits the parameter server architecture distributed environment, which helps the 
 76 |     asynchronized training in the context of reinforcement learning
 77 |     
 78 |     master_rank serves as the parameter server (master process, and the others are
 79 |     training processes (slaves) with the gpu mapping worker[i] -> gpu[i] % (num of gpus)
 80 |     
 81 |     [!]NOTE: haven't tested under mpi or nccl backend
 82 |     """
 83 |     
 84 |     os.environ['DISTRIBUTED_BACKEND'] = backend
 85 |     rank, world_size = get_rank(), get_world_size()
 86 |     assert world_size > 1, 'parameter server arch requires multiple processes'
 87 |     assert master_rank < world_size, 'invalid master_rank: %s' % master_rank
 88 |     worker_list = list(range(world_size))
 89 |     del worker_list[master_rank]
 90 |     
 91 |     num_gpus = torch.cuda.device_count()
 92 |     if num_gpus > 0: 
 93 |         gpu_id = rank % num_gpus
 94 |         torch.cuda.set_device(gpu_id)
 95 | 
 96 |     os.environ['MASTER_PORT'] = str(port)
 97 |     os.environ['MASTER_ADDR'] = _get_slurm_addr()
 98 |     os.environ['WORLD_SIZE'] = str(world_size)
 99 |     os.environ['RANK'] = str(rank)
100 |     os.environ['MASTER_RANK'] = str(master_rank)
101 |         
102 |     dist.init_process_group(backend=backend)
103 | 
104 |     return rank, world_size, master_rank, worker_list
105 | 
106 | 
107 | 
def param_server_arch(port=23028, backend='gloo', master_rank=0):
    """
    a DIY way to start parameter server distributed arch

    MASTER_PORT, MASTER_ADDR, WORLD_SIZE, RANK and MASTER_RANK must already be set in the 
    environment; `port` is not read by this function.

    Args:
        port: unused here
        backend: torch.distributed backend, also exported as DISTRIBUTED_BACKEND
        master_rank: rank acting as the parameter server
            NOTE(review): the MASTER_RANK environment variable is required but the returned 
            value comes from this argument -- callers presumably have to keep both in sync
    Returns:
        (rank, world_size, master_rank, worker_list), worker_list being every rank except 
        master_rank
    """
    os.environ['DISTRIBUTED_BACKEND'] = backend

    checklist = ('MASTER_PORT', 'MASTER_ADDR', 'WORLD_SIZE', 'RANK', 'MASTER_RANK')
    for env_var in checklist:
        assert env_var in os.environ, 'error: %s not set yet' % env_var
        
    dist.init_process_group(backend=backend)
    
    # ask torch.distributed, not the slurm helpers: this entry point must work without slurm
    rank, world_size = dist.get_rank(), dist.get_world_size()
    assert world_size > 1, 'parameter server arch requires multiple processes'
    assert master_rank < world_size, 'invalid master_rank: %s' % master_rank
    worker_list = list(range(world_size))
    del worker_list[master_rank]
    
    num_gpus = torch.cuda.device_count()
    if num_gpus > 0: 
        # processes are mapped onto the visible gpus round-robin by rank
        gpu_id = rank % num_gpus
        torch.cuda.set_device(gpu_id)

    return rank, world_size, master_rank, worker_list
132 | 
133 | 
134 | 


--------------------------------------------------------------------------------
/pytorl/distributed/param_server.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import threading
  3 | from threading import Thread
  4 | import torch
  5 | import torch.distributed as dist
  6 | from torch.nn.utils import parameters_to_vector, vector_to_parameters
  7 | from pytorl.utils import Setting
  8 | 
  9 | 
 10 | def get_master_rank():
 11 |     return int(os.environ['MASTER_RANK'])
 12 |  
 13 |     
 14 | class SIG:
 15 |     GRAD_PUSH = 0
 16 |     PARAM_REQ = 1
 17 |     QUERY = 2
 18 |     GRAD = 3
 19 |     PARAM = 4
 20 |     
 21 | 
 22 |     
 23 | class _Messenger(Thread):
 24 |     def __init__(self, device):
 25 |         super(_Messenger, self).__init__()
 26 |         rank, world_size = dist.get_rank(), dist.get_world_size()
 27 |         self.rank = rank
 28 |         self.world_size = world_size
 29 |         self.device = device
 30 | 
 31 |     
 32 |     def isend(self, overhead, payload=None, dst=0, tag=0, comm='cpu'):
 33 |         _overhead_msg, _payload_msg = overhead, payload
 34 |         if not isinstance(_overhead_msg, torch.Tensor): 
 35 |             if hasattr(_overhead_msg, '__iter__'):
 36 |                 _overhead_msg = torch.tensor(overhead, dtype=torch.float32)
 37 |             else:
 38 |                 _overhead_msg = torch.tensor([overhead], dtype=torch.float32)
 39 |         msg = _overhead_msg.to(comm)
 40 |         if _payload_msg is not None:
 41 |             if not isinstance(_payload_msg, torch.Tensor):
 42 |                 try:
 43 |                     _payload_msg = parameters_to_vector(payload).detach()
 44 |                 except:
 45 |                     raise TypeError('unrecognized payload type, not a vector tensor nor an iterator')
 46 |             _payload_msg = _payload_msg.to(comm)
 47 |             msg = torch.cat((msg, _payload_msg))
 48 |         return dist.isend(msg, dst, tag=tag)
 49 |         
 50 | 
 51 |     def recv(self, overhead_len, payload_len=0, src=None, tag=0, comm='cpu'):
 52 |         msg = torch.zeros(overhead_len, dtype=torch.float32, device=comm)
 53 |         if payload_len > 0: 
 54 |             _payload_msg = torch.zeros(payload_len, dtype=torch.float32, device=comm)
 55 |             msg = torch.cat((msg, _payload_msg))
 56 |         dist.recv(msg, src, tag=tag)
 57 |         if payload_len > 0:
 58 |             _overhead_msg, _payload_msg = msg[:overhead_len], msg[overhead_len:].to(self.device)
 59 |             _overhead_msg = list(map(int, _overhead_msg))
 60 |             return _overhead_msg, _payload_msg
 61 |         
 62 |         return list(map(int, msg))
 63 |     
 64 |     
 65 |     def run(self):
 66 |         raise NotImplementedError('cannot run base _Messenger class')
 67 |     
 68 |     
 69 | 
 70 | class ParamServer(_Messenger):
 71 |     def __init__(self, device, thread, lock):
 72 |         super(ParamServer, self).__init__(device)
 73 |         self.master_rank = get_master_rank()
 74 |         self.thread = thread
 75 |         self.lock = lock
 76 |     
 77 |     
 78 |     @Setting.only_once
 79 |     def set_listen(self, recv_info_len, global_timesteps_counter):
 80 |         """msg format: [rank, shard_len, local_timesteps, signal]"""
 81 |         self.recv_info_len = recv_info_len
 82 |         self.counter = global_timesteps_counter
 83 |     
 84 |     
 85 |     @Setting.only_once
 86 |     def set_param_update(self, model, optim_handler):
 87 |         self.model = model
 88 |         self.optim_handler = optim_handler
 89 |         self.param_vector = parameters_to_vector(model.parameters()).detach()
 90 |     
 91 |     
 92 |     def _recv_info(self):
 93 |         return self.recv(self.recv_info_len, tag=SIG.QUERY)
 94 |     
 95 |     
 96 |     def _recv_shard(self, src, shard_len):
 97 |         return self.recv(self.recv_info_len, shard_len, src=src, tag=SIG.GRAD)
 98 |     
 99 |     
100 |     def _isend_param(self, dst):
101 |         return self.isend(
102 |             [self.counter(), self.thread], self.model.parameters(), dst=dst, tag=SIG.PARAM)
103 |     
104 |     
105 |     def listen(self):
106 |         sender, shard_len, local_time, signal = self._recv_info()
107 |         if signal == SIG.GRAD_PUSH: 
108 |             _, grad_shard = self._recv_shard(sender, shard_len)
109 |             with self.lock: self.optim_handler(sender, grad_shard)
110 |             return
111 |         self._isend_param(sender).wait()
112 |     
113 |     
114 |     def run(self):
115 |         while True:
116 |             self.listen()
117 |   
118 |     
119 |     
class ParamClient(_Messenger):
    """
    Client end of the parameter server protocol: requests parameters from, and pushes 
    gradient shards to, the master rank. It is used through its methods and can not be run 
    as a thread.

    Args:
        device: device received payloads are moved to
    """
    def __init__(self, device):
        super().__init__(device)
        self.master_rank = get_master_rank()
        # last info message sent to the server
        self.overhead = None
    
    
    @Setting.only_once
    def set_recv(self, recv_info_len):
        """msg format: [global_timesteps, server_thread_num]"""
        self.recv_info_len = recv_info_len
    
    
    @Setting.only_once
    def set_info(self, shard_idx, local_timesteps_counter):
        """Register the shard value and the local timestep counter sent in every info message."""
        self.shard_idx = shard_idx
        self.counter = local_timesteps_counter
    
    
    @Setting.only_once
    def set_param_update(self, model):
        """Register the model whose parameter count sizes the messages received from the server."""
        self.model = model
        self.param_vector = parameters_to_vector(model.parameters()).detach()
        self.model_len = len(self.param_vector)
    
    
    def _isend_info(self, signal):
        """Send [rank, shard_idx, local counter, signal] to the server and remember it."""
        info = [self.rank, self.shard_idx, self.counter(), signal]
        self.overhead = info
        return self.isend(info, dst=self.master_rank, tag=SIG.QUERY)
    
    
    def recv_param(self):
        """Ask the server for the parameters and block until they are received."""
        request = self._isend_info(SIG.PARAM_REQ)
        request.wait()
        return self.recv(self.recv_info_len, self.model_len, src=self.master_rank, tag=SIG.PARAM)
    
    
    def isend_shard(self, shard_data):
        """Announce a gradient push, then send the shard; both sends are waited for."""
        self._isend_info(SIG.GRAD_PUSH).wait()
        request = self.isend(self.overhead, shard_data, dst=self.master_rank, tag=SIG.GRAD)
        request.wait()
    
    
    def run(self):
        """Not runnable: always raises RuntimeError."""
        raise RuntimeError('cannot run parameter client')
163 |     
164 | 
165 |     


--------------------------------------------------------------------------------
/pytorl/distributed/sync_ops.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.distributed as dist
 3 | 
 4 | 
 5 | def all_reduce_mean(tensor_list, group=None):
 6 |     handler_list = []
 7 |     if group is None:
 8 |         _allreduce = lambda tensor: dist.all_reduce(tensor, dist.ReduceOp.SUM)
 9 |     else:
10 |         _allreduce = lambda tensor: dist.all_reduce(tensor, dist.ReduceOp.SUM, group)
11 |     if isinstance(tensor_list, torch.Tensor):
12 |         raise TypeError('tensor_list should be list of tensors')
13 |     if dist.get_world_size() == 1: return
14 |     for tensor in tensor_list:
15 |         _allreduce(tensor)
16 |         handler_list.append(handler)
17 |         tensor.div_(dist.get_world_size())
18 | 
19 | 
def all_reduce_sum(tensor_list, group=None):
    """
    Replace, in place, every tensor of tensor_list by its sum over all processes (of the 
    group, if given). Nothing is done when the world size is 1.

    Raises:
        TypeError: if a single tensor is passed instead of a list of tensors
    """
    if isinstance(tensor_list, torch.Tensor):
        raise TypeError('tensor_list should be list of tensors')
    if dist.get_world_size() == 1:
        return
    group_args = () if group is None else (group,)
    for tensor in tensor_list:
        dist.all_reduce(tensor, dist.ReduceOp.SUM, *group_args)
31 | 
32 | 
def all_reduce_max(tensor_list, group=None):
    """
    Replace, in place, every tensor of tensor_list by its element-wise maximum over all 
    processes (of the group, if given). Nothing is done when the world size is 1.

    Raises:
        TypeError: if a single tensor is passed instead of a list of tensors
    """
    if isinstance(tensor_list, torch.Tensor):
        raise TypeError('tensor_list should be list of tensors')
    if dist.get_world_size() == 1:
        return
    group_args = () if group is None else (group,)
    for tensor in tensor_list:
        dist.all_reduce(tensor, dist.ReduceOp.MAX, *group_args)
44 | 
45 | 
def all_reduce_min(tensor_list, group=None):
    """
    Replace, in place, every tensor of tensor_list by its element-wise minimum over all 
    processes (of the group, if given). Nothing is done when the world size is 1.

    The minimum is obtained as the negated maximum of the negated tensors.

    Raises:
        TypeError: if a single tensor is passed instead of a list of tensors
    """
    if isinstance(tensor_list, torch.Tensor):
        raise TypeError('tensor_list should be list of tensors')
    if dist.get_world_size() == 1:
        return
    group_args = () if group is None else (group,)
    for tensor in tensor_list:
        # min(x) == -max(-x)
        tensor.neg_()
        dist.all_reduce(tensor, dist.ReduceOp.MAX, *group_args)
        tensor.neg_()
59 | 
60 | 
def broadcast(tensor_list, src, group=None):
    """
    Broadcast every tensor of tensor_list from rank `src` to all processes (of the group, if 
    given). Nothing is done when the world size is 1.
    """
    if dist.get_world_size() == 1:
        return
    group_args = () if group is None else (group,)
    for tensor in tensor_list:
        dist.broadcast(tensor, src, *group_args)
69 | 
70 | 
def barrier(group=None):
    """
    Synchronize all processes (of the group, if given). Nothing is done when the world 
    size is 1.
    """
    if dist.get_world_size() == 1:
        return
    if group is not None:
        dist.barrier(group)
    else:
        dist.barrier()
78 | 
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/pytorl/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from .ale_atari import make_atari_env
2 | from .gym_ctrl import make_ctrl_env


--------------------------------------------------------------------------------
/pytorl/envs/_base_env.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | """
  4 | 
  5 | this module provides a base class/metaclass for creating rl environments. using this base helps 
  6 | make your environment compatible with the existing implementations of rl algorithms, since it 
  7 | regulates and implements some interfaces and standard functionalities, which are:
  8 |     
  9 |     I. public interfaces:
 10 |         1) a reset method that supports init the first frame
 11 |         2) a sample method that samples a random action
 12 |         3) a step method that takes an action and returns observation, 
 13 |             reward, done, info
 14 |         4) an action space getter
 15 |         5) a current preprocessed state getter
 16 |         
 17 |     II. common functions and statistics:
 18 |         1) a global frame counter and resetter method
 19 |         2) a current episodic frames counter and resetter method 
 20 |         3) a past episodes counter and resetter (i.e. resets counter)
 21 |         4) a frame stack setter that specifies how many frames that are stacked as an 
 22 |             stacked observation
 23 |         5) a frames-per-action setter that specifies for how many frames the current action 
 24 |             should be repeated
 25 |         7) an episodic initialization mode setter
 26 |         8) a tensorboard setter
 27 |         9) an episodic and per-action reward resetter and getter
 28 |        10) a global reinit method
 29 | 
 30 | if you create your own environment from scratch, or you import an env from other sources and that 
 31 | env does not require a base class, you must use the base class Env as your base class; your code 
 32 | should be something like this:
 33 | 
 34 |     class MyEnv(Env):
 35 |         def __init__(self, *args, **kwargs):
 36 |             blahblahblah...
 37 |         other stuff...
 38 | 
 39 | if you retrieve your learning environment from other sources and that env requires a base class 
 40 | other than Env (e.g. OpenAI gym environments usually require a wrapper as their base class), 
 41 | then you must use the base metaclass MetaEnv as your metaclass and make your code look 
 42 | like this example:
 43 | 
 44 |     class AtariWrapper(gym.Wrapper, metaclass=MetaEnv):
 45 |         def __init__(self, env):
 46 |             blahblahblah...
 47 |         other stuff...
 48 |         
 49 | """
 50 | 
 51 | 
 52 | class Env(object):
 53 |     """
 54 |     base class for rl environment
 55 |     """
 56 |     def __init__(self):
 57 |         self._action_reward = None
 58 |         self._episodic_frames = 0
 59 |         self._episodic_reward = 0
 60 |         self._frames_action = 1
 61 |         self._frames_stack = 1
 62 |         self._global_episodes = 0
 63 |         self._global_frames = 0
 64 |         self._tensorboard = None
 65 |     
 66 |     def state(self):
 67 |         raise NotImplementedError()
 68 |     
 69 |     def num_actions(self):
 70 |         raise NotImplementedError()
 71 |         
 72 |     def sample(self):
 73 |         raise NotImplementedError()
 74 |         
 75 |     def reset(self):
 76 |         raise NotImplementedError()
 77 |     
 78 |     def step(self, action):
 79 |         raise NotImplementedError()
 80 |         
 81 |         
 82 |     def action_reward(self, pattern=None, num=None):
 83 |         """
 84 |         [!]WARNING: should check the legitimacy of num by yourself
 85 |         """
 86 |         assert pattern in {None, 'set'}
 87 |         if pattern == 'set':
 88 |             self._action_reward = num
 89 |         return self._action_reward
 90 |     
 91 |     
 92 |     def episodic_frames(self, pattern=None, num=1):
 93 |         assert type(num) == int and num >= 0
 94 |         if pattern == 'add':
 95 |             self._episodic_frames += num
 96 |         elif pattern == 'set':
 97 |             self._episodic_frames = num
 98 |         return self._episodic_frames
 99 |     
100 |     
101 |     def episodic_reward(self, pattern=None, num=None):
102 |         """
103 |         [!]WARNING: should check the legitimacy of num by yourself
104 |         """
105 |         if pattern == 'add':
106 |             assert num is not None
107 |             self._episodic_reward += num
108 |         elif pattern == 'set':
109 |             self._episodic_reward = num
110 |         return self._episodic_reward
111 |     
112 |     
113 |     def frames_action(self, pattern=None, num=None):
114 |         assert pattern in {None, 'set'}
115 |         if pattern == 'set':
116 |             assert type(num) == int and num >= 1
117 |             self._frames_action = num
118 |         return self._frames_action
119 |     
120 |     
121 |     def frames_stack(self, pattern=None, num=None):
122 |         assert pattern in {None, 'set'}
123 |         if pattern == 'set':
124 |             assert type(num) == int and num >= 1
125 |             self._frames_stack = num
126 |         return self._frames_stack
127 |     
128 |     
129 |     def global_frames(self, pattern=None, num=1):
130 |         assert type(num) == int and num >= 0
131 |         if pattern == 'add':
132 |             self._global_frames += num
133 |         elif pattern == 'set':
134 |             self._global_frames = num           
135 |         return self._global_frames
136 |     
137 |     
138 |     def global_episodes(self, pattern=None, num=1):
139 |         assert type(num) == int and num >= 0
140 |         if pattern == 'add':
141 |             self._global_episodes += num
142 |         elif pattern == 'set':
143 |             self._global_episodes = num           
144 |         return self._global_episodes 
145 | 
146 |     
147 |     def set_tensorboard(self, obj=None):
148 |         """
149 |         [!]WARNING: should check the legitimacy of num by yourself
150 |         """
151 |         if obj is not None:
152 |             self._tensorboard = obj
153 |         return self._tensorboard
154 |         
155 |         
156 |     def reset_statistics(self, mode):
157 |         assert mode in {'all', 'episodic'}
158 |         self._action_reward = None
159 |         self._episodic_frames = 0
160 |         self._episodic_reward = 0
161 |         if mode == 'all':
162 |             self._global_episodes = 0
163 |             self._global_frames = 0
164 |             
165 | 
166 |         
167 | """
168 | these methods help the setup process of metaclass: MetaEnv
169 | """        
def _get_attrs_setter(target):
    """
    build a setter that defines attributes on `target` only where they are still missing

    Args:
        target: object (typically a class) that receives the attributes

    Returns:
        _attrs_setter(attrs, values):
            * attrs is a single attribute name (str): values is the value for that name
            * attrs is an iterable of names: values is an equally long iterable of values
        attributes that `target` already has are never overwritten
    """
    def _attrs_setter(attrs, values):
        # a str is iterable too, so the single-name case has to be recognized first; otherwise
        # the name would be split into characters (or len(values) would fail)
        if isinstance(attrs, str):
            names, vals = (attrs,), (values,)
        else:
            names, vals = tuple(attrs), tuple(values)
        assert len(names) == len(vals)
        for attr, val in zip(names, vals):
            assert type(attr) == str
            if not hasattr(target, attr):
                setattr(target, attr, val)
    return _attrs_setter
182 | 
183 | 
def _get_attr_setter(target):
    """
    build a setter that defines one attribute on `target` unless it already exists

    Args:
        target: object (typically a class) that receives the attribute

    Returns:
        _attr_setter(attr, value): sets target.<attr> = value only if target has no attribute
        of that name yet
    """
    def _attr_setter(attr, value):
        if hasattr(target, attr):
            # keep whatever the target already defines
            return
        setattr(target, attr, value)
    return _attr_setter
189 |         
190 |     
class MetaEnv(type):
    """
    base metaclass for third-party rl environments

    When a class is created with this metaclass, the class right after it in its MRO (expected
    to be bases[0], i.e. the third-party base class) is completed with everything a default
    Env() offers: its instance attributes and its public (non-underscore) attributes/methods.
    Names that the third-party base class already defines are left alone, so the base class
    **OVERRIDES** the defaults provided here.

    NOTE(review): the public callables are read from one Env() *instance*, so what gets
    installed are methods bound to that instance, and an existing name is never replaced.
    This looks like it makes every environment that shares the same direct base class share
    one set of statistics -- confirm whether that is intended.
    """
    def __new__(mcs, name, bases, fields):
        instance = super(MetaEnv, mcs).__new__(mcs, name, bases, fields)
        # [!]WARNING: should check if this condition(direct_base = bases[0]) holds
        direct_base = instance.__mro__[1]
        set_if_missing = _get_attr_setter(direct_base)
        # default environment providing the values and functionalities to install
        base_env = Env()
        # base value attributes
        for attr, val in vars(base_env).items():
            set_if_missing(attr, val)
        # base functionalities: every public name reachable from the default environment
        for attr in dir(base_env):
            if not attr.startswith('_'):
                set_if_missing(attr, getattr(base_env, attr))
        return instance
225 |     
226 |     
227 |     


--------------------------------------------------------------------------------
/pytorl/envs/ale_atari.py:
--------------------------------------------------------------------------------
  1 | import gym
  2 | import numpy as np
  3 | import torch
  4 | from collections import deque
  5 | from ._base_env import MetaEnv
  6 | 
  7 | 
  8 | class _AtariWrapper(gym.Wrapper, metaclass=MetaEnv):
  9 |     """
 10 |     this class is a wrapper of original ale atari in gym environment that helps:
 11 |         1) preprocessing the observation:
 12 |              i. find max value of each corresponding pixel between 2 consecutive frame
 13 |             ii. make observation state in the shape of C * H * W and converts it to torch tensors.
 14 |            iii. stack multiple frames for neural net input.
 15 |         2) directly sample an action
 16 |         3) initialize episode (and frames stack) with noop, random, fire ops
 17 |         4) single-life per episode mode
 18 |         5) misc methods and other methods required to be implemented by metaclass MetaEnv
 19 | 
 20 |     example:
 21 |         resize = T.Compose([T.ToPILImage(),
 22 |                         T.Grayscale(1),
 23 |                         T.Resize((84, 84)),
 24 |                         T.ToTensor()])
 25 |         env = make_atari('Breakout-v0', resize, render=True)
 26 |         env.set_episodic_init('FIRE')
 27 |         env.set_frames_stack(frame_stack)
 28 |         env.set_single_life(True)
 29 |         env.set_frames_action(4)
 30 | 
 31 |     another important criterion is that only convert an obj to tensor or make it downsampled iff.
 32 |     imminent necessay, otherwise, try not to make conversion which will confuse you latter
 33 | 
 34 |     """
 35 |     def __init__(self, env, tsfm, render):
 36 |         super(_AtariWrapper, self).__init__(env)
 37 |         self.tsfm = tsfm
 38 |         self.render_flag = render
 39 |         # frame initialization
 40 |         self.episodic_init_action = 'RANDOM'
 41 |         self.episodic_init_frames = 0
 42 |         # state and status
 43 |         self.prev_observ = None
 44 |         self.curr_observ = None
 45 |         self._new_observ_buffer = deque([], maxlen=1)
 46 |         self.curr_state = None
 47 |         # frame stack buffer
 48 |         self.buffer = deque([], maxlen=1)
 49 |         # single life mode
 50 |         self.single_life = False
 51 |         self.lives = 0
 52 | 
 53 | 
 54 |     def set_frames_action(self, num=1):
 55 |         self.frames_action('set', num)
 56 |         self._new_observ_buffer = deque([], maxlen=num)
 57 | 
 58 | 
 59 |     def set_frames_stack(self, num=1):
 60 |         self.frames_stack('set', num)
 61 |         self.buffer = deque([], maxlen=num)
 62 | 
 63 | 
 64 |     def set_episodic_init(self, op='RANDOM', frames=1):
 65 |         assert type(frames) == int and frames >= 1
 66 |         if op is None: op = 'RANDOM'
 67 |         self.episodic_init_action = op
 68 |         self.episodic_init_frames = frames
 69 | 
 70 | 
 71 |     def set_single_life(self, flag=True):
 72 |         self.single_life = flag
 73 | 
 74 | 
 75 |     def num_actions(self):
 76 |         return self.action_space.n
 77 | 
 78 | 
 79 |     def _feed_buffer(self):
 80 |         # deflickering previous and current observation
 81 |         if len(self._new_observ_buffer) == 0:
 82 |             max_pooled_observ = self.curr_observ
 83 |         else:
 84 |             max_pooled_observ = np.max(np.stack(self._new_observ_buffer), axis=0)
 85 |         deflickered_observ = np.maximum(self.prev_observ, max_pooled_observ)
 86 |         # let buffer save transformed 2-D frame
 87 |         encoded_frame = self.tsfm(deflickered_observ)
 88 |         self.buffer.append(encoded_frame)
 89 |         self.prev_observ = self.curr_observ
 90 | 
 91 | 
 92 |     def _preprocessing(self):
 93 |         assert len(self.buffer) == self.frames_stack() == self.buffer.maxlen
 94 |         observs_tensor = torch.cat(tuple(self.buffer))
 95 |         return observs_tensor.unsqueeze(0)
 96 | 
 97 | 
 98 |     def sample(self):
 99 |         return self.action_space.sample()
100 | 
101 |     def state(self):
102 |         self.curr_state = self._preprocessing()
103 |         return self.curr_state
104 | 
105 |     def _get_init_action(self, op):
106 |         assert op in {'RANDOM', 'NOOP', 'FIRE'}
107 |         if op == 'RANDOM':
108 |             wrapper = self.sample
109 |         elif op == 'NOOP':
110 |             assert self.unwrapped.get_action_meanings()[0] == 'NOOP'
111 |             wrapper = lambda: 0
112 |         elif op == 'FIRE':
113 |             assert self.unwrapped.get_action_meanings()[1] == 'FIRE'
114 |             wrapper = lambda: 1
115 |         return wrapper
116 | 
117 | 
118 |     def reset(self):
119 |         # reset episodic attributions
120 |         self.reset_statistics('episodic')
121 |         self.buffer.clear()
122 |         self._new_observ_buffer.clear()
123 |         self.prev_observ = self.env.reset()
124 |         init_frames = max(self.episodic_init_frames, self.buffer.maxlen)
125 |         assert init_frames > 0, 'minimum buffer length should be 1'
126 |         get_action = self._get_init_action(self.episodic_init_action)
127 |         for _ in range(init_frames):
128 |             self.curr_observ, reward, done, _ = self.env.step(get_action())
129 |             self._new_observ_buffer.append(self.curr_observ)
130 |             if done: print('EXCEPTION: done received during reset()', flush=True)
131 |             self._feed_buffer()
132 |         # statistics
133 |         self.global_frames('add', init_frames + 1)
134 |         self.global_episodes('add')
135 |         self.episodic_frames('add', init_frames + 1)
136 | 
137 | 
138 |     def refresh(self):
139 |         # state and status
140 |         self.prev_observ = None
141 |         self._new_observ_buffer = deque([], maxlen=self.frames_action())
142 |         self.curr_state = None
143 |         # frame stack buffer
144 |         self.buffer = deque([], maxlen=self.frames_stack())
145 |         self.reset_statistics('all')
146 | 
147 | 
148 |     def step(self, action):
149 |         if isinstance(action, torch.Tensor): action = action.item()
150 |         _action_reward = 0
151 |         for _ in range(self.frames_action()):
152 |             self.curr_observ, reward, done, info = self.env.step(action)
153 |             self._new_observ_buffer.append(self.curr_observ)
154 | #             self._feed_buffer()
155 |             _action_reward += reward
156 |             if self.render_flag: self.render()
157 |             self.global_frames('add')
158 |             self.episodic_frames('add')
159 |             if done: break
160 |         self._feed_buffer()
161 |         _action_reward = np.sign(_action_reward)
162 |         if self.single_life:
163 |             lives = self.env.unwrapped.ale.lives()
164 |             if lives < self.lives and lives > 0:
165 |                 done = True
166 |             self.lives = lives
167 | 
168 |         self.episodic_reward('add', _action_reward)
169 |         self.action_reward('set', _action_reward)
170 |         return self.curr_observ, self.action_reward(), done, info
171 | 
172 | 
173 | 
def make_atari_env(env_name, tsfm, render=False):
    """
    create the gym environment `env_name` and wrap it in _AtariWrapper

    Args:
        env_name: id handed to gym.make
        tsfm: transform applied to every deflickered frame
        render: whether the wrapper renders every frame while stepping
    """
    return _AtariWrapper(gym.make(env_name), tsfm, render)
178 | 
179 | 


--------------------------------------------------------------------------------
/pytorl/envs/gym_ctrl.py:
--------------------------------------------------------------------------------
  1 | import gym
  2 | import numpy as np
  3 | import torch
  4 | from collections import deque
  5 | from ._base_env import MetaEnv
  6 | 
  7 | 
  8 | class _Gym1DWrapper(gym.Wrapper, metaclass=MetaEnv):
  9 |     """
 10 |     this class is a wrapper of classic control problems in gym environment that helps:
 11 |         1) preprocessing the observation:
 12 |                i. stack multiple observations for neural net input.
 13 |         2) directly sample an action
 14 |         3) initialize episode (and frames stack) with random actions
 15 |         4) misc methods and other methods required to be implemented by metaclass MetaEnv
 16 | 
 17 |     example:
 18 |         env = make_gym('CartPole-v1', render=True)
 19 | 
 20 |     another important criterion is that only convert an obj to tensor or make it downsampled iff.
 21 |     imminent necessay, otherwise, try not to make conversion which will confuse you latter
 22 | 
 23 |     """
 24 |     def __init__(self, env, render):
 25 |         super(_Gym1DWrapper, self).__init__(env)
 26 |         self.render_flag = render
 27 |         # frame initialization
 28 |         self.prev_observ = None
 29 |         self.curr_observ = None
 30 |         self.curr_state = None
 31 |         # frame stack buffer
 32 |         self.buffer = deque([], maxlen=1)
 33 | 
 34 | 
 35 |     def set_frames_action(self, num=1):
 36 |         self.frames_action('set', num)
 37 | 
 38 | 
 39 |     def set_frames_stack(self, num=1):
 40 |         self.frames_stack('set', num)
 41 |         self.buffer = deque([], maxlen=num)
 42 | 
 43 | 
 44 |     def num_actions(self):
 45 |         return self.action_space.n
 46 | 
 47 | 
 48 |     def observ_shape(self):
 49 |         return self.observation_space.shape[0]
 50 | 
 51 | 
 52 |     def _feed_buffer(self):
 53 |         # let buffer save transformed 1-D frame
 54 |         self.buffer.append(self.curr_observ)
 55 |         self.prev_observ = self.curr_observ
 56 | 
 57 | 
 58 |     def _preprocessing(self):
 59 |         assert len(self.buffer) == self.frames_stack() == self.buffer.maxlen
 60 |         observs_tensor = torch.tensor(self.buffer, dtype=torch.float32)
 61 |         # .unsqueeze(0) here to make it ready for input
 62 |         return observs_tensor.unsqueeze(0)
 63 | 
 64 | 
 65 |     def sample(self):
 66 |         return self.action_space.sample()
 67 | 
 68 | 
 69 |     def state(self):
 70 |         self.curr_state = self._preprocessing()
 71 |         return self.curr_state
 72 | 
 73 | 
 74 |     def reset(self):
 75 |         # reset episodic attributions
 76 |         self.reset_statistics('episodic')
 77 |         self.buffer.clear()
 78 |         self.prev_observ = self.env.reset()
 79 |         init_frames = self.buffer.maxlen
 80 |         assert init_frames > 0, 'minimum buffer length should be 1'
 81 |         get_action = self.sample
 82 |         for _ in range(init_frames):
 83 |             self.curr_observ, reward, done, _ = self.env.step(get_action())
 84 |             if done: print('EXCEPTION: done received during reset()', flush=True)
 85 |             self._feed_buffer()
 86 |         # statistics
 87 |         self.global_frames('add', init_frames + 1)
 88 |         self.global_episodes('add')
 89 |         self.episodic_frames('add', init_frames + 1)
 90 | 
 91 | 
 92 |     def refresh(self):
 93 |         # state and status
 94 |         self.prev_observ = None
 95 |         self.curr_state = None
 96 |         # frame stack buffer
 97 |         self.buffer = deque([], maxlen=self.frames_stack())
 98 |         self.reset_statistics('all')
 99 | 
100 | 
101 |     def step(self, action):
102 |         if isinstance(action, torch.Tensor): action = action.item()
103 |         _action_reward = 0
104 |         for _ in range(self.frames_action()):
105 | #             self.prev_observ = self.curr_observ
106 |             self.curr_observ, reward, done, info = self.env.step(action)
107 |             _action_reward += reward
108 |             self._feed_buffer()
109 |             if self.render_flag: self.render()
110 |             self.global_frames('add')
111 |             self.episodic_frames('add')
112 |             if done: break
113 | #         self._feed_buffer()
114 |         _action_reward = np.sign(_action_reward)
115 | 
116 |         self.episodic_reward('add', _action_reward)
117 |         self.action_reward('set', _action_reward)
118 |         return self.curr_observ, self.action_reward(), done, info
119 | 
120 | 
121 | 
def make_ctrl_env(env_name, render=False):
    """
    create the gym environment `env_name` and wrap it in _Gym1DWrapper

    Args:
        env_name: id handed to gym.make
        render: whether the wrapper renders every step
    """
    return _Gym1DWrapper(gym.make(env_name), render)
126 | 
127 | 


--------------------------------------------------------------------------------
/pytorl/lib/__init__.py:
--------------------------------------------------------------------------------
1 | from .explore import beta_priority_func
2 | from .explore import eps_greedy_func
3 | from .replay import LazyReplay
4 | from .replay import PrioritizedReplay
5 | from .replay import VanillaReplay


--------------------------------------------------------------------------------
/pytorl/lib/_tree.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import operator
  3 | 
  4 | 
  5 | """
  6 | this module contains useful tree data structures
  7 | """
  8 | 
  9 | 
 10 | class _SegmentTree:
 11 |     def __init__(self, capacity, op, default_elem):
 12 |         assert capacity > 0 
 13 |         self.capacity = capacity
 14 | #         self._tree_len = 2 ** math.ceil(math.log(capacity, 2) + 1) - 1
 15 |         self._tree_len = 2 * capacity - 1
 16 |         self._op = op
 17 |         self._value = [default_elem for _ in range(self._tree_len)]
 18 |         
 19 |         self._idx_to_leaf = []
 20 |         self._leaf_to_idx = [None] * self.capacity
 21 |         self._init_mapping()
 22 |         # seedup indexing
 23 |         self._idx_to_leaf = tuple(self._idx_to_leaf)
 24 |         self._leaf_to_idx = tuple(self._leaf_to_idx)
 25 |         
 26 |     def __len__(self):
 27 |         return self.capacity
 28 |         
 29 |     def inorder(self, idx=0):
 30 |         if idx >= self._tree_len: return
 31 |         self.inorder(2 * idx + 1)
 32 |         print(idx, self._value[idx])
 33 |         self.inorder(2 * idx + 2)
 34 |     
 35 |     def _init_mapping(self, idx=0):
 36 |         if 2 * idx + 1 >= self._tree_len: 
 37 |             self._idx_to_leaf.append(idx)
 38 |             self._leaf_to_idx[idx - self.capacity + 1] = len(self._idx_to_leaf) - 1
 39 |             return
 40 |         self._init_mapping(2 * idx + 1)
 41 |         self._init_mapping(2 * idx + 2)
 42 |     
 43 |     def _traverse_reduce(self, start, end, curr_node, node_start, node_end):
 44 |         if start == node_start and end == node_end: return self._value[curr_node]
 45 |         if 2 * curr_node + 1 >= self._tree_len: return self._value[curr_node]
 46 |         mid = (node_start + node_end) // 2
 47 |         if end <= mid:
 48 |             return self._traverse_reduce(start, end, 2 * curr_node + 1, node_start, mid)
 49 |         elif mid + 1 <= start:
 50 |             return self._traverse_reduce(start, end, 2 * curr_node + 2, mid + 1, node_end)
 51 |         else:
 52 |             return self._op(
 53 |                     self._traverse_reduce(start, mid, 2 * curr_node + 1, node_start, mid),
 54 |                     self._traverse_reduce(mid + 1, end, 2 * curr_node + 2, mid + 1, node_end)
 55 |                     )
 56 |             
 57 |     def reduce(self, start=0, end=None):
 58 |         if end is None: end = self.capacity
 59 |         if end < 0: end += self.capacity
 60 |         if end > self.capacity: raise ValueError('reduction out of capacity')
 61 |         # note: 'end' in this reduce operation is exclusive
 62 |         end -= 1
 63 |         return self._traverse_reduce(start, end, 0, 0, self.capacity - 1)
 64 |     
 65 |     
 66 |     def __setitem__(self, idx, val):
 67 |         # index of the leaf
 68 |         assert 0 <= idx < self.capacity, '__setitem__ index out of range'
 69 |         # get mapping
 70 |         idx = self._idx_to_leaf[idx]
 71 |         self._value[idx] = val
 72 |         while idx > 0:
 73 |             # get parent node
 74 |             idx = (idx - 1) // 2
 75 |             self._value[idx] = self._op(self._value[2 * idx + 1], self._value[2 * idx + 2])
 76 |             
 77 | 
 78 |     def __getitem__(self, idx):
 79 |         assert 0 <= idx < self.capacity, '__getitem__ index out of range'
 80 |         return self._value[self._idx_to_leaf[idx]]
 81 |     
 82 | 
 83 | 
 84 | class SumSegmentTree(_SegmentTree):
 85 |     def __init__(self, capacity):
 86 |         super(SumSegmentTree, self).__init__(
 87 |             capacity=capacity,
 88 |             op=operator.add,
 89 |             default_elem=0
 90 |         )
 91 | 
 92 |     def sum(self, start=0, end=None):
 93 |         """Returns arr[start] + ... + arr[end]"""
 94 |         return super().reduce(start, end)
 95 | 
 96 |     def find_prefixsum_idx(self, prefixsum):
 97 |         """Find the highest index `i` in the array such that
 98 |             sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum
 99 |         if array values are probabilities, this function
100 |         allows to sample indexes according to the discrete
101 |         probability efficiently.
102 |         Parameters
103 |         ----------
104 |         perfixsum: float
105 |             upperbound on the sum of array prefix
106 |         Returns
107 |         -------
108 |         idx: int
109 |             highest index satisfying the prefixsum constraint
110 |         """
111 |         assert 0 <= prefixsum <= self.sum() + 1e-6, 'prefixsum out of range'
112 |         idx = 0
113 |         while idx < self._tree_len - self.capacity:  # while non-leaf
114 |             if self._value[2 * idx + 1] >= prefixsum:
115 |                 idx = 2 * idx + 1
116 |             else:
117 |                 prefixsum -= self._value[2 * idx + 1]
118 |                 idx = 2 * idx + 2
119 |         return self._leaf_to_idx[idx - self.capacity + 1]
120 | 
121 | 
122 |     
class MinSegmentTree(_SegmentTree):
    """segment tree whose nodes hold the minimum of the leaves below them (leaves start at
    +inf)"""
    def __init__(self, capacity):
        super(MinSegmentTree, self).__init__(
            capacity=capacity,
            op=min,
            default_elem=float('inf')
        )

    def min(self, start=0, end=None):
        """Returns min(arr[start], ...,  arr[end - 1]) (end defaults to the capacity)"""
        return super().reduce(start, end)


--------------------------------------------------------------------------------
/pytorl/lib/explore.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | 
 5 | def eps_greedy_func(eps_start=1, eps_end=0.02, num_decays=1, 
 6 |                     global_frames_func=lambda: 1):
 7 |     """
 8 |     framed and linear decay of greedy threshold
 9 |     
10 |     Args:
11 |         global_frames_func: should be a callable counter, someting like env.global_frames
12 |     """
13 |     assert num_decays >= 1 and type(num_decays) == int
14 |     assert eps_start >= eps_end
15 |     decay_rate = (eps_start - eps_end) / num_decays
16 |     
17 |     def _result():
18 |         if global_frames_func() >= num_decays: return eps_end
19 |         return eps_start - decay_rate * global_frames_func()
20 |     
21 |     return _result
22 | 
23 | 
def beta_priority_func(beta_start=0, beta_end=1, num_incres=1,
                       global_frames_func=lambda: 1):
    """
    build a schedule that linearly increases the effect of the importance weights in DQN's
    prioritized replay with the number of frames

    Args:
        beta_start: value at frame 0
        beta_end: value once num_incres frames are reached, must not be below beta_start
        num_incres: int >= 1, number of frames the increase is spread over
        global_frames_func: should be a callable counter, something like env.global_frames

    Returns:
        a zero-argument callable returning beta for the current frame count
    """
    assert num_incres >= 1 and type(num_incres) == int
    assert beta_end >= beta_start
    incre_rate = (beta_end - beta_start) / num_incres

    def _result():
        if global_frames_func() < num_incres:
            return beta_start + incre_rate * global_frames_func()
        # increase finished
        return beta_end

    return _result


--------------------------------------------------------------------------------
/pytorl/lib/replay.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | from collections import deque, namedtuple
  3 | import torch
  4 | from ._tree import SumSegmentTree, MinSegmentTree
  5 | 
  6 | 
  7 | class _ExpReplay:
  8 |     def __init__(self, capacity, batch_size, init_size):
  9 |         assert batch_size is not None and batch_size >= 1
 10 |         if type(init_size) != int or init_size < batch_size: init_size = batch_size
 11 |         self.capacity = capacity
 12 |         self.batch_size = batch_size
 13 |         self.init_size = init_size
 14 |         if self.capacity is None:
 15 |             print('MEMORY WARNING: '
 16 |                   'capacity of exp replay not specified (infinity)', flush=True)
 17 |         elif self.capacity <= 0:
 18 |             raise ValueError('invalid capacity of replay queue specified')
 19 |             
 20 |         
 21 |     def __len__(self):
 22 |         raise NotImplementedError
 23 |         
 24 |     def push(self, state):
 25 |         raise NotImplementedError
 26 |         
 27 |     def sample(self):
 28 |         raise NotImplementedError
 29 |         
 30 |     def clear(self):
 31 |         raise NotImplementedError
 32 | 
 33 |     
 34 | 
 35 | class VanillaReplay(_ExpReplay):
 36 |     def __init__(self, capacity=None, batch_size=32, init_size=None):
 37 |         super(VanillaReplay, self).__init__(capacity, batch_size, init_size)
 38 |         self.obj_type = namedtuple('Exp', ('curr_state', 'action', 'next_state', 'reward'))
 39 |         
 40 |         self.memory = deque(maxlen=self.capacity)
 41 |     
 42 |     def __len__(self):
 43 |         return len(self.memory)
 44 |     
 45 |     def clear(self):
 46 |         self.memory.clear()
 47 |         
 48 |     def push(self, *obj):
 49 |         processed_obj = self.form_obj(*obj)
 50 |         self.memory.append(processed_obj)
 51 |     
 52 |     def sample(self):
 53 |         return random.sample(self.memory, self.batch_size)
 54 |     
 55 |     def form_obj(self, *args):
 56 |         return self.obj_type(*args)
 57 |     
 58 |     
 59 |     
 60 | class LazyReplay(_ExpReplay):
 61 |     """
 62 |     this replay splits the stacked frames and makes sure each frame is only saved once, so it is
 63 |     meant to be memory-efficient. However, the sampling speed will be compromised since that
 64 |     process is kind of complicated
 65 |     """
 66 |     def __init__(self, capacity=None, batch_size=32, init_size=None, frames_stack=None):
 67 |         super(LazyReplay, self).__init__(capacity, batch_size, init_size)
 68 |         assert frames_stack is not None
 69 |         self.obj_type = namedtuple('Exp', ('curr_state', 'action', 'next_state', 'reward'))
 70 |         self.frames_stack = frames_stack
 71 |         self._valid_idx = deque([])
 72 |         self._valid_flag = [False] * self.capacity
 73 |         self._done = True
 74 |         self.memory = [None] * self.capacity
 75 |         self._idx = 0
 76 | 
 77 |     
 78 |     def clear(self):
 79 |         self.memory = [None] * self.capacity
 80 |         self._idx = 0
 81 |         self._valid_idx.clear()
 82 |     
 83 |     
 84 |     def __len__(self):
 85 |         return len(self._valid_idx)
 86 |     
 87 |     
 88 |     def push(self, *obj):
 89 |         """
 90 |         preprocess obj sequence and save it to the memory, note that preprocessing varies due to 
 91 |         various obj format
 92 |         """
 93 |         curr_state, action, next_state, reward = obj
 94 |         if self._done:
 95 |             initial_frames = curr_state.squeeze().split(1)
 96 |             for f in initial_frames:
 97 |                 if self.memory[self._idx] is not None and self._valid_flag[self._idx]:
 98 |                     self._valid_idx.popleft()
 99 |                     self._valid_flag[self._idx] = False
100 |                 self.memory[self._idx] = (f, None, None)
101 |                 self._idx = (self._idx + 1) % self.capacity
102 |             self._done = False
103 |         if next_state is None:
104 |             self._done = True
105 |             next_frame = None
106 |         else:
107 |             next_frame = next_state.squeeze().split(1)[-1]
108 |         if self.memory[self._idx] is not None and self._valid_flag[self._idx]:
109 |             self._valid_idx.popleft()
110 |             self._valid_flag[self._idx] = False
111 |         self.memory[self._idx] = (next_frame, action, reward)
112 |         self._idx = (self._idx + 1) % self.capacity
113 |         self._valid_flag[(self._idx - 5) % self.capacity] = True
114 |         self._valid_idx.append((self._idx - 5) % self.capacity)
115 |             
116 |                
117 |     def sample(self):
118 |         ret_list = []
119 |         frames_buffer = deque([], maxlen=self.frames_stack)
120 |         indices = random.sample(self._valid_idx, self.batch_size)
121 |         for idx in indices:
122 |             for shift in range(self.frames_stack):
123 |                 frames_buffer.append(self.memory[(idx + shift) % self.capacity][0])
124 |             curr_state = torch.cat(tuple(frames_buffer)).unsqueeze(0).clone()
125 |             next_frame, action, reward = self.memory[(idx + self.frames_stack) % self.capacity]
126 |             if next_frame is None:
127 |                 next_state = None
128 |             else:
129 |                 frames_buffer.append(next_frame)
130 |                 next_state = torch.cat(tuple(frames_buffer)).unsqueeze(0).clone()
131 |             ret_list.append((curr_state, action, next_state, reward))
132 |         return tuple(ret_list)
133 |             
134 |         
135 |     def form_obj(self, *args):
136 |         return self.obj_type(*args)        
137 |     
138 | 
139 | 
class PrioritizedReplay(_ExpReplay):
    """
    experience replay sampling proportionally to stored priorities, which are kept in a sum
    and a min segment tree

    Args:
        capacity: number of stored experiences, required
        batch_size: number of experiences per sample
        init_size: see _ExpReplay
        alpha: exponent (>= 0) applied to a priority before it is stored
        beta_func: zero-argument callable returning the exponent of the importance weights
        eps: stored on the instance; not used by the methods of this class
    """
    def __init__(
        self,
        capacity=None,
        batch_size=32,
        init_size=None,
        alpha=1,
        beta_func=lambda: 0,
        eps=1e-6
    ):
        super(PrioritizedReplay, self).__init__(capacity, batch_size, init_size)
        assert alpha >= 0 and capacity is not None
        self._alpha = alpha
        self.get_beta = beta_func
        self.eps = eps
        self.in_obj_type = namedtuple('Exp', ('curr_state', 'action', 'next_state', 'reward'))
        self.out_obj_type = namedtuple('PriorExp', (
            'curr_state', 'action', 'next_state', 'reward', 'weight', 'index'))
        self.memory = []
        self._idx = 0
        self._sum_prior = SumSegmentTree(self.capacity)
        self._min_prior = MinSegmentTree(self.capacity)
        # largest raw priority seen so far, used for newly pushed experiences
        self._max_prior = 1


    def clear(self):
        """drop the stored experiences and rebuild empty priority trees"""
        # NOTE(review): _max_prior is kept as is -- confirm this is intended
        self.memory = []
        self._idx = 0
        self._sum_prior = SumSegmentTree(self.capacity)
        self._min_prior = MinSegmentTree(self.capacity)


    def __len__(self):
        return len(self.memory)


    def push(self, *obj):
        """store one experience (curr_state, action, next_state, reward) with the maximal
        priority seen so far, overwriting the oldest slot once the capacity is reached"""
        experience = self._form_input_obj(*obj)
        slot = self._idx
        if len(self) < self.capacity:
            self.memory.append(experience)
        else:
            self.memory[slot] = experience

        priority = self._max_prior ** self._alpha
        self._min_prior[slot] = priority
        self._sum_prior[slot] = priority
        self._idx = (slot + 1) % self.capacity


    def _sample_indices(self):
        """draw batch_size indices (with replacement) proportionally to the priorities"""
        indices = []
        for _ in range(self.batch_size):
            # sum(a, b): a is inclusive whereas b is exclusive
            mass = random.random() * self._sum_prior.sum(0, len(self))
            indices.append(self._sum_prior.find_prefixsum_idx(mass))
        return indices


    def sample(self):
        """
        draw a batch according to the priorities

        Returns:
            list of plain tuples (curr_state, action, next_state, reward, weight, index) where
            weight is the importance weight normalized by the largest possible weight
        """
        beta = self.get_beta()
        indices = self._sample_indices()
        total = self._sum_prior.sum()
        size = len(self)
        # the smallest probability yields the largest weight
        p_min = self._min_prior.min() / total
        max_weight = (p_min * size) ** (-beta)

        batch = []
        for idx in indices:
            p_sample = self._sum_prior[idx] / total
            weight = (p_sample * size) ** (-beta) / max_weight
            batch.append((*self.memory[idx], weight, idx))
        return batch


    def update_priorities(self, idxes, priorities):
        """
        overwrite the priorities of the given experiences

        Args:
            idxes: indices as returned by sample()
            priorities: positive raw priorities, one per index (alpha is applied here)
        """
        assert len(idxes) == len(priorities)
        for idx, priority in zip(idxes, priorities):
            assert priority > 0 and 0 <= idx < len(self.memory)
            self._sum_prior[idx] = priority ** self._alpha
            self._min_prior[idx] = priority ** self._alpha
            self._max_prior = max(self._max_prior, priority)


    def _form_input_obj(self, *args):
        """pack the arguments into an 'Exp' tuple (the stored format)"""
        return self.in_obj_type(*args)


    def form_obj(self, *args):
        """pack the arguments into a 'PriorExp' tuple (the sampled format)"""
        return self.out_obj_type(*args)
229 | 
230 |     
231 |     


--------------------------------------------------------------------------------
/pytorl/networks/__init__.py:
--------------------------------------------------------------------------------
1 | from .atari_conv import Dueling_DQN
2 | from .atari_conv import Q_Network
3 | from .ctrl_mlp import Dueling_MLP
4 | from .ctrl_mlp import Q_MLP
5 | from .io import save_pth
6 | from .io import load_pth
7 | from .io import init_network


--------------------------------------------------------------------------------
/pytorl/networks/atari_conv.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | 
  6 | class Q_Network(nn.Module):
  7 |     """
  8 |     build a 2-D observation Q-network for DQN and its variants
  9 |     
 10 |     Args:
 11 |         input_size: a 3-D tuple (or equivalent Iterable) indicating the size of input
 12 |         num_actions: corresponding to the size of output
 13 |         backbone: a pytorch network, if not specified, using original DQN instead
 14 |         replace_fc: the name of last linear(i.e. 'fc' or 'last_linear'), if not 
 15 |             specified, using backbone's last output layer instead.
 16 |     """
 17 |     def __init__(
 18 |         self, 
 19 |         input_size=None, 
 20 |         num_actions=None, 
 21 |         backbone=None, 
 22 |         replace_fc=None, 
 23 |     ):
 24 |         super(Q_Network, self).__init__()
 25 |         if input_size is not None:
 26 |             assert len(input_size) == 3, 'input_size must be a 3-D Iterable'
 27 |         self.input_size, self.num_actions = input_size, num_actions
 28 |         if backbone:
 29 |             self.network = backbone
 30 |             if replace_fc is not None:
 31 |                 if num_actions is None:
 32 |                     raise ValueError('num_actions must be specified if enabling replace_fc')
 33 |                 last_fc = getattr(self.network, replace_fc)
 34 |                 last_fc_in = last_fc.__dict__['in_features']
 35 |                 last_fc_out = last_fc.__dict__['out_features']
 36 |                 if last_fc_out != num_actions:
 37 |                     last_fc = nn.Linear(last_fc_in, num_actions)
 38 |             if input_size is not None:
 39 |                 self.check_forward()
 40 |             else:
 41 |                 print('warning: skip precheck due to input_size not specified', flush=True)
 42 |                 
 43 |         else:
 44 |             if not (input_size and num_actions):
 45 |                 raise ValueError(
 46 |                         'must specify input_size and num_actions if backbone not specified')
 47 |             self.network = _Original_DQN(input_size, num_actions)
 48 |             self.check_forward()
 49 |    
 50 | 
 51 |     def forward(self, x):
 52 |         return self.network(x)
 53 |     
 54 |     def check_forward(self):
 55 |         mock = torch.zeros(1, *self.input_size)
 56 |         try:
 57 |             self.network(mock)
 58 |         except:
 59 |             raise ValueError(
 60 |                 'network forward failure, presumably due to invalid input_size')
 61 | 
 62 |             
 63 |             
 64 | class _Original_DQN(nn.Module):
 65 |     """
 66 |     this is the Q-network used in the original DeepMind DQN paper: Human-level control through 
 67 |         deep reinforcement learning (https://www.nature.com/articles/nature14236)
 68 |     
 69 |     Args:
 70 |         input_size: a 3-D tuple (or equivalent Iterable) indicating the size of input
 71 |         num_actions: corresponding to the size of output
 72 |     """
 73 |     def __init__(self, input_size, num_actions):
 74 |         super(_Original_DQN, self).__init__()
 75 |         self.input_size = input_size
 76 |         self.num_actions = num_actions
 77 |     
 78 |         self.conv1 = nn.Conv2d(self.input_size[0], 32, kernel_size=8, stride=4)
 79 |         self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
 80 |         self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
 81 |         self.fc1 = nn.Linear(self.feature_size, 512)
 82 |         self.fc2 = nn.Linear(512, self.num_actions)
 83 |     
 84 |     
 85 |     @property
 86 |     def feature_size(self):
 87 |         return self.features(torch.zeros(1, *self.input_size)).shape[1]
 88 |     
 89 |     
 90 |     def features(self, x):
 91 |         x = F.relu(self.conv1(x))
 92 |         x = F.relu(self.conv2(x))
 93 |         x = F.relu(self.conv3(x))
 94 |         x = x.view(x.shape[0], -1)
 95 |         return x   
 96 |     
 97 |     def forward(self, x):
 98 |         x = self.features(x)
 99 |         x = F.relu(self.fc1(x))
100 |         x = self.fc2(x)
101 |         return x
102 |     
103 |     
104 | 
class Dueling_DQN(nn.Module):
    """
    dueling Q-network for 2-D observations: a shared conv trunk followed by
    separate advantage and value heads

    Args:
        input_size: a 3-D tuple (or equivalent Iterable) indicating the size of input
        num_actions: corresponding to the size of output
        num_hidden: width of the hidden linear layer in each head
    """
    def __init__(self, input_size, num_actions, num_hidden=512):
        super(Dueling_DQN, self).__init__()

        self.input_size = input_size
        self.num_actions = num_actions
        self.num_hidden = num_hidden

        self.conv1 = nn.Conv2d(self.input_size[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        self.adv1 = nn.Linear(self.feature_size, self.num_hidden)
        self.adv2 = nn.Linear(self.num_hidden, self.num_actions)

        self.val1 = nn.Linear(self.feature_size, self.num_hidden)
        self.val2 = nn.Linear(self.num_hidden, 1)

    def features(self, x):
        """
        apply the three conv layers (each followed by relu) and flatten per sample
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.shape[0], -1)
        return x

    def forward(self, x):
        """
        Return:
            tensor of shape (batch, num_actions): val + adv - mean(adv), where
            the mean is taken over the actions of each sample
        """
        x = self.features(x)
        adv = F.relu(self.adv1(x))
        adv = self.adv2(adv)
        val = F.relu(self.val1(x))
        val = self.val2(val)
        # average over the action dimension only; a global mean would couple
        # the Q-values of one sample to every other sample in the batch
        return val + adv - adv.mean(dim=1, keepdim=True)

    @property
    def feature_size(self):
        """
        width of the flattened conv output for one sample of `input_size`
        """
        with torch.no_grad():
            return self.features(torch.zeros(1, *self.input_size)).shape[1]
143 |     
144 |     
145 |     


--------------------------------------------------------------------------------
/pytorl/networks/ctrl_mlp.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | 
 6 | class Q_MLP(nn.Module):
 7 |     """
 8 |     simple mlp for 1-D observation input from envs like
 9 |     CartPole-v1 etc.
10 |     
11 |     Args:
12 |         input_size: a 2-D tuple or equivalent Iterable, indicating the size 
13 |             of input, i.e, the obervation space
14 |         num_actions: corresponding to the size of output
15 |         hidden_size: size of hidden layer outputs
16 |     """
17 |     def __init__(self, input_size=None, num_actions=None, hidden_size=256):
18 |         assert input_size and num_actions
19 |         assert len(input_size) == 2, 'input_size must be a 2-D Iterable'
20 |         super(Q_MLP, self).__init__()
21 |         self.input_size = input_size
22 |         self.num_actions = num_actions
23 |         self.hidden_size = hidden_size 
24 |         self.fc1 = nn.Linear(input_size[0] * input_size[1], hidden_size)
25 |         self.fc2 = nn.Linear(hidden_size, hidden_size)
26 |         self.fc3 = nn.Linear(hidden_size, num_actions)
27 |     
28 |     def features(self, x):
29 |         x = x.view(x.shape[0], -1)
30 |         x = F.relu(self.fc1(x))
31 |         x = F.relu(self.fc2(x))
32 |         return x
33 |     
34 |     def forward(self, x):
35 |         x = self.features(x)
36 |         x = F.relu(self.fc3(x))
37 |         return x
38 |         
39 | 
40 | """ not tested yet"""
class Dueling_MLP(nn.Module):
    """
    dueling mlp for 1-D observation input: two shared hidden layers followed
    by separate advantage and value heads

    Args:
        input_size: a 2-D tuple or equivalent Iterable, indicating the size of input
        num_actions: corresponding to the size of output
        num_hidden: width of every hidden linear layer
    """
    def __init__(self, input_size, num_actions, num_hidden=512):
        assert len(input_size) == 2, 'input_size must be a 2-D Iterable'
        super(Dueling_MLP, self).__init__()

        self.input_size = input_size
        self.num_actions = num_actions
        self.num_hidden = num_hidden

        self.fc1 = nn.Linear(input_size[0] * input_size[1], num_hidden)
        self.fc2 = nn.Linear(num_hidden, num_hidden)

        self.adv1 = nn.Linear(num_hidden, num_hidden)
        self.adv2 = nn.Linear(num_hidden, self.num_actions)

        self.val1 = nn.Linear(num_hidden, num_hidden)
        self.val2 = nn.Linear(num_hidden, 1)

    def features(self, x):
        """
        flatten each sample and apply the two shared layers (each followed by relu)
        """
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

    def forward(self, x):
        """
        Return:
            tensor of shape (batch, num_actions): val + adv - mean(adv), where
            the mean is taken over the actions of each sample
        """
        x = self.features(x)
        adv = F.relu(self.adv1(x))
        adv = self.adv2(adv)
        val = F.relu(self.val1(x))
        val = self.val2(val)
        # average over the action dimension only; a global mean would couple
        # the Q-values of one sample to every other sample in the batch
        return val + adv - adv.mean(dim=1, keepdim=True)


--------------------------------------------------------------------------------
/pytorl/networks/io.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | def save_pth(obj, path, filename=None, obj_name=None):
 7 |     """
 8 |     save an obj on the disk as .pth
 9 | 
10 |     Args:
11 |         obj: a serializable object, if specified as an nn obj (e.g. a neural net), the state_dict will 
12 |             be fetched from it and saved eventually.
13 |         path: supposed to be a string containing the path and filename if specifying just the path 
14 |             without filename, it is also ok so long as the filename is correctly set.
15 |         filename: supposed to be set if path is literally just the path without the file name.
16 |         obj_name: the string that will be the name of the saved obj, if not set, the saving process 
17 |             will be silent.
18 |     """
19 |     if isinstance(obj, nn.Module):
20 |         obj = obj.state_dict()
21 |     if filename:
22 |         path = os.path.join(path, filename)
23 |     filedir = os.path.dirname(path)
24 |     # check path existence
25 |     if not os.path.exists(filedir):
26 |         os.makedirs(filedir)
27 |     torch.save(obj, path)
28 |     if obj_name:
29 |         print('[%s] successfully saved at [%s]' % (
30 |                 obj_name, os.path.abspath(path)), flush=True)
31 |         
32 | 
def load_pth(path, filename=None, obj_name=None):
    """
    read back an object stored on the disk as .pth

    Args:
        path: either the full file path, or just the directory when `filename` is set.
        filename: file name joined onto `path` when `path` is only a directory.
        obj_name: the name printed in the confirmation message; if not set, the
            loading process will be silent.

    Return:
        loaded: the loaded object
    """
    full_path = os.path.join(path, filename) if filename else path
    loaded = torch.load(full_path)
    if not obj_name:
        return loaded
    message = '[%s] successfully loaded from [%s]' % (obj_name, os.path.abspath(full_path))
    print(message, flush=True)
    return loaded
54 | 
55 | 
def init_network(network, pretrained_path=None, filename=None, obj_name=None):
    """
    load a pretrained state_dict into a network

    Args:
        network: the network which is going to be fed with the pretrained .pth
        pretrained_path: either the full file path, or just the directory when
            `filename` is set. if it is not a string, or the resulting path is not
            an existing file, this function returns immediately without loading.
        filename: file name joined onto `pretrained_path` when that is only a directory.
        obj_name: the name printed in the confirmation message; if not set, the
            loading process will be silent.
    """
    if not isinstance(pretrained_path, str):
        return
    if filename is not None:
        pretrained_path = os.path.join(pretrained_path, filename)
    if not os.path.isfile(pretrained_path):
        return

    if torch.cuda.is_available():
        # map onto the currently selected cuda device
        device = 'cuda:%s' % torch.cuda.current_device()
        print('mapping cuda to [%s]' % device, flush=True)
    else:
        device = 'cpu'
    checkpoint = torch.load(pretrained_path, map_location=device)
    network.load_state_dict(checkpoint)
    if obj_name is not None:
        print('[%s] successfully loaded from [%s]' % (
                obj_name, os.path.abspath(pretrained_path)), flush=True)
86 | 


--------------------------------------------------------------------------------
/pytorl/settings/__init__.py:
--------------------------------------------------------------------------------
1 | from .entries import lrun
2 | from .entries import rl_run


--------------------------------------------------------------------------------
/pytorl/settings/entries.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import shutil
 4 | import socket
 5 | import subprocess
 6 | import pytorl
 7 | from pytorl.utils import ConfigReader
 8 | 
# package name; rl_run() uses it to build the settings file name 'settings/<MODULE_NAME>.yaml'
MODULE_NAME = 'pytorl'
10 | 
11 | 
def _cd_and_execute(trg_dir, command, run_name):
    """
    change into `trg_dir`, launch `command` through the shell and wait for it

    Args:
        trg_dir: directory to switch the current process into before launching
        command: shell command line to execute
        run_name: exported to the child process as the env variable 'run_name'
    """
    os.chdir(str(trg_dir))
    child_env = dict(os.environ, run_name=run_name)
    child = subprocess.Popen(command, shell = True, env = child_env)

    finished = False
    while not finished:
        try:
            child.wait()
        except KeyboardInterrupt:
            # a single Ctrl-C only prints the hint and resumes waiting
            print('\tPlease double press Ctrl-C within 1 second to kill srun job. '
                  'It will take several seconds to shutdown ...', flush = True)
        else:
            finished = True
24 | 
25 | 
def _get_host_ip():
    """
    determine the local ip address of this host

    a datagram socket is connected to ('8.8.8.8', 80) and the local address the
    socket ends up with is reported.

    Return:
        ip: the local address as a string
    """
    # the context manager closes the socket on every path and, unlike a
    # try/finally around an unassigned name, does not hide a failure of
    # socket creation behind an UnboundLocalError
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.connect(('8.8.8.8', 80))
        return s.getsockname()[0]
34 | 
35 | 
36 | 
def rl_run():
    """
    command line entry: snapshot the current directory into the experiment
    directory and run a command from inside that snapshot

    command line arguments:
        --run-name / -rn: name of the experiment entry (default 'default')
        --command / -c: shell command to run inside the copied directory (required)

    the experiment directory is read from `experiment_dir` in the package
    settings file. if an entry with the same run name already exists, the user
    is asked whether to overwrite it; answering 'n' exits the program.

    Raises:
        NotADirectoryError: if the configured experiment_dir does not exist
    """
    # imported here because the module header does not import sys, which made
    # the 'n' answer below fail with a NameError instead of exiting
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--run-name', '-rn', default='default')
    parser.add_argument('--command', '-c', required=True)
    opt = parser.parse_args()

    setting_file = os.path.join(os.path.dirname(pytorl.__file__), 'settings/%s.yaml' % MODULE_NAME)
    cfg_reader = ConfigReader(filename=setting_file)
    config = cfg_reader.get_config()
    exp_dir = config.experiment_dir

    src_path = os.getcwd()
    src_dir = os.path.split(src_path)[-1]
    if not os.path.isdir(exp_dir):
        raise NotADirectoryError('experiment_dir [%s] does not exist' % exp_dir)

    exp_entry = os.path.join(exp_dir, opt.run_name)

    if os.path.isdir(exp_entry):
        while True:
            print('experiment [%s] already exists at [%s]:' % (opt.run_name, exp_dir),
                  '\n>>>>>>>>>>>> overwrite it or not ? <<<<<<<<<<<< [Y/n]:', flush=True, end='')
            response = input().strip()
            if response in {'Y', 'y'}:
                break
            if response in {'N', 'n'}:
                sys.exit()
            # any other answer: ask again
        # warning: this overwrites previous experiment
        shutil.rmtree(exp_entry)

    os.makedirs(exp_entry, exist_ok=True)
    trg_dir = os.path.join(exp_entry, src_dir)
    shutil.copytree(src_path, trg_dir)

    _cd_and_execute(trg_dir, opt.command, opt.run_name)
71 | 
72 | 
def lrun():
    """
    command line entry: start the same shell command `ntasks` times locally,
    giving every copy its own SLURM_* environment variables

    command line arguments:
        --ntasks / -n: number of processes to start (required)
        cmd: all remaining arguments, joined with spaces into the command line

    each process receives SLURM_NTASKS, SLURM_PROCID (0 .. ntasks-1) and a
    SLURM_NODELIST built from the host ip with dots replaced by dashes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--ntasks', '-n', type=int, required=True)
    parser.add_argument('cmd', nargs=argparse.REMAINDER)
    opt = parser.parse_args()
    cmd = ' '.join(opt.cmd)

    def _spawn(rank):
        # every task gets its own copy of the environment
        task_env = os.environ.copy()
        task_env['SLURM_NTASKS'] = str(opt.ntasks)
        task_env['SLURM_PROCID'] = str(rank)
        task_env['SLURM_NODELIST'] = '5412306 ' + _get_host_ip().replace('.', '-')
        return subprocess.Popen(cmd, shell=True, env=task_env)

    proc_list = [_spawn(rank) for rank in range(opt.ntasks)]

    all_done = False
    while not all_done:
        try:
            for proc in proc_list:
                proc.wait()
        except KeyboardInterrupt:
            # a single Ctrl-C only prints the hint and resumes waiting
            print('\tPlease double press Ctrl-C within 1 second to kill srun job. '
                  'It will take several seconds to shutdown ...', flush=True)
        else:
            all_done = True
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/pytorl/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import ConfigReader
2 | from .decorators import Setting
3 | from .recorder import tensorboard_writer


--------------------------------------------------------------------------------
/pytorl/utils/config.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import yaml
 3 | import torch.nn.functional as F
 4 | import torch.optim as optim
 5 | import pytorl.networks as networks
 6 | 
 7 | 
 8 | class _DotConfig(dict):
 9 |     """
10 |     override __getattr__ method in dict class to make config dict dot-accessible
11 |     
12 |     [!]WARNING: can only be used to get attribution but cannot modefy original config value
13 |     """ 
14 |     def __getattr__(self, key):
15 |         try:
16 |             value = self[key]
17 |         except KeyError:
18 |             return super().__getattr__(key)
19 |         if isinstance(value, dict):
20 |             return _DotConfig(value)
21 |         return value
22 | 
23 |     def __setattr__(self, key, value):
24 |         raise UserWarning('cannot override config entry')
25 | 
26 | 
27 | 
28 | """ [!]DEPRECATED
29 | THIS METHOD IS NO LONGER IN USE
30 | """
def _get_config(filename=None, default='config.yaml'):
    """
    read a .yaml config file and return it as a dot-accessible config

    Args:
        filename: config file name given directly in the program; if not
            specified, the file name is parsed from the command line via
            argparse (--config / -cfg)
        default: default value of the command line argument

    Return:
        the parsed config wrapped in a _DotConfig
    """
    if filename is None:
        parser = argparse.ArgumentParser()
        parser.add_argument('--config', '-cfg', type=str, default=default)
        _cfg_name = parser.parse_args().config
    else:
        _cfg_name = filename

    assert isinstance(_cfg_name, str), 'invalid config filename specified (not a string type)'
    assert _cfg_name.rsplit('.', 1)[-1] == 'yaml', 'unspported config file type yet (not .yaml)'

    with open(_cfg_name, 'r') as _cfg_f:
        # safe_load builds plain python objects only; yaml.load without an
        # explicit Loader is unsafe and is rejected by recent PyYAML releases
        _raw_cfg = yaml.safe_load(_cfg_f)

    return _DotConfig(_raw_cfg)
52 | 
53 | 
54 | 
class ConfigReader:
    """
    public interface for getting config contents

    Args:
        filename: config file name given directly in the program; if not
            specified, the file name is parsed from the command line via
            argparse (--config / -cfg)
        default: default value of the command line argument
    """
    def __init__(self, filename=None, default='config.yaml'):
        if filename is None:
            parser = argparse.ArgumentParser()
            parser.add_argument('--config', '-cfg', type=str, default=default)
            _cfg_name = parser.parse_args().config
        else:
            _cfg_name = filename

        assert isinstance(_cfg_name, str), 'invalid config filename specified (not a string type)'
        assert _cfg_name.rsplit('.', 1)[-1] == 'yaml', 'unspported config file type yet (not .yaml)'
        with open(_cfg_name, 'r') as _cfg_f:
            # safe_load builds plain python objects only; yaml.load without an
            # explicit Loader is unsafe and is rejected by recent PyYAML releases
            _raw_cfg = yaml.safe_load(_cfg_f)

        self.config_path = _cfg_name
        self.config = _DotConfig(_raw_cfg)

    def get_config(self):
        """
        Return:
            the dot-accessible config read from the file
        """
        return self.config

    def get_loss_func(self, attr):
        """
        look up `attr` by name in torch.nn.functional
        """
        return getattr(F, attr)

    def get_optimizer_func(self, attr):
        """
        look up `attr` by name in torch.optim
        """
        return getattr(optim, attr)

    def get_network_func(self, attr):
        """
        look up `attr` by name in pytorl.networks
        """
        return getattr(networks, attr)
92 |     


--------------------------------------------------------------------------------
/pytorl/utils/decorators.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | from collections import defaultdict
 3 | 
 4 | __all__ = ['Setting']
 5 | 
 6 | 
 7 | class DecorateSetting:
 8 |     def __init__(self):
 9 |         self.instances = defaultdict(set)
10 | 
11 |         
12 |     def __call__(self, func):
13 |         @functools.wraps(func)
14 |         def _setting(caller, *args, **kwargs):
15 |             if func.__name__ in self.instances[hash(caller)]: 
16 |                 print('warning: %s of %s has been overridden' % (func.__name__, caller), flush=True)
17 |             self.instances[hash(caller)].add(func.__name__)
18 |             func(caller, *args, **kwargs)
19 |         return _setting
20 | 
21 |     
22 |     def only_once(self, func):
23 |         @functools.wraps(func)
24 |         def _check_setting(caller, *args, **kwargs):
25 |             if func.__name__ in self.instances[hash(caller)]: 
26 |                 raise RuntimeError('%s can only be called once' % func)
27 |             self.instances[hash(caller)].add(func.__name__)
28 |             func(caller, *args, **kwargs)
29 |         return _check_setting
30 |     
31 |     
# module-level DecorateSetting instance; this is the only name exported via __all__
Setting = DecorateSetting()
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/pytorl/utils/recorder.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from tensorboardX import SummaryWriter
 3 | 
 4 | 
 5 | class _TensorboardWriter(SummaryWriter):
 6 |     """
 7 |     SummaryWriter wrapper
 8 |     """
 9 |     def __init__(self, logdir):
10 |         super(_TensorboardWriter, self).__init__(logdir)
11 |         self.logdir = logdir
12 |     
13 |     def add_textfile(self, tag, path, filename=None):
14 |         assert type(tag) == str
15 |         if filename is not None:
16 |             path = os.path.join(path, filename)
17 |         with open(path, 'r') as f:
18 |             # since 'list' object has no attribute 'encode'
19 |             content = '\n'.join(f.readlines())
20 |         self.add_text(tag, content)
21 |         
22 | 
def tensorboard_writer(logdir):
    """
    create a _TensorboardWriter for the given log directory

    Args:
        logdir: passed unchanged to the _TensorboardWriter constructor

    Return:
        the newly created writer
    """
    return _TensorboardWriter(logdir)
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------