├── .gitignore
├── LICENSE
├── README.md
├── docs
    └── Usage.md
├── examples
    ├── run_dqn_agent.py
    └── run_pg_agent.py
├── plot
    ├── .gitignore
    ├── README.md
    ├── dataio.py
    ├── dataproc.py
    └── plotter.py
└── rltf
    ├── __init__.py
    ├── agents
        ├── __init__.py
        ├── agent.py
        ├── base_agents.py
        ├── ddpg_agent.py
        ├── dqn_agent.py
        ├── pg_agent.py
        ├── ppo_agent.py
        ├── qlearn_agent.py
        └── trpo_agent.py
    ├── cmdutils
        ├── __init__.py
        ├── cmdargs.py
        ├── defaults.py
        └── override.py
    ├── envs
        ├── __init__.py
        ├── atari.py
        ├── common.py
        ├── utils.py
        └── wrappers.py
    ├── exploration
        ├── __init__.py
        ├── exploration.py
        └── random_noise.py
    ├── memory
        ├── __init__.py
        ├── base_buffer.py
        ├── pg_buffer.py
        └── replay_buffer.py
    ├── models
        ├── __init__.py
        ├── base_dqn.py
        ├── base_pg.py
        ├── bdqn.py
        ├── bstrap_dqn.py
        ├── c51.py
        ├── c51_ids.py
        ├── ddpg.py
        ├── ddqn.py
        ├── dqn.py
        ├── dqn_ensemble.py
        ├── dqn_ids.py
        ├── dqn_ucb.py
        ├── model.py
        ├── ppo.py
        ├── qr_dqn.py
        ├── qrdqn_ids.py
        ├── reinforce.py
        └── trpo.py
    ├── monitoring
        ├── __init__.py
        ├── monitor.py
        ├── stats.py
        ├── vplot.py
        └── vplot_manager.py
    ├── optimizers
        ├── __init__.py
        ├── grad_clip.py
        ├── natural_grad.py
        └── opt_conf.py
    ├── schedules
        ├── __init__.py
        ├── const_schedule.py
        ├── exponential_decay.py
        ├── linear_schedule.py
        ├── piecewise_schedule.py
        ├── schedule.py
        └── utils.py
    ├── tf_utils
        ├── __init__.py
        ├── blr.py
        ├── cg.py
        ├── distributions.py
        ├── inverse.py
        ├── ops.py
        └── tf_utils.py
    └── utils
        ├── __init__.py
        ├── layouts.py
        ├── maker.py
        ├── rltf_conf.py
        ├── rltf_log.py
        └── seeding.py

/.gitignore:
--------------------------------------------------------------------------------
.gym
trained_models/
/**/__pycache__
Notes.md
plot/conf
plot/restore.py
.directory
.pylintrc

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Nikolay Nikolov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# RLTF: Reinforcement Learning in TensorFlow

RLTF is a research framework that provides high-quality implementations of common Reinforcement Learning algorithms. It also enables fast prototyping and benchmarking of new methods.

**Status**: This work is under active development (breaking changes might occur).

## Implemented Algorithms

| Algorithm | Model | Agent |
| --- | --- | --- |
| [DQN](https://www.nature.com/articles/nature14236) | [DQN](rltf/models/dqn.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [Double DQN](https://arxiv.org/abs/1509.06461) | [DDQN](rltf/models/ddqn.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [Dueling DQN](https://arxiv.org/abs/1511.06581) | next | next |
| [Prioritized Experience Replay](https://arxiv.org/abs/1511.05952) | next | next |
| [C51](https://arxiv.org/abs/1707.06887) | [C51](rltf/models/c51.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [QR-DQN](https://arxiv.org/abs/1710.10044) | [QRDQN](rltf/models/qr_dqn.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [Bootstrapped DQN](https://arxiv.org/pdf/1602.04621.pdf) | [BstrapDQN](rltf/models/bstrap_dqn.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [Bootstrapped UCB](https://arxiv.org/pdf/1706.01502.pdf) | [DQN_UCB](rltf/models/dqn_ucb.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [DQN Ensemble](https://arxiv.org/pdf/1706.01502.pdf) | [DQN_Ensemble](rltf/models/dqn_ensemble.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [BDQN](https://arxiv.org/abs/1802.04412) | [BDQN](rltf/models/bdqn.py) | [AgentBDQN](rltf/agents/dqn_agent.py) |
| [DQN-IDS](https://arxiv.org/abs/1812.07544) | [DQN-IDS](rltf/models/dqn_ids.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [C51-IDS](https://arxiv.org/abs/1812.07544) | [C51-IDS](rltf/models/c51_ids.py) | [AgentDQN](rltf/agents/dqn_agent.py) |
| [DDPG](https://arxiv.org/abs/1509.02971) | [DDPG](rltf/models/ddpg.py) | [AgentDDPG](rltf/agents/ddpg_agent.py) |
| [REINFORCE](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) | [REINFORCE](rltf/models/reinforce.py) | [AgentPG](rltf/agents/pg_agent.py) |
| [PPO](https://arxiv.org/abs/1707.06347) | [PPO](rltf/models/ppo.py) | [AgentPPO](rltf/agents/ppo_agent.py) |
| [TRPO](https://arxiv.org/abs/1502.05477) | [TRPO](rltf/models/trpo.py) | [AgentTRPO](rltf/agents/trpo_agent.py) |

Coming additions:
- MPI support for policy gradients
- Dueling DQN
- Prioritized Experience Replay
- n-step returns
- Rainbow

## Reproducibility and Known Issues
The implemented models achieve results comparable to those reported
in the corresponding papers. With minor exceptions, all implementations should be
equivalent to the ones described in the original papers.

Implementations known to misbehave:
- QR-DQN (in progress)

## About

The goal of this framework is to provide stable implementations of standard
RL algorithms and simultaneously enable fast prototyping of new methods.
Some important features include:
- Exact reimplementation of the original algorithms with competitive performance
- Unified and reusable modules
- Clear hierarchical structure and easy code control
- Efficient GPU utilization and fast training
- Detailed logs of hyperparameters, train and eval scores, git diff, and TensorBoard visualizations
- Episode video recordings with plots of network outputs
- Compatible with OpenAI gym, MuJoCo, PyBullet and Roboschool
- Restoring the training process from where it stopped, retraining on a new task, and fine-tuning


## Installation

### Dependencies
- Python >= 3.5
- TensorFlow >= 1.6.0
- OpenAI gym >= 0.9.6
- opencv-python (either the pip package or the OpenCV library with Python bindings)
- matplotlib (with the TkAgg backend)
- pybullet (optional)
- roboschool (optional)

### Install
```
git clone https://github.com/nikonikolov/rltf.git
```
A pip package is coming soon.

## Documentation
For brief documentation see [docs/](docs/).

If you use this repository for your research, please cite:
```
@misc{rltf,
  author = {Nikolay Nikolov},
  title = {RLTF: Reinforcement Learning in TensorFlow},
  year = {2018},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/nikonikolov/rltf}},
}
```

--------------------------------------------------------------------------------
/docs/Usage.md:
--------------------------------------------------------------------------------
## Structure Overview

All algorithms are composed of two parts: `Agent` and `Model`.

### Agent
- Should inherit from the [`Agent`](rltf/agents/agent.py) class
- Provides the communication interface between the [`Model`](rltf/models/model.py) and the environment
- Executes the exact training procedure
- Responsible for:
  - Stepping the environment
  - Running a training step
  - Storing experience for training

### Model
- Should inherit from the [`Model`](rltf/models/model.py) class
- A passive component which only implements the TensorFlow computation graph for the network
- Implements the forward and backward network passes and exposes useful input and output Tensors and Operations
- Controlled by the [`Agent`](rltf/agents/agent.py) during training and evaluation

A minimal sketch of this Agent/Model split is given at the end of this document.

-------------------------------------------------------------------------------

## Execution

The data for separate runs is stored on disk under the template directory path
`trained_models//__
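
## Agent/Model Sketch

To make the Agent/Model division above concrete, here is a minimal, self-contained sketch of the same pattern. It is an illustration only and not the rltf API: the class and method names (`TabularQModel`, `SketchAgent`, `action_values`, `train_step`) are assumptions, the model is a NumPy Q-table rather than a TensorFlow graph, and the environment is assumed to follow the classic gym `reset()`/`step()` interface. The real interfaces live in rltf/agents/agent.py and rltf/models/model.py.

```python
# Illustrative sketch only -- NOT the rltf API. The split of responsibilities
# mirrors the description above: the Model is passive and owns the parameters
# and the update rule; the Agent steps the environment, stores experience,
# and runs training steps on the Model.

import numpy as np


class TabularQModel:
    """Passive component: holds the value estimates and the update rule."""

    def __init__(self, n_states, n_actions, lr=0.1, gamma=0.99):
        self.q = np.zeros((n_states, n_actions))
        self.lr = lr
        self.gamma = gamma

    def action_values(self, state):
        # "Forward pass": value estimates for a single state
        return self.q[state]

    def train_step(self, batch):
        # "Backward pass": one Q-learning update per stored transition
        for s, a, r, s_next, done in batch:
            target = r if done else r + self.gamma * self.q[s_next].max()
            self.q[s, a] += self.lr * (target - self.q[s, a])


class SketchAgent:
    """Active component: drives the environment and controls the model."""

    def __init__(self, env, model, eps=0.1, batch_size=32):
        self.env = env                # assumed to follow the gym API
        self.model = model
        self.eps = eps
        self.batch_size = batch_size
        self.buffer = []              # stand-in for rltf/memory/replay_buffer.py

    def _select_action(self, state):
        # Simple epsilon-greedy exploration for the sketch
        if np.random.rand() < self.eps:
            return self.env.action_space.sample()
        return int(np.argmax(self.model.action_values(state)))

    def train(self, n_steps):
        state = self.env.reset()
        for _ in range(n_steps):
            action = self._select_action(state)
            next_state, reward, done, _ = self.env.step(action)
            self.buffer.append((state, action, reward, next_state, done))
            if len(self.buffer) >= self.batch_size:
                idx = np.random.randint(len(self.buffer), size=self.batch_size)
                self.model.train_step([self.buffer[i] for i in idx])
            state = self.env.reset() if done else next_state
```

In rltf itself the model would build TensorFlow ops instead of a NumPy table and the experience would live in a buffer from rltf/memory, but the division of responsibilities between the two classes is the same idea.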