├── .gitignore ├── Makefile ├── README.md ├── e_maml_experiments ├── __init__.py ├── goal_cheetah_baseline.py └── jaynes_demo.py ├── e_maml_tf ├── __init__.py ├── algos │ ├── __init__.py │ ├── bc.py │ ├── bc_learned_loss.py │ ├── cpi.py │ ├── ppo2.py │ ├── sac.py │ └── vpg.py ├── config.py ├── custom_vendor │ ├── README.md │ ├── __init__.py │ ├── half_cheetah_goal_direction.py │ ├── half_cheetah_goal_velocity.py │ ├── maze_env.py │ └── patches.py ├── distributions.py ├── e_maml_ge.py ├── ge_policies.py ├── ge_utils.py ├── meta_rl_tasks.py ├── packages │ ├── __init__.py │ └── schedules.py ├── sampler.py ├── sampling_utils.py ├── train.py ├── trainer.py ├── value_baselines │ ├── __init__.py │ ├── base.py │ ├── gaussian_conv_baseline.py │ ├── gaussian_mlp_baseline.py │ ├── linear_feature_baseline.py │ └── zero_baseline.py └── wrappers │ ├── __init__.py │ ├── k_index.py │ ├── subproc_vec_env.py │ └── vec_env_normalize.py ├── jaynes-template.yml ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | .DS_Store 7 | 8 | *__dataset 9 | data 10 | *outputs 11 | leaf/runs 12 | *run 13 | 14 | # pytest output directories 15 | .pytest* 16 | test-logs 17 | README 18 | 19 | # jaynes config files. 20 | jaynes.yml 21 | *.jaynes.yml 22 | *.jaynes.yaml 23 | 24 | # User-specific stuff: 25 | .idea 26 | 27 | # run config files 28 | .yours 29 | 30 | # scratch 31 | 32 | # mujoco key 33 | mjkey.txt 34 | 35 | ## File-based project format: 36 | *.iws 37 | 38 | ## Plugin-specific files: 39 | 40 | # IntelliJ 41 | /out/ 42 | 43 | # mpeltonen/sbt-idea plugin 44 | .idea_modules/ 45 | 46 | # JIRA plugin 47 | atlassian-ide-plugin.xml 48 | 49 | # Crashlytics plugin (for Android Studio and IntelliJ) 50 | com_crashlytics_export_strings.xml 51 | crashlytics.properties 52 | crashlytics-build.properties 53 | fabric.properties 54 | ### Python template 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | env/ 66 | build/ 67 | develop-eggs/ 68 | dist/ 69 | downloads/ 70 | eggs/ 71 | .eggs/ 72 | lib/ 73 | lib64/ 74 | parts/ 75 | sdist/ 76 | var/ 77 | wheels/ 78 | *.egg-info/ 79 | .installed.cfg 80 | *.egg 81 | 82 | # PyInstaller 83 | # Usually these files are written by a python script from a template 84 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
85 | *.manifest 86 | *.spec 87 | 88 | # Installer logs 89 | pip-log.txt 90 | pip-delete-this-directory.txt 91 | 92 | # Unit test / coverage reports 93 | htmlcov/ 94 | .tox/ 95 | .coverage 96 | .coverage.* 97 | .cache 98 | nosetests.xml 99 | coverage.xml 100 | *,cover 101 | .hypothesis/ 102 | 103 | # Translations 104 | *.mo 105 | *.pot 106 | 107 | # Django stuff: 108 | *.log 109 | local_settings.py 110 | 111 | # Flask stuff: 112 | instance/ 113 | .webassets-cache 114 | 115 | # Scrapy stuff: 116 | .scrapy 117 | 118 | # Sphinx documentation 119 | docs/_build/ 120 | 121 | # PyBuilder 122 | target/ 123 | 124 | # Jupyter Notebook 125 | .ipynb_checkpoints 126 | 127 | # pyenv 128 | .python-version 129 | 130 | # celery beat schedule file 131 | celerybeat-schedule 132 | 133 | # SageMath parsed files 134 | *.sage.py 135 | 136 | # dotenv 137 | .env 138 | 139 | # virtualenv 140 | .venv 141 | venv/ 142 | ENV/ 143 | 144 | # Spyder project settings 145 | .spyderproject 146 | 147 | # Rope project settings 148 | .ropeproject 149 | 150 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: build 2 | build: 3 | docker-compose up --build 4 | install-dc: 5 | sudo curl -L https://github.com/docker/compose/releases/download/1.21.2/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose 6 | sudo chmod +x /usr/local/bin/docker-compose 7 | publish-docker: 8 | docker tag super-expert episodeyang/super-expert 9 | docker tag super-expert-gpu episodeyang/super-expert-gpu 10 | docker push episodeyang/super-expert 11 | docker push episodeyang/super-expert-gpu 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # E-MAML Implementation 2 | 3 | This repo contains the full implementation of the E-MAML algorithm from the paper 4 | [*Some Considerations on Learning to Explore via Meta-Reinforcement Learning*][NIPS_link]. 5 | 6 | [NIPS_link]: https://papers.nips.cc/paper/8140-the-importance-of-sampling-inmeta-reinforcement-learning 7 | 8 | ## Structure of This Codebase 9 | 10 | The main implementation is contained in the `e_maml_tf` directory. Inside the `e_maml_experiments` 11 | directory we provide a lightweight half-cheetah baseline for verification. The original 12 | KrazyWorld codebase is not open-sourced, so we implemented a new KrazyWorld environment. To run E-MAML 13 | on this new KrazyWorld, you need to add a thin adaptor following the convention in `custom_vendor` 14 | and `sampler.py`. 15 | 16 | :point_right: [`KrazyWorld` github repo][KrazyWorld] 17 | 18 | [KrazyWorld]: https://github.com/bstadie/krazyworld.git 19 | 20 | # Getting Started: 21 | 22 | 1. Set up a conda environment with Python 3.6.4 or above (required for the `f-string` literals). 23 | 2. If on macOS, run `brew install mpich`. This is the MPI implementation that `baselines` and `mpi4py` rely on. 24 | 3. Run `pip install -e .`. If the `mpi4py` installation fails, try `pip install mpi4py` in a new terminal session. 25 | 4. If `mujoco-py` complains (which fails the installation), make sure you have installed MuJoCo and have a working license key. 26 | 5. If not, download MuJoCo for your platform and place the license key `mjkey.txt` under `~/.mujoco/`. 27 | 6. 
Distributed Setup: Add a file `.yours` inside `e_maml_experiments` that contains the following content: 28 | 29 | ```yaml 30 | username: 31 | project: e_maml 32 | logging_server: http://:8081 33 | ``` 34 | 35 | If you are not using a distributed logging setup, you can leave the logging_server to `none` or 36 | leave it empty. In that case it would be logged to you `~/ml-logger-outputs` directory. 37 | 38 | # Cite 39 | 40 | To cite E-MAML please use 41 | 42 | ```bibtex 43 | @article{stadie2018e-maml, 44 | title={Some considerations on learning to explore via meta-reinforcement learning}, 45 | author={Stadie, Bradly C and Yang, Ge and Houthooft, Rein and Chen, Xi and Duan, Yan and Wu, Yuhuai and Abbeel, Pieter and Sutskever, Ilya}, 46 | journal={arXiv preprint arXiv:1803.01118}, 47 | year={2018} 48 | } 49 | ``` 50 | -------------------------------------------------------------------------------- /e_maml_experiments/__init__.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import os 3 | from functools import reduce 4 | from os.path import basename, dirname, abspath, join, expanduser 5 | 6 | import yaml 7 | from termcolor import cprint 8 | 9 | with open(os.path.join(os.path.dirname(__file__), ".yours"), 'r') as stream: 10 | rc = yaml.load(stream, Loader=yaml.BaseLoader) 11 | 12 | 13 | class RUN: 14 | from ml_logger import logger 15 | 16 | server = rc.get('logging_server', expanduser("~/ml-logger-outputs")) 17 | prefix = f"{rc['username']}/{rc['project']}/{logger.now('%Y/%m-%d')}" 18 | 19 | 20 | def dir_prefix(depth=-1): 21 | from ml_logger import logger 22 | 23 | caller_script = abspath(inspect.getmodule(inspect.stack()[1][0]).__file__) 24 | # note: for scripts in the `plan2vec` module this also works -- b/c we truncate fixed depth. 25 | script_path = logger.truncate(caller_script, depth=len(__file__.split('/')) - 1) 26 | prefix = os.path.join(RUN.prefix, script_path) 27 | return reduce(lambda p, i: dirname(p), range(-depth), prefix) 28 | 29 | 30 | def config_charts(config_yaml="", path=None): 31 | from textwrap import dedent 32 | from ml_logger import logger 33 | 34 | if not config_yaml: 35 | caller_script = abspath(inspect.getmodule(inspect.stack()[1][0]).__file__) 36 | if path is None: 37 | path = logger.stem(caller_script) + ".charts.yml" 38 | try: # first try the namesake chart file 39 | with open(os.path.join(os.path.dirname(caller_script), path), 'r') as s: 40 | config_yaml = s.read() 41 | cprint(f"Found ml-dash config file \n{path}", 'green') 42 | except: # do not upload when can not find 43 | path = ".charts.yml" 44 | with open(os.path.join(os.path.dirname(caller_script), path), 'r') as s: 45 | config_yaml = s.read() 46 | cprint(f"Found ml-dash config file \n{path}", 'green') 47 | 48 | logger.log_text(dedent(config_yaml), ".charts.yml") 49 | 50 | 51 | def thunk(fn, *ARGS, __prefix="", __timestamp='%H.%M/%S.%f', **KWARGS): 52 | """ 53 | thunk for configuring the logger. The reason why this is not a decorator is 54 | 55 | :param fn: function to be called 56 | :param *ARGS: position arguments for the call 57 | :param __prefix: logging prefix for this run, default to "", where it does not do much. 58 | :param __timestamp: bool, default to True, whether post-fix with time stamps. 
59 | :param **KWARGS: keyword arguments for the call 60 | :return: a thunk that can be called without parameters 61 | """ 62 | from ml_logger import logger 63 | 64 | caller_script = abspath(inspect.getmodule(inspect.stack()[1][0]).__file__) 65 | # note: for scripts in the `plan2vec` module this also works -- b/c we truncate fixed depth. 66 | script_path = logger.truncate(caller_script, depth=len(__file__.split('/')) - 1) 67 | _ = [logger.now(__timestamp)] if __timestamp else [] 68 | PREFIX = join(RUN.prefix, logger.stem(script_path), __prefix, *_) 69 | 70 | # todo: there should be a better way to log these. 71 | # todo: we shouldn't need to log to the same directory, and the directory for the run shouldn't be fixed. 72 | logger.configure(log_directory=RUN.server, prefix=PREFIX, asynchronous=False, # use sync logger 73 | max_workers=4, register_experiment=False) 74 | # the tension is in between creation vs run. Code snapshot are shared, but runs need to be unique. 75 | logger.log_params( 76 | run=logger.run_info(status="created", script_path=script_path), 77 | revision=logger.rev_info(), 78 | fn=logger.fn_info(fn), ) 79 | logger.log_params(args=ARGS, kwargs=KWARGS) 80 | logger.diff(silent=True) 81 | 82 | import jaynes # now set the job name to prefix 83 | if jaynes.RUN.mode != "local": 84 | runner_class, runner_args = jaynes.RUN.config['runner'] 85 | if 'name' in runner_args: # ssh mode does not have 'name'. 86 | runner_args['name'] = PREFIX.replace("geyang/", "") # destroy my traces. 87 | del logger, jaynes, runner_args, runner_class 88 | cprint(f'{__file__}: Set up job name', "green") 89 | 90 | def _(*args, **kwargs): 91 | import traceback 92 | from ml_logger import logger 93 | 94 | assert not (args and ARGS), f"can not use position argument at both thunk creation as well as " \ 95 | f"run.\n_args: {args}\nARGS: {ARGS}" 96 | 97 | logger.configure(log_directory=RUN.server, prefix=PREFIX, register_experiment=False, max_workers=10) 98 | logger.log_params(host=dict(hostname=logger.hostname), run=dict(status="running", startTime=logger.now())) 99 | 100 | try: 101 | _KWARGS = KWARGS.copy() 102 | _KWARGS.update(kwargs) 103 | 104 | fn(*(args or ARGS), **_KWARGS) 105 | 106 | logger.log_line("========= execution is complete ==========") 107 | logger.log_params(run=dict(status="completed", completeTime=logger.now())) 108 | except Exception as e: 109 | import time 110 | time.sleep(1) 111 | tb = traceback.format_exc() 112 | with logger.SyncContext(): # Make sure uploaded finished before termination. 
113 | logger.log_text(tb, filename="traceback.err") 114 | logger.log_params(run=dict(status="error", exitTime=logger.now())) 115 | logger.log_line(tb) 116 | logger.flush() 117 | time.sleep(30) 118 | raise e 119 | 120 | import time 121 | time.sleep(30) 122 | 123 | return _ 124 | -------------------------------------------------------------------------------- /e_maml_experiments/goal_cheetah_baseline.py: -------------------------------------------------------------------------------- 1 | """Minimal Exasmple for E-MAML running on HalfCheetahGoalDir-v0 2 | """ 3 | from e_maml_tf.config import RUN, G, Reporting, DEBUG 4 | from e_maml_tf.train import run_e_maml 5 | 6 | if __name__ == '__main__': 7 | 8 | G.env_name = "HalfCheetahGoalDir-v0" 9 | G.n_tasks = 20 10 | G.n_graphs = 1 11 | G.n_grad_steps = 5 12 | G.meta_n_grad_steps = 1 13 | 14 | # to debug this locally 15 | run_e_maml() 16 | 17 | # to launch with Jaynes on a SLURM cluster 18 | import jaynes 19 | jaynes.config('default') 20 | jaynes.run(run_e_maml, _G=vars(G)) 21 | -------------------------------------------------------------------------------- /e_maml_experiments/jaynes_demo.py: -------------------------------------------------------------------------------- 1 | """A simple demo for launching ML jobs with Jaynes. 2 | """ 3 | 4 | 5 | def train_fn(some_variable=0): 6 | import tensorflow as tf 7 | print(f"tensorflow version: {tf.__version__}") 8 | 9 | print('training is happening!') 10 | print("some_variable is", some_variable) 11 | 12 | 13 | if __name__ == "__main__": 14 | import jaynes 15 | 16 | jaynes.config('default') 17 | jaynes.run(train_fn, some_variable=5) 18 | 19 | jaynes.listen(timeout=60) 20 | -------------------------------------------------------------------------------- /e_maml_tf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/e-maml/336291d6819a82650d5bcc5f08dd431742897416/e_maml_tf/__init__.py -------------------------------------------------------------------------------- /e_maml_tf/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/e-maml/336291d6819a82650d5bcc5f08dd431742897416/e_maml_tf/algos/__init__.py -------------------------------------------------------------------------------- /e_maml_tf/algos/bc.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the behavior cloning algorithm. Takes in observations and demonstration actions, to supervise-learn a 3 | policy. 4 | 5 | -- Ge 6 | 7 | """ 8 | from collections import defaultdict, Sequence 9 | import tensorflow as tf 10 | from gym import spaces 11 | from typing import Callable, Union 12 | from waterbear import OrderedBear 13 | 14 | # NOTE: best way to define the input interface is to use a named_tuple and then others could just import the tuple from 15 | # here: https://pymotw.com/2/collections/namedtuple.html 16 | # NOTE: However pickle has trouble with namedtuple. Plus a class offers more functions, so we use a namespace instead. 17 | # InputT = namedtuple("Inputs", 'A ADV R OLD_NEG_LOG_P_AC OLD_V_PRED CLIP_RANGE X_act X_train') 18 | 19 | 20 | # Here we use a input class to make it easy to define defaults. 
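# Illustrative wiring of this module (a sketch, not executed here). The surrounding
# trainer constructs the policy and attaches its observation placeholder to `inputs.X`;
# `env`, `policy`, `obs_placeholder`, `demo_paths`, and `policy.trainables` are assumed
# names from the caller, not defined in this file:
#
#     inputs = Inputs(action_space=env.action_space)
#     inputs.X = obs_placeholder                 # set externally; Inputs does not create X
#     bc = BC(inputs=inputs, policy=policy)      # loss = mean neglogp of the expert actions
#     opt = Optimize(loss=bc.loss, trainables=policy.trainables, lr=1e-3, reports=bc.reports)
#     feed = path_to_feed_dict(inputs=inputs, paths=demo_paths)
#     opt.run_optimize(feed)                     # one supervised update on this demo batch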
21 | from e_maml_tf.ge_utils import placeholders_from_variables 22 | 23 | 24 | class Inputs: 25 | def __init__(self, *, action_space, A=None): 26 | if isinstance(action_space, spaces.Discrete): 27 | self.A = A or tf.placeholder(tf.int32, [None], name="A") 28 | else: 29 | self.A = A or tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 30 | 31 | 32 | class Reports(OrderedBear): 33 | loss = None 34 | act_norm = None 35 | targ_act_norm = None 36 | entropy = None 37 | 38 | 39 | class BC: 40 | def __init__(self, *, inputs: Inputs, policy): 41 | self.inputs = inputs 42 | self.policy = policy 43 | with tf.variable_scope('BC'): 44 | self.loss = tf.reduce_mean(policy.pd.neglogp(inputs.A)) # equivalent to L2 loss 45 | self.reports = Reports( 46 | loss=self.loss, 47 | act_norm=tf.reduce_mean(policy.pd.mean), 48 | targ_act_norm=tf.reduce_mean(inputs.A), 49 | entropy=tf.reduce_mean(policy.pd.entropy()) 50 | ) 51 | 52 | 53 | class Optimize: 54 | optimize = None 55 | run_optimize = None 56 | 57 | def __init__(self, *, loss, trainables, lr=None, max_grad_norm=None, max_grad_clip=None, strict=False, 58 | reports=None, **_): 59 | """ 60 | Graph constructor for the optmizer 61 | 62 | :param lr: The learning rate, usually a placeholder but can be a float. Not needed if using external optimizer, 63 | Needed here for the SGD update in the inner-step. 64 | If set to None, then does not construct the self.optimize operator and the self.run_optimize 65 | function. 66 | :param loss: 67 | :param trainables: Optional array used for the gradient calculation 68 | :param max_grad_norm: 69 | :param optimizer: 70 | :param _: 71 | """ 72 | with tf.variable_scope('BC_Optimize'): 73 | # optimizer.gradients is just a wrapper around tf.gradients, with extra assertions. This is why it raises 74 | # errors on non-trainables. 75 | _grads = tf.gradients(loss, trainables) 76 | if strict: 77 | for g in _grads: 78 | assert g is not None, f'Some Grads are not defined: {_grads}' 79 | else: 80 | _grads = [tf.zeros_like(p) if g is None else g for g, p in zip(_grads, trainables)] 81 | 82 | assert (not max_grad_norm or not max_grad_clip), \ 83 | f'max_grad_norm({max_grad_clip}) and max_grad_norm({max_grad_clip}) can not be trueful at the same time.' 84 | if max_grad_norm: # allow 0 to be by-pass 85 | # print('setting max-grad-norm to', max_grad_norm) 86 | # tf.clip_by_global_norm is just fine. No need to use my own. 87 | _grads = [g * tf.stop_gradient(max_grad_norm / tf.maximum(max_grad_norm, tf.norm(g))) for g in _grads] 88 | # _grads, grad_norm = tf.clip_by_global_norm(_grads, max_grad_norm) 89 | elif max_grad_clip: 90 | _grads = [tf.clip_by_value(g, -max_grad_clip, max_grad_clip) for g in _grads] 91 | 92 | self.grads = _grads 93 | 94 | # graph operator for updating the parameter. 
used by maml with the SGD inner step 95 | self.apply_grad = lambda *, lr, grad, var: var - lr * grad 96 | 97 | if lr is not None: 98 | assert hasattr(trainables[0], '_variable'), "trainables have to have the _variable attribute" 99 | lr_not_scalar = (hasattr(lr, 'shape') and len(lr.shape)) or (isinstance(lr, Sequence) and len(lr)) 100 | self.optimize = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 101 | for i, (v, g) in enumerate(zip(trainables, self.grads))] 102 | _ = self.optimize if reports is None else [*vars(reports).values(), *self.optimize] 103 | self.run_optimize = lambda feed_dict: tf.get_default_session().run(_, feed_dict=feed_dict) 104 | 105 | # Function to compute the CPI gradients 106 | self.run_grads = lambda *, feed_dict: tf.get_default_session().run([_grads], feed_dict) 107 | 108 | 109 | # note: this is singleton. Doesn't support having two instances. Move to class if that is needed. 110 | # sampling helpers for demonstration data 111 | SAMPLE_GENS = defaultdict(lambda: None) 112 | 113 | 114 | def sample_generator(*, paths_list: Union[list], batch_size=None, augment_fn: Union[None, Callable] = None, 115 | episodic_subsample_interval=1): 116 | """ 117 | mode == "timestep": 118 | The sampler samples each timestep individually. Different rollouts are always sampled individually. 119 | 120 | mode == "episode": 121 | Each episode (index = 1) are sampled individually (rollout). Timesteps are not shuffled. 122 | 123 | Episodic Subsampling: 124 | Only applies under mode == "episode". 125 | 126 | The episodic subsample occurs at fixed interval. The starting point of this subsampling is randomly sampled. 127 | 128 | 129 | :param paths: dict['obs', 'acs'], values are tensors of the shape 130 | 131 | Size(timesteps, n_envs, feat_n). 132 | 133 | This makes it easier to manipulate shuffling and slicing timestep wise. 134 | 135 | :param batch_size: size for the mini-batches. 136 | :param augment_fn: A function (*, obs, acs, *task_spec) => augmented path{obs, acs} 137 | Note: This augment_fn is called every mini-batch. It is task-specific. (takes in task_spec) 138 | :return: dict( 139 | obs = Size(batch_size, 1, feat_n), 140 | acs = Size(batch_size, 1, feat_n) 141 | ... 142 | ) 143 | """ 144 | import numpy as np 145 | 146 | # assert mode is 'multitask', "Only multitask mode is supported now." 147 | # assert augment_fn is None, "The augmentation function is not called under this mode." 148 | # Now allow augment_fn in multitask mode. 149 | 150 | p0 = paths_list[0] # assume that all data are identical shape. 151 | assert p0['obs'].shape[0] == p0['acs'].shape[0], "observation and actions need to have the same length." 152 | assert len(p0['obs'].shape) == 3, "observation (and action) are rank 3 tensors ~ Size(k, horizon, feat_n)." 153 | 154 | timesteps, k_rollouts, _ = p0['obs'].shape 155 | batch_size = batch_size or timesteps 156 | batch_n = timesteps * k_rollouts // batch_size 157 | assert timesteps % (episodic_subsample_interval * batch_size) == 0, f's.t. that shuffling works. 
' \ 158 | f'{timesteps} % ({episodic_subsample_interval} * {batch_size}) != 0' 159 | 160 | # the first next returns the number of batch :) 161 | task_spec = yield dict(batch_n=batch_n) 162 | 163 | assert timesteps % episodic_subsample_interval == 0, "has to be the right shape" 164 | new_shape = [episodic_subsample_interval, timesteps // episodic_subsample_interval, k_rollouts, -1] 165 | final_shape = [k_rollouts * episodic_subsample_interval, timesteps // episodic_subsample_interval, -1] 166 | paths = [{k: v.reshape(new_shape).swapaxes(1, 2).reshape(final_shape) if hasattr(v, 'shape') else v 167 | for k, v in _.items()} for _ in paths_list] 168 | while True: 169 | shuffled_inds = np.random.rand(episodic_subsample_interval * k_rollouts).argsort() 170 | # do all of the copying here. 171 | shuffled_paths = [{ 172 | k: v[shuffled_inds].reshape(timesteps * k_rollouts, -1) if isinstance(v, np.ndarray) 173 | else v for k, v in _.items()} for _ in paths] 174 | for i in range(batch_n): 175 | task_index = task_spec['index'] if task_spec else 0 176 | selected_paths = shuffled_paths[task_index] 177 | 178 | start = i * batch_size 179 | # no copy involved 180 | batch_paths = { 181 | k: v[start: start + batch_size].reshape(batch_size, 1, -1) if isinstance(v, np.ndarray) 182 | else v for k, v in selected_paths.items() 183 | } 184 | # obs_augment occurs here 185 | # note: pass in index=task_index explicitly, b/c task_spec can be None. 186 | task_spec = yield augment_fn(**batch_paths, index=task_index) if augment_fn else batch_paths 187 | 188 | 189 | def use_samples(key=None, **kwargs): 190 | global SAMPLE_GENS 191 | key = 'default' if key is None else key 192 | SAMPLE_GENS[key] = sample_generator(**kwargs) 193 | return next(SAMPLE_GENS[key]) # start the generator, return information of the generator. 194 | 195 | 196 | DATA_MODE = "multi-mode" # OneOf['multi-mode', 'simple'] 197 | 198 | 199 | # key = None or key = eval 200 | def sample_demonstration_data(task_spec=None, key=None): 201 | global SAMPLE_GENS 202 | import numpy as np 203 | # add logic here to support multi-mode. 204 | if DATA_MODE == "multi-mode": 205 | if key is None: 206 | keys = [k for k in SAMPLE_GENS.keys() if "/" not in k] 207 | else: 208 | keys = [k for k in SAMPLE_GENS.keys() if k == key or k.startswith(key + "/")] 209 | key = keys[np.random.choice(len(keys))] 210 | elif DATA_MODE == "simple": 211 | key = 'default' if key is None else key 212 | else: 213 | raise NotImplementedError 214 | 215 | g = SAMPLE_GENS.get(key, None) 216 | assert g is not None, f'sample key {key} does NOT exist. First call use_samples to setup this sample gen.' 217 | return next(g) if task_spec is None else g.send(task_spec) 218 | 219 | 220 | # in behavior cloning, we use the supervising observation and actions. 221 | # Assume the actions come from a gaussian policy 222 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, **_rest): 223 | """ 224 | convert path objects to feed_dict for the tensorflow graph. 225 | 226 | :param inputs: Input object 227 | :param paths: dict['obs', 'acs']: Size(n_timesteps, n_envs, feat_n) 228 | :param lr: placeholder or floating point number 229 | :param _rest: 230 | :return: feed_dict, keyed by the input placeholders. 231 | """ 232 | # reshaping the path, need to debug 233 | n_timesteps, n_envs, *_ = paths['obs'].shape 234 | n = n_timesteps * n_envs 235 | 236 | feed_dict = { 237 | inputs.X: paths['obs'].reshape(n, -1), 238 | inputs.A: paths['acs'].reshape(n, -1), 239 | # all of these are gone. 
240 | # inputs.OLD_NEG_LOG_P_AC: paths['neglogpacs'].reshape(-1), 241 | # inputs.OLD_V_PRED: paths['values'].reshape(-1), 242 | # These are useful if the agent receives the reward. 243 | # inputs.ADV: advs_normalized.reshape(-1), 244 | # inputs.R: paths['returns'].reshape(-1), 245 | # inputs.CLIP_RANGE: clip_range 246 | } 247 | if lr is not None: 248 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 249 | feed_dict[inputs.LR] = lr 250 | return feed_dict 251 | -------------------------------------------------------------------------------- /e_maml_tf/algos/bc_learned_loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the behavior cloning algorithm. Takes in observations and demonstration actions, to supervise-learn a 3 | policy. 4 | 5 | -- Ge 6 | 7 | """ 8 | import tensorflow as tf 9 | from gym import spaces 10 | from waterbear import OrderedBear 11 | 12 | 13 | # NOTE: best way to define the input interface is to use a named_tuple and then others could just import the tuple from 14 | # here: https://pymotw.com/2/collections/namedtuple.html 15 | # NOTE: However pickle has trouble with namedtuple. Plus a class offers more functions, so we use a namespace instead. 16 | # InputT = namedtuple("Inputs", 'A ADV R OLD_NEG_LOG_P_AC OLD_V_PRED CLIP_RANGE X_act X_train') 17 | 18 | 19 | class Inputs: 20 | # note: we do not pass in the observation placeholder b/c it is not used at all in the code base. 21 | # note: this breaks consistency with the rest of the algos folder but we can fix it when see fit. 22 | def __init__(self, *, action_space, type=None): 23 | if type in [LOSS_TYPES.two_headed_BC, LOSS_TYPES.learned_loss_exp_act, LOSS_TYPES.learned_loss_deep, 24 | LOSS_TYPES.learned_loss_exp_act_deep]: 25 | if isinstance(action_space, spaces.Discrete): 26 | self.A = tf.placeholder(tf.int32, [None], name="A") 27 | else: 28 | self.A = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 29 | 30 | 31 | class Reports(OrderedBear): 32 | # note: does not include optional keys. 33 | loss = None 34 | act_norm = None 35 | entropy = None 36 | 37 | 38 | class LOSS_TYPES: 39 | surrogate_target = "surrogate-target" 40 | a2_target = "a2_target" 41 | two_headed_BC = "two-headed-BC" 42 | learned_loss = "learned-BC-loss" 43 | learned_loss_deep = "learned-BC-loss-deep" # this doesnt work as well as the action one. 44 | learned_loss_exp_act = "learned-BC-loss-with-expert-action" 45 | learned_loss_exp_act_deep = "learned-BC-loss-with-expert-action-deep" 46 | 47 | 48 | def fc(x, scope, nh, act=tf.nn.relu): 49 | with tf.variable_scope(scope): 50 | nin = x.get_shape()[-1].value # can take batched or individual tensors. 51 | w = tf.get_variable("w", [nin, nh], initializer=tf.contrib.layers.xavier_initializer()) 52 | b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0)) 53 | z = tf.matmul(x, w) + b 54 | h = act(z) 55 | return h 56 | 57 | 58 | class BCLearnedLoss: 59 | def __init__(self, *, inputs: Inputs, policy, type: str): 60 | self.inputs = inputs 61 | self.policy = policy 62 | with tf.variable_scope('BCLearnedLoss'): 63 | act_dim = policy.pd.mean.shape[-1] 64 | if type == LOSS_TYPES.surrogate_target: 65 | # learned loss. 
Use identity function as the activation 66 | surrogate_action = fc(policy.h_, 'surrogate_action_target', nh=act_dim, act=lambda x: x) 67 | self.loss = tf.reduce_mean(policy.pd.neglogp(surrogate_action)) # equivalent to L2 loss 68 | self.reports = Reports( 69 | loss=self.loss, 70 | act_norm=tf.reduce_mean(policy.pd.mean), 71 | surrogate_act_norm=tf.reduce_mean(surrogate_action), 72 | entropy=tf.reduce_mean(policy.pd.entropy()) 73 | ) 74 | elif type == LOSS_TYPES.a2_target: 75 | # two headed architecture. The policy head is not BC trained. 76 | action = fc(tf.concat([policy.h_, policy.pd.mean], -1), 'bc-surrogate-head', nh=act_dim, 77 | act=lambda x: x) 78 | self.loss = tf.reduce_mean(action ** 2) 79 | self.reports = Reports( 80 | loss=self.loss, 81 | act_norm=tf.reduce_mean(policy.pd.mean), 82 | surrogate_act_norm=tf.reduce_mean(action), 83 | entropy=tf.reduce_mean(policy.pd.entropy()) 84 | ) 85 | elif type == LOSS_TYPES.two_headed_BC: 86 | # two headed architecture. The policy head is not BC trained. 87 | # Requires a BC action input 88 | surrogate_action = fc(tf.concat([policy.h_, policy.pd.mean], -1), 'surrogate_loss', nh=1, 89 | act=lambda x: x) 90 | self.loss = tf.reduce_mean(surrogate_action - inputs.A) 91 | self.reports = Reports( 92 | loss=self.loss, 93 | act_norm=tf.reduce_mean(policy.pd.mean), 94 | entropy=tf.reduce_mean(policy.pd.entropy()), 95 | surrogate_act_norm=tf.reduce_mean(surrogate_action), 96 | expert_act_norm=tf.reduce_mean(inputs.A), 97 | ) 98 | elif type == LOSS_TYPES.learned_loss: 99 | _ = tf.concat([inputs.X, policy.pd.mean], -1) 100 | _ = fc(_, 'learned_loss', nh=1, act=lambda x: x) 101 | self.loss = tf.reduce_mean(_ ** 2) # effectively 102 | self.reports = Reports( 103 | loss=self.loss, 104 | act_norm=tf.reduce_mean(policy.pd.mean), 105 | entropy=tf.reduce_mean(policy.pd.entropy()) 106 | ) 107 | elif type == LOSS_TYPES.learned_loss_deep: 108 | with tf.variable_scope('learned_loss'): 109 | _ = tf.concat([inputs.X, policy.pd.mean], -1) 110 | _ = fc(_, 'layer_1', nh=64) 111 | _ = fc(_, 'layer_2', nh=1, act=lambda x: x) 112 | self.loss = tf.reduce_mean(_ ** 2) # effectively 113 | self.reports = Reports( 114 | loss=self.loss, 115 | act_norm=tf.reduce_mean(policy.pd.mean), 116 | entropy=tf.reduce_mean(policy.pd.entropy()) 117 | ) 118 | elif type == LOSS_TYPES.learned_loss_exp_act: 119 | with tf.variable_scope('learned_loss'): 120 | _ = tf.concat([inputs.X, inputs.A, policy.pd.mean], -1) 121 | _ = fc(_, 'learned_loss', nh=1, act=lambda x: x) 122 | self.loss = tf.reduce_mean(_ ** 2) # effectively 123 | self.reports = Reports( 124 | loss=self.loss, 125 | expert_act_norm=tf.reduce_mean(inputs.A), 126 | act_norm=tf.reduce_mean(policy.pd.mean), 127 | entropy=tf.reduce_mean(policy.pd.entropy()) 128 | ) 129 | elif type == LOSS_TYPES.learned_loss_exp_act_deep: 130 | with tf.variable_scope('learned_loss'): 131 | _ = tf.concat([inputs.X, inputs.A, policy.pd.mean], -1) 132 | _ = fc(_, 'layer_1', nh=64) 133 | _ = fc(_, 'layer_2', nh=1, act=lambda x: x) 134 | self.loss = tf.reduce_mean(_ ** 2) # effectively 135 | self.reports = Reports( 136 | loss=self.loss, 137 | expert_act_norm=tf.reduce_mean(inputs.A), 138 | act_norm=tf.reduce_mean(policy.pd.mean), 139 | entropy=tf.reduce_mean(policy.pd.entropy()) 140 | ) 141 | else: 142 | raise NotImplemented 143 | 144 | 145 | # in behavior cloning, we use the supervising observation and actions. 
146 | # Assume the actions come from a gaussian policy 147 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, **_rest): 148 | """ 149 | convert path objects to feed_dict for the tensorflow graph. 150 | 151 | :param inputs: Input object 152 | :param paths: dict['obs', 'acs']: Size(n_timesteps, n_envs, feat_n) 153 | :param lr: placeholder or floating point number 154 | :param _rest: 155 | :return: feed_dict, keyed by the input placeholders. 156 | """ 157 | # reshaping the path, need to debug 158 | n_timesteps, n_envs, *_ = paths['obs'].shape 159 | n = n_timesteps * n_envs 160 | 161 | feed_dict = { 162 | inputs.X: paths['obs'].reshape(n, -1), 163 | } 164 | if hasattr(inputs, 'A') and inputs.A is not None: 165 | feed_dict[inputs.A] = paths['acs'].reshape(n, -1) 166 | if lr is not None: 167 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 168 | feed_dict[inputs.LR] = lr 169 | return feed_dict 170 | -------------------------------------------------------------------------------- /e_maml_tf/algos/cpi.py: -------------------------------------------------------------------------------- 1 | from collections import Sequence 2 | import tensorflow as tf 3 | from gym import spaces 4 | from waterbear import Bear 5 | from waterbear import OrderedBear 6 | 7 | from e_maml_tf.config import G 8 | 9 | # NOTE: best way to define the input interface is to use a named_tuple and then others could just import the tuple from 10 | # here: https://pymotw.com/2/collections/namedtuple.html 11 | # NOTE: However pickle has trouble with namedtuple. Plus a class offers more functions, so we use a namespace instead. 12 | # InputT = namedtuple("Inputs", 'A ADV R OLD_NEG_LOG_P_AC OLD_V_PRED CLIP_RANGE X_act X_train') 13 | 14 | 15 | # Here we use a input class to make it easy to define defaults. 16 | from e_maml_tf.ge_utils import placeholders_from_variables 17 | 18 | 19 | class Inputs: 20 | def __init__(self, *, action_space, value_baseline=False): 21 | if isinstance(action_space, spaces.Discrete): 22 | self.A = tf.placeholder(tf.int32, [None], name="A") 23 | else: 24 | self.A = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 25 | self.ADV = tf.placeholder(tf.float32, [None], name="ADV") 26 | self.OLD_NEG_LOG_P_AC = tf.placeholder(tf.float32, [None], name="OLD_NEG_LOG_P_AC") 27 | self.CLIP_RANGE = tf.placeholder(tf.float32, [], name="CLIP_RANGE") 28 | 29 | if value_baseline: 30 | self.R = tf.placeholder(tf.float32, [None], name="R") 31 | self.OLD_V_PRED = tf.placeholder(tf.float32, [None], name="OLD_V_PRED") 32 | 33 | 34 | class Reports(OrderedBear): 35 | loss = None 36 | entropy = None 37 | approx_kl = None 38 | clip_frac = None 39 | 40 | 41 | class CPI: 42 | vf_loss = None 43 | def __init__(self, *, inputs: Inputs, policy, vf_coef=None, ent_coef=None): 44 | self.inputs = inputs 45 | self.policy = policy 46 | with tf.variable_scope('CPI'): 47 | self.neglogpac = policy.pd.neglogp(inputs.A) 48 | entropy = tf.reduce_mean(policy.pd.entropy()) 49 | 50 | ratio = tf.exp(inputs.OLD_NEG_LOG_P_AC - self.neglogpac) 51 | pg_loss = tf.reduce_mean(-inputs.ADV * ratio) 52 | self.loss = pg_loss - entropy * ent_coef 53 | 54 | if policy.vf is not None: 55 | assert vf_coef is not None, \ 56 | "vf_coef can not be None when policy has value function." 
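# When the policy carries a value head, the lines below regress it onto the empirical
# returns R and fold the squared error, weighted by vf_coef, into the surrogate loss.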
57 | vpred = policy.vf 58 | vf_loss = tf.square(vpred - inputs.R) 59 | self.loss += vf_loss * vf_coef 60 | 61 | self.reports = Reports( 62 | loss=self.loss, 63 | entropy=entropy, 64 | approx_kl=.5 * tf.reduce_mean(tf.square(self.neglogpac - inputs.OLD_NEG_LOG_P_AC)), 65 | clip_frac=tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), inputs.CLIP_RANGE))) 66 | ) 67 | if policy.vf is not None: 68 | self.reports.vf_loss = vf_loss 69 | 70 | 71 | class Optimize: 72 | optimize = None 73 | run_optimize = None 74 | 75 | def __init__(self, *, loss, trainables, lr=None, max_grad_norm=None, max_grad_clip=None, 76 | strict=None, reports=None, **_): 77 | """ 78 | :param trainables: Optional array used for the gradient calculation 79 | """ 80 | with tf.variable_scope('CPI_Optimize'): 81 | # grad_placeholders = placeholders_from_variables(trainables) 82 | # optimizer.gradients is just a wrapper around tf.gradients, with extra assertions. This is why it raises 83 | # errors on non-trainables. 84 | _grads = tf.gradients(loss, trainables) 85 | if strict: 86 | for g in _grads: 87 | assert g is not None, f'Some Grads are not defined: {_grads}' 88 | else: 89 | _grads = [tf.zeros_like(p) if g is None else g for g, p in zip(_grads, trainables)] 90 | 91 | assert (not max_grad_norm or not max_grad_clip), \ 92 | f'max_grad_norm({max_grad_clip}) and max_grad_norm({max_grad_clip}) can not be trueful at the same time.' 93 | if max_grad_norm: # allow 0 to be by-pass 94 | # print('setting max-grad-norm to', max_grad_norm) 95 | # tf.clip_by_global_norm is just fine. No need to use my own. 96 | _grads = [g * tf.stop_gradient(max_grad_norm / tf.maximum(max_grad_norm, tf.norm(g))) for g in _grads] 97 | # _grads, grad_norm = tf.clip_by_global_norm(_grads, max_grad_norm) 98 | elif max_grad_clip: 99 | _grads = [tf.clip_by_value(g, -max_grad_clip, max_grad_clip) for g in _grads] 100 | 101 | self.grads = _grads 102 | 103 | # graph operator for updating the parameter. 
used by maml with the SGD inner step 104 | self.apply_grad = lambda *, lr, grad, var: var - lr * grad 105 | 106 | if lr is not None: 107 | assert hasattr(trainables[0], '_variable'), "trainables have to have the _variable attribute" 108 | lr_not_scalar = (hasattr(lr, 'shape') and len(lr.shape)) or (isinstance(lr, Sequence) and len(lr)) 109 | self.optimize = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 110 | for i, (v, g) in enumerate(zip(trainables, self.grads))] 111 | _ = self.optimize if reports is None else [*vars(reports).values(), *self.optimize] 112 | self.run_optimize = lambda feed_dict: tf.get_default_session().run(_, feed_dict=feed_dict) 113 | 114 | # Function to compute the CPI gradients 115 | self.run_grads = lambda *, feed_dict: tf.get_default_session().run([_grads], feed_dict) 116 | 117 | 118 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, clip_range, **_r): 119 | if 'adv' in paths: 120 | phi = paths['advs'] 121 | elif 'values' in paths: 122 | phi = paths['returns'] - paths['values'] 123 | else: 124 | phi = paths['returns'] 125 | # advs_normalized = (advs - advs.mean()) / (advs.std() + 1e-8) 126 | 127 | n_timesteps, n_envs, *_ = paths['obs'].shape 128 | n = n_timesteps * n_envs 129 | 130 | feed_dict = { 131 | inputs.X: paths['obs'].reshape(n, -1), 132 | inputs.A: paths['acs'].reshape(n, -1), 133 | inputs.ADV: phi.reshape(-1), 134 | inputs.OLD_NEG_LOG_P_AC: paths['neglogpacs'].reshape(-1), 135 | inputs.CLIP_RANGE: clip_range 136 | } 137 | if hasattr(inputs, 'OLD_V_PRED'): 138 | feed_dict[inputs.OLD_V_PRED] = paths['values'].reshape(-1) 139 | feed_dict[inputs.R] = paths['returns'].reshape(-1) 140 | if lr is not None: 141 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 142 | feed_dict[inputs.LR] = lr 143 | return feed_dict 144 | -------------------------------------------------------------------------------- /e_maml_tf/algos/ppo2.py: -------------------------------------------------------------------------------- 1 | from collections import Sequence 2 | import tensorflow as tf 3 | from gym import spaces 4 | from waterbear import OrderedBear 5 | 6 | # best way to define the input interface is to use a named_tuple and then others could just import the tuple from here: 7 | # https://pymotw.com/2/collections/namedtuple.html 8 | # InputT = namedtuple("Inputs", 'A ADV R OLD_NEG_LOG_P_AC OLD_V_PRED CLIP_RANGE X_act X_train') 9 | 10 | 11 | # Here we use a input class to make it easy to define defaults. 
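# Worked example of the clipped surrogate objective defined below (illustrative numbers):
# with CLIP_RANGE = 0.2, ADV = +1.0 and probability ratio r = 1.5,
#     pg_losses  = -ADV * r                  = -1.5
#     pg_losses2 = -ADV * clip(r, 0.8, 1.2)  = -1.2
#     max(pg_losses, pg_losses2)             = -1.2
# i.e. the objective stops rewarding ratios outside [1 - CLIP_RANGE, 1 + CLIP_RANGE],
# which keeps each PPO update close to the sampling policy.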
12 | from e_maml_tf.ge_utils import placeholders_from_variables 13 | 14 | 15 | class Inputs: 16 | def __init__(self, *, action_space, value_baseline=False): 17 | if isinstance(action_space, spaces.Discrete): 18 | self.A = tf.placeholder(tf.int32, [None], name="A") 19 | else: 20 | self.A = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 21 | 22 | self.ADV = tf.placeholder(tf.float32, [None], name="ADV") 23 | self.OLD_NEG_LOG_P_AC = tf.placeholder(tf.float32, [None], name="OLD_NEG_LOG_P_AC") 24 | self.CLIP_RANGE = tf.placeholder(tf.float32, [], name="CLIP_RANGE") 25 | 26 | if value_baseline: 27 | self.R = tf.placeholder(tf.float32, [None], name="R") 28 | self.OLD_V_PRED = tf.placeholder(tf.float32, [None], name="OLD_V_PRED") 29 | 30 | 31 | class Reports(OrderedBear): 32 | loss = None 33 | entropy = None 34 | act_norm = None 35 | pg_loss = None 36 | # vf_loss = None 37 | approx_kl = None 38 | clip_frac = None 39 | 40 | 41 | class PPO: 42 | vf_loss = None 43 | 44 | def __init__(self, *, inputs: Inputs, policy, vf_coef=None, ent_coef=None): 45 | self.inputs = inputs 46 | self.policy = policy 47 | with tf.variable_scope('PPO'): 48 | self.neglogpac = policy.pd.neglogp(inputs.A) 49 | entropy = tf.reduce_mean(policy.pd.entropy()) 50 | 51 | ratio = tf.exp(inputs.OLD_NEG_LOG_P_AC - self.neglogpac) 52 | pg_losses = -inputs.ADV * ratio 53 | pg_losses2 = -inputs.ADV * tf.clip_by_value(ratio, 1.0 - inputs.CLIP_RANGE, 1.0 + inputs.CLIP_RANGE) 54 | pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) 55 | self.loss = pg_loss - entropy * ent_coef 56 | 57 | if policy.vf is not None: 58 | assert vf_coef is not None, \ 59 | "vf_coef can not be None when policy has value function." 60 | vpred = policy.vf 61 | vpred_clipped = inputs.OLD_V_PRED + \ 62 | tf.clip_by_value(policy.vf - inputs.OLD_V_PRED, - inputs.CLIP_RANGE, inputs.CLIP_RANGE) 63 | vf_losses1 = tf.square(vpred - inputs.R) 64 | vf_losses2 = tf.square(vpred_clipped - inputs.R) 65 | vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) 66 | self.loss += vf_loss * vf_coef 67 | 68 | self.reports = Reports( 69 | loss=self.loss, 70 | entropy=entropy, 71 | act_norm=tf.reduce_mean(inputs.A), 72 | pg_loss=pg_loss, 73 | approx_kl=.5 * tf.reduce_mean(tf.square(self.neglogpac - inputs.OLD_NEG_LOG_P_AC)), 74 | clip_frac=tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), inputs.CLIP_RANGE))) 75 | ) 76 | if policy.vf is not None: 77 | self.reports.vf_loss = vf_loss 78 | 79 | 80 | class Optimize: 81 | optimize = None 82 | run_optimize = None 83 | 84 | def __init__(self, *, loss, trainables, lr=None, max_grad_norm=None, max_grad_clip=None, 85 | strict=False, reports=None, **_): 86 | """ 87 | :param trainables: Optional array used for the gradient calculation 88 | """ 89 | with tf.variable_scope('PPO_Optimize'): 90 | # grad_placeholders = placeholders_from_variables(trainables) 91 | # optimizer.gradients is just a wrapper around tf.gradients, with extra assertions. This is why it raises 92 | # errors on non-trainables. 93 | _grads = tf.gradients(loss, trainables) 94 | if strict: 95 | for g in _grads: 96 | assert g is not None, f'Some Grads are not defined: {_grads}' 97 | else: 98 | _grads = [tf.zeros_like(p) if g is None else g for g, p in zip(_grads, trainables)] 99 | 100 | assert (not max_grad_norm or not max_grad_clip), \ 101 | f'max_grad_norm({max_grad_clip}) and max_grad_norm({max_grad_clip}) can not be trueful at the same time.' 
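# note: the branch below multiplies each gradient by max_grad_norm / max(max_grad_norm, ||g||),
# so gradients with norm <= max_grad_norm pass through unchanged while larger ones are
# rescaled to have norm exactly max_grad_norm. This clips per-tensor norms, unlike
# tf.clip_by_global_norm, which rescales by the joint norm of all gradients.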
102 | if max_grad_norm: # allow 0 to be by-pass 103 | # print('setting max-grad-norm to', max_grad_norm) 104 | # tf.clip_by_global_norm is just fine. No need to use my own. 105 | _grads = [g * tf.stop_gradient(max_grad_norm / tf.maximum(max_grad_norm, tf.norm(g))) for g in _grads] 106 | # _grads, grad_norm = tf.clip_by_global_norm(_grads, max_grad_norm) 107 | elif max_grad_clip: 108 | _grads = [tf.clip_by_value(g, -max_grad_clip, max_grad_clip) for g in _grads] 109 | 110 | self.grads = _grads 111 | 112 | # graph operator for updating the parameter. used by maml with the SGD inner step 113 | self.apply_grad = lambda *, lr, grad, var: var - lr * grad 114 | 115 | if lr is not None: 116 | assert hasattr(trainables[0], '_variable'), "trainables have to have the _variable attribute" 117 | lr_not_scalar = (hasattr(lr, 'shape') and len(lr.shape)) or (isinstance(lr, Sequence) and len(lr)) 118 | self.optimize = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 119 | for i, (v, g) in enumerate(zip(trainables, self.grads))] 120 | _ = self.optimize if reports is None else [*vars(reports).values(), *self.optimize] 121 | self.run_optimize = lambda feed_dict: tf.get_default_session().run(_, feed_dict=feed_dict) 122 | 123 | # Function to compute the PPO gradients 124 | self.run_grads = lambda *, feed_dict: tf.get_default_session().run([_grads], feed_dict) 125 | 126 | 127 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, clip_range, **_r): 128 | if 'adv' in paths: 129 | phi = paths['advs'] 130 | elif 'values' in paths: 131 | phi = paths['returns'] - paths['values'] 132 | else: 133 | phi = paths['returns'] 134 | # advs_normalized = (advs - advs.mean()) / (advs.std() + 1e-8) 135 | 136 | n_timesteps, n_envs, *_ = paths['obs'].shape 137 | n = n_timesteps * n_envs 138 | 139 | feed_dict = { 140 | inputs.X: paths['obs'].reshape(n, -1), 141 | inputs.A: paths['acs'].reshape(n, -1), 142 | inputs.ADV: phi.reshape(-1), 143 | inputs.OLD_NEG_LOG_P_AC: paths['neglogpacs'].reshape(-1), 144 | inputs.CLIP_RANGE: clip_range 145 | } 146 | if hasattr(inputs, 'OLD_V_PRED'): 147 | feed_dict[inputs.OLD_V_PRED] = paths['values'].reshape(-1) 148 | feed_dict[inputs.R] = paths['returns'].reshape(-1) 149 | if lr is not None: 150 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 151 | feed_dict[inputs.LR] = lr 152 | return feed_dict 153 | -------------------------------------------------------------------------------- /e_maml_tf/algos/sac.py: -------------------------------------------------------------------------------- 1 | from collections import Sequence 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from gym import spaces 6 | from waterbear import OrderedBear 7 | 8 | 9 | # best way to define the input interface is to use a named_tuple and then others could just import the tuple from here: 10 | # https://pymotw.com/2/collections/namedtuple.html 11 | # InputT = namedtuple("Inputs", 'A ADV R OLD_NEG_LOG_P_AC OLD_V_PRED CLIP_RANGE X_act X_train') 12 | 13 | # helper utilities 14 | class ReplayBuffer: 15 | """ 16 | A simple FIFO experience replay buffer for SAC agents. 
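    Illustrative usage (the dimensions here are placeholders, not values used in this repo):

        buffer = ReplayBuffer(obs_dim=17, act_dim=6, size=1_000_000)
        buffer.store(obs, act, rew, next_obs, done)    # one transition at a time
        batch = buffer.sample_batch(batch_size=256)    # keys: obs, obs_next, acs, rews, dones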
17 | """ 18 | 19 | def __init__(self, obs_dim, act_dim, size): 20 | import numpy as np 21 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 22 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 23 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 24 | self.rews_buf = np.zeros(size, dtype=np.float32) 25 | self.done_buf = np.zeros(size, dtype=np.float32) 26 | self.ptr, self.size, self.max_size = 0, 0, size 27 | 28 | def store(self, obs, act, rew, next_obs, done): 29 | self.obs1_buf[self.ptr] = obs 30 | self.obs2_buf[self.ptr] = next_obs 31 | self.acts_buf[self.ptr] = act 32 | self.rews_buf[self.ptr] = rew 33 | self.done_buf[self.ptr] = done 34 | self.ptr = (self.ptr + 1) % self.max_size 35 | self.size = min(self.size + 1, self.max_size) 36 | 37 | def sample_batch(self, batch_size=32): 38 | idxs = np.random.randint(0, self.size, size=batch_size) 39 | return dict(obs=self.obs1_buf[idxs], 40 | obs_next=self.obs2_buf[idxs], 41 | acs=self.acts_buf[idxs], 42 | rews=self.rews_buf[idxs], 43 | dones=self.done_buf[idxs]) 44 | 45 | 46 | # Here we use a input class to make it easy to define defaults. 47 | from e_maml_tf.ge_utils import placeholders_from_variables 48 | 49 | 50 | class Inputs: 51 | def __init__(self, *, ob_shape, action_space, ): 52 | self.X = tf.placeholder(dtype=tf.float32, shape=ob_shape, name='obs') 53 | self.X_NEXT = tf.placeholder(dtype=tf.float32, shape=ob_shape, name='obs') 54 | 55 | if isinstance(action_space, spaces.Discrete): 56 | self.A = tf.placeholder(tf.int32, [None], name="A") 57 | else: 58 | self.A = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 59 | 60 | self.R = tf.placeholder(tf.float32, [None], name="R") 61 | self.DONE = tf.placeholder(tf.float32, [None], name="DONE") 62 | 63 | 64 | def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None): 65 | for h in hidden_sizes[:-1]: 66 | x = tf.layers.dense(x, units=h, activation=activation) 67 | return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation) 68 | 69 | 70 | class Critic: 71 | def __init__(self, inputs, pi, hidden_sizes=(400, 200), activation='relu', scope='Critic', reuse=False): 72 | """ 73 | 74 | :param X: placehodler for th 75 | :param A: placeholder for the sampled actions 76 | :param pi: the reparameterized action 77 | :param hidden_sizes: 78 | :param activation: 79 | :param scope: 80 | :param reuse: 81 | """ 82 | if activation == 'tanh': 83 | act = tf.tanh 84 | elif activation == "relu": 85 | act = tf.nn.relu 86 | else: 87 | raise TypeError(f"{activation} is not available in this MLP.") 88 | 89 | def vf_mlp(x): 90 | _ = mlp(x, [*hidden_sizes, 1], act, None) 91 | return tf.squeeze(_, 1) 92 | 93 | with tf.variable_scope(scope): 94 | # note: allow passing in trainables. 
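# Snapshot the variables that already exist so that, after the Q and V networks are built
# below, `self.trainables` can be recovered by set difference, i.e. only the variables
# created inside this Critic scope.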
95 | _old_trainables = {*tf.trainable_variables()} 96 | 97 | x_a_ = tf.concat([inputs.X, inputs.A], -1) 98 | x_pi_ = tf.concat([inputs.X, pi], -1) 99 | with tf.variable_scope('Q_0', reuse=reuse): 100 | self.q_0 = vf_mlp(x_a_) 101 | with tf.variable_scope('Q_0', reuse=True): 102 | self.q_0_pi = vf_mlp(x_pi_) 103 | with tf.variable_scope('Q_1', reuse=reuse): 104 | self.q_1 = vf_mlp(x_a_) 105 | with tf.variable_scope('Q_1', reuse=True): 106 | self.q_1_pi = vf_mlp(x_pi_) 107 | 108 | _ = tf.trainable_variables() 109 | with tf.variable_scope('v', reuse=reuse): 110 | self.v = vf_mlp(inputs.X) 111 | self.v_trainables = [v for v in tf.trainable_variables() if v not in _] 112 | 113 | self.trainables = [v for v in tf.trainable_variables() if v not in _old_trainables] 114 | 115 | _ = tf.trainable_variables() 116 | with tf.variable_scope('v_target', reuse=reuse): 117 | self.v_targ = vf_mlp(inputs.X_NEXT) 118 | self.v_targ_trainables = [v for v in tf.trainable_variables() if v not in _] 119 | 120 | 121 | class Reports(OrderedBear): 122 | value_loss = None 123 | pi_kl = None 124 | q0_loss = None 125 | q1_loss = None 126 | v_loss = None 127 | entropy = None 128 | act_norm = None 129 | 130 | 131 | class SAC: 132 | from e_maml_tf.ge_policies import MlpPolicy 133 | def __init__(self, *, inputs: Inputs, policy: MlpPolicy, critic: Critic, polyak, ent_coef, gamma): 134 | self.inputs = inputs 135 | self.policy = policy 136 | self.critic = critic 137 | with tf.variable_scope('SAC'): 138 | min_q_pi = tf.minimum(critic.q_0_pi, critic.q_0_pi) 139 | q_backup = tf.stop_gradient(inputs.R + gamma * (1 - inputs.DONE) * critic.v_targ) 140 | v_backup = tf.stop_gradient(min_q_pi - ent_coef * policy.logpac) 141 | 142 | # this first term is using the Q function as an energy model to compute the KL divergence between 143 | # the policy distribution and the distribution from the Q function (critic) 144 | self.pi_loss = tf.reduce_mean(ent_coef * policy.logpac - critic.q_0_pi) 145 | q0_loss = 0.5 * tf.reduce_mean(tf.square(q_backup - critic.q_0)) 146 | q1_loss = 0.5 * tf.reduce_mean(tf.square(q_backup - critic.q_1)) 147 | v_loss = 0.5 * tf.reduce_mean(tf.square(v_backup - critic.v)) 148 | self.value_loss = q0_loss + q1_loss + v_loss 149 | 150 | self.update_v_targ_ops = [tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) 151 | for v_main, v_targ in zip(critic.v_trainables, critic.v_targ_trainables)] 152 | 153 | # entropy = tf.reduce_mean(policy.pd.entropy()) 154 | self.reports = Reports( 155 | value_loss=self.value_loss, 156 | pi_kl=self.pi_loss, 157 | q0_loss=q0_loss, 158 | q1_loss=q1_loss, 159 | v_loss=v_loss, 160 | entropy=policy.entropy, 161 | act_norm=tf.reduce_mean(inputs.A), 162 | ) 163 | 164 | 165 | class Optimize: 166 | optimize = None 167 | run_optimize = None 168 | 169 | def __init__(self, policy_loss, policy_trainables, critic_loss, critic_trainables, lr=None, reports=None, **_): 170 | """ 171 | :param trainables: Optional array used for the gradient calculation 172 | """ 173 | with tf.variable_scope('SAC_Optimize'): 174 | # Note: optimizer.gradients is just a wrapper around tf.gradients, with extra assertions. This is why it 175 | # raises errors on non-trainables. 176 | self.policy_grads = tf.gradients(policy_loss, policy_trainables) 177 | self.critic_grads = tf.gradients(critic_loss, critic_trainables) 178 | 179 | # graph operator for updating the parameter. 
used by maml with the SGD inner step 180 | self.apply_grad = lambda *, lr, grad, var: var - lr * grad 181 | 182 | if lr is not None: # this is only called when we use this algo inside MAML, with SGD inner step. 183 | # todo: not used, not tested, but should be correct. 184 | assert hasattr(policy_trainables[0], '_variable'), "trainables have to have the _variable attribute" 185 | lr_not_scalar = (hasattr(lr, 'shape') and len(lr.shape)) or (isinstance(lr, Sequence) and len(lr)) 186 | pi_opt_op = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 187 | for i, (v, g) in enumerate(zip(policy_trainables, self.policy_grads))] 188 | 189 | with tf.control_dependencies(pi_opt_op): 190 | self.optimize = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 191 | for i, (v, g) in enumerate(zip(critic_trainables, self.critic_grads))] 192 | 193 | _ = self.optimize if reports is None else [*vars(reports).values(), *self.optimize] 194 | self.run_optimize = lambda feed_dict: tf.get_default_session().run(_, feed_dict=feed_dict) 195 | 196 | 197 | BUFFER: ReplayBuffer = None 198 | 199 | 200 | def use_replay_buffer(buffer): 201 | global BUFFER 202 | BUFFER = buffer 203 | 204 | 205 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, **_r): 206 | """ 207 | In SAC (and other value-based, non-policy gradient methods, where the policy gradient is provided 208 | by the true critic), the path_to_feed_dict function is stateful and contains a 209 | replay buffer. 210 | 211 | :param inputs: 212 | :param paths: 213 | :param lr: 214 | :param clip_range: 215 | :param _r: 216 | :return: 217 | """ 218 | assert BUFFER is not None, "BUFFER is None. You need to first setup the replay buffer" 219 | 220 | buffer = BUFFER 221 | 222 | n_timesteps, n_envs, *_ = paths['obs'].shape 223 | n = n_timesteps * n_envs 224 | 225 | obs = paths['obs'].reshape(n, -1) 226 | acs = paths['acs'].reshape(n, -1) 227 | rewards = paths['rewards'].reshape(n, -1) 228 | dones = paths['dones'].reshape(n, -1) 229 | 230 | for step in range(1, n_timesteps): 231 | buffer.store(obs[step - 1], acs[step], rewards[step], obs[step], dones[step]) 232 | 233 | _ = buffer.sample_batch(batch_size=n) 234 | 235 | feed_dict = { 236 | inputs.X: _['obs'], 237 | inputs.X_NEXT: _['obs_next'], 238 | inputs.A: _['acs'], 239 | inputs.R: _['rews'], 240 | inputs.DONE: _['dones'] 241 | } 242 | if lr is not None: 243 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 244 | feed_dict[inputs.LR] = lr 245 | return feed_dict 246 | -------------------------------------------------------------------------------- /e_maml_tf/algos/vpg.py: -------------------------------------------------------------------------------- 1 | from collections import Sequence 2 | import tensorflow as tf 3 | from gym import spaces 4 | from waterbear import OrderedBear 5 | 6 | import baselines.common.tf_util as U 7 | from e_maml_tf.config import RUN, DEBUG, G 8 | 9 | # Here we use a input class to make it easy to define defaults. 
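# The loss built below is mean(ADV * neglogp(A)), whose gradient is -E[ADV * grad log pi(A|X)].
# A plain SGD step theta <- theta - lr * grad(loss) therefore performs the REINFORCE /
# vanilla policy-gradient ascent step theta <- theta + lr * E[ADV * grad log pi(A|X)].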
10 | from e_maml_tf.ge_utils import placeholders_from_variables 11 | 12 | 13 | class Inputs: 14 | def __init__(self, *, action_space, value_baseline=False, ): 15 | # self.X = X or tf.placeholder(tf.float32, [None], name="obs") 16 | if isinstance(action_space, spaces.Discrete): 17 | self.A = tf.placeholder(tf.int32, [None], name="A") 18 | else: 19 | self.A = tf.placeholder(tf.float32, [None] + list(action_space.shape), name="A") 20 | self.ADV = tf.placeholder(tf.float32, [None], name="ADV") 21 | 22 | if value_baseline: 23 | self.R = tf.placeholder(tf.float32, [None], name="R") 24 | 25 | 26 | class Reports(OrderedBear): 27 | loss = None 28 | entropy = None 29 | approx_kl = None 30 | 31 | 32 | class VPG: 33 | vf_loss = None 34 | def __init__(self, *, inputs, policy, vf_coef=None): 35 | self.inputs = inputs 36 | self.policy = policy 37 | with tf.variable_scope("VPG"): 38 | self.neglogpac = policy.pd.neglogp(inputs.A) 39 | 40 | self.vpg_loss = tf.reduce_mean(inputs.ADV * self.neglogpac) 41 | self.loss = self.vpg_loss # <== this is the value function loss ratio. 42 | 43 | if policy.vf is not None: 44 | self.vf_loss = tf.square(policy.vf - inputs.R) 45 | self.loss += self.vf_loss * vf_coef 46 | # used for reporting 47 | self.reports = Reports( 48 | loss=self.loss, 49 | entropy=tf.reduce_mean(policy.pd.entropy()), 50 | # approx_kl=.5 * tf.reduce_mean(tf.square(self.neglogpac - inputs.OLD_NEG_LOG_P_AC)) 51 | ) 52 | if policy.vf is not None: 53 | self.reports.vf_loss = self.vf_loss 54 | 55 | 56 | class Optimize(object): 57 | optimize = None 58 | run_optimize = None 59 | 60 | def __init__(self, *, loss, trainables, lr=None, max_grad_norm=None, max_grad_clip=None, optimizer="SGD", 61 | strict=None, 62 | reports=None, **_): 63 | """ 64 | If lr is None, do not create the self.optimize operator. 65 | 66 | :param loss: 67 | :param trainables: 68 | :param lr: 69 | :param max_grad_norm: 70 | :param max_grad_clip: 71 | :param optimizer: 72 | :param strict: 73 | :param reports: 74 | :param _: 75 | """ 76 | with tf.variable_scope('VPG_Optimize'): 77 | # optimizer.gradients is just a wrapper around tf.gradients, with extra assertions. This is why it raises 78 | # errors on non-trainables. 79 | _grads = tf.gradients(loss, trainables) 80 | if strict: 81 | for g in _grads: 82 | assert g is not None, f'Some Grads are not defined: {_grads}' 83 | else: 84 | _grads = [tf.zeros_like(p) if g is None else g for g, p in zip(_grads, trainables)] 85 | 86 | assert (not max_grad_norm or not max_grad_clip), \ 87 | f'max_grad_norm({max_grad_clip}) and max_grad_norm({max_grad_clip}) can not be trueful at the same time.' 88 | if max_grad_norm: # allow 0 to be by-pass 89 | # print('setting max-grad-norm to', max_grad_norm) 90 | # tf.clip_by_global_norm is just fine. No need to use my own. 91 | _grads = [g * tf.stop_gradient(max_grad_norm / tf.maximum(max_grad_norm, tf.norm(g))) for g in _grads] 92 | # _grads, grad_norm = tf.clip_by_global_norm(_grads, max_grad_norm) 93 | elif max_grad_clip: 94 | _grads = [tf.clip_by_value(g, -max_grad_clip, max_grad_clip) for g in _grads] 95 | 96 | self.grads = _grads 97 | 98 | # beta = tf.get_variable('RMSProp_beta') 99 | # avg_grad = tf.get_variable('RMSProp_avg_g') 100 | # avg_grad = beta * avg_grad + (1 - beta) * grad 101 | # graph operator for updating the parameter. 
used by maml with the SGD inner step 102 | self.apply_grad = lambda *, lr, grad, var: var - lr * grad 103 | 104 | if lr is not None: 105 | assert hasattr(trainables[0], '_variable'), "trainables have to have the _variable attribute" 106 | lr_not_scalar = (hasattr(lr, 'shape') and len(lr.shape)) or (isinstance(lr, Sequence) and len(lr)) 107 | self.optimize = [v.assign(self.apply_grad(lr=lr[i] if lr_not_scalar else lr, grad=g, var=v)) 108 | for i, (v, g) in enumerate(zip(trainables, self.grads))] 109 | _ = self.optimize if reports is None else [*vars(reports).values(), *self.optimize] 110 | self.run_optimize = lambda feed_dict: tf.get_default_session().run(_, feed_dict=feed_dict) 111 | 112 | # Function to compute the PPO gradients 113 | self.run_grads = lambda *, feed_dict: tf.get_default_session().run([_grads], feed_dict) 114 | 115 | 116 | def path_to_feed_dict(*, inputs: Inputs, paths, lr=None, **_r): 117 | if 'adv' in paths: 118 | phi = paths['advs'] 119 | elif 'values' in paths: 120 | phi = paths['returns'] - paths['values'] 121 | else: 122 | phi = paths['returns'] 123 | # advs_normalized = (advs - advs.mean()) / (advs.std() + 1e-8) 124 | 125 | n_timesteps, n_envs, *_ = paths['obs'].shape 126 | n = n_timesteps * n_envs 127 | 128 | feed_dict = { 129 | inputs.X: paths['obs'].reshape(n, -1), 130 | inputs.A: paths['acs'].reshape(n, -1), 131 | inputs.ADV: phi.reshape(-1), 132 | } 133 | 134 | if hasattr(inputs, 'R'): 135 | feed_dict[inputs.R] = paths['returns'].reshape(-1) 136 | if lr is not None: 137 | assert inputs.LR is not None, f'Input should have LR attribute if a learning rate is passed.' 138 | feed_dict[inputs.LR] = lr 139 | return feed_dict 140 | -------------------------------------------------------------------------------- /e_maml_tf/config.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from params_proto import cli_parse, Proto 3 | 4 | ALLOWED_ALGS = "rl_algs.PPO", "rl_algs.VPG", "PPO", "VPG" 5 | DIR_TEMPLATE = "{now:%Y-%m-%d}/e_maml_tf/" \ 6 | "{G.run_mode}-{G.env_name}-n_grad({G.n_grad_steps})" \ 7 | "-{G.inner_alg}-{G.inner_optimizer}" \ 8 | "-{G.meta_alg}-{G.meta_optimizer}-alpha({G.alpha})-beta({G.beta})" \ 9 | "-n_graphs({G.n_graphs})-env_norm({G.normalize_env})" \ 10 | "-grad_norm({G.inner_max_grad_norm})-meta_grad_norm({G.meta_max_grad_norm})-{now:%H%M%S}-{now:%f}" 11 | 12 | from datetime import datetime 13 | 14 | now = datetime.now() 15 | 16 | 17 | @cli_parse 18 | class RUN: 19 | log_dir = "" 20 | log_prefix = 'e-maml-debug' 21 | 22 | 23 | def config_run(**_G): 24 | G.update(_G) 25 | from datetime import datetime 26 | now = datetime.now() 27 | RUN.log_prefix = DIR_TEMPLATE.format(now=now, G=G) 28 | 29 | 30 | # decorator help generate a as command line parser. 31 | @cli_parse 32 | class G: 33 | # Termination conditions 34 | term_loss_threshold = 100 35 | term_reward_threshold = -8000.0 36 | 37 | run_mode = "maml" # type: "Choose between maml and e_maml. 
Switches the loss function used for training" 38 | e_maml_lambda = Proto(1.0, help="The scaling factor for the E-MAML term") 39 | # env_name = 'HalfCheetah-v2' # type: "Name of the task environment" 40 | env_name = 'HalfCheetahGoalDir-v0' # type: "Name of the task environment" 41 | start_seed = Proto(0, help="seed for initialization of each game") 42 | render = False 43 | n_cpu = multiprocessing.cpu_count() * 2 # type: "number of threads used" 44 | 45 | # (E_)MAML Training Parameters 46 | n_tasks = Proto(20, help="40 for locomotion, 20 for 2D navigation ref:cbfinn") 47 | n_graphs = Proto(1, help="number of parallel graphs for multi-device parallelism. Hard coded to 1 atm.") 48 | n_grad_steps = 5 # type: "number of gradient descent steps for the worker." #TODO change back to 1 49 | meta_n_grad_steps = Proto(1, help="number of gradient descent steps for the meta algorithm.") 50 | reuse_meta_optimizer = Proto(True, help="Whether to use the same AdamW optimizer for all " 51 | "meta gradient steps. MUCH FASTER to initialize with [True].") 52 | eval_grad_steps = Proto(list(range(n_grad_steps + 1)), 53 | help="the gradient steps at which we evaluate the policy. Used to make pretty plots.") 54 | 55 | bias_dim = Proto(20, help="the input bias variable dimension that breaks the input symmetry") 56 | # 40k per task (action, state) tuples, or 20k (per task) if you have 10/20 meta tasks 57 | n_parallel_envs = 40 # type: "Number of parallel envs in minibatch. The SubprocVecEnv batch_size." 58 | batch_timesteps = 100 # type: "max_steps for each episode, used to set env._max_steps parameter" 59 | 60 | epoch_init = Proto(0, help="the epoch to start with.") 61 | n_epochs = 800 # type: "Number of epochs" 62 | eval_interval = Proto(None, help="epoch interval for evaluation.") 63 | eval_num_envs = Proto(n_parallel_envs, help="default to same as sampling envs") 64 | eval_timesteps = Proto(50, help="batch size for the evaluation RL runs") 65 | 66 | record_movie_interval = 500 67 | start_movie_after_epoch = 700 68 | render_num_envs = Proto(10, help="keep small b/c rendering is slow") 69 | movie_timesteps = 100 # type: "now runs in batch mode" 70 | start_checkpoint_after_epoch = Proto(200, help="epoch at which start saving checkpoints.") 71 | checkpoint_interval = Proto(None, help="the frequency for saving checkpoints on the policy") 72 | load_from_checkpoint = Proto(None, help="the path to the checkpoint file (saved by logger) to be loaded at the" 73 | " beginning of the training session. Also includes the learned loss, " 74 | "and learned learning rates if available.") 75 | 76 | # RL sampling settings 77 | reset_on_start = Proto(False, help="reset the environment at the beginning of each episode. " 78 | "Do NOT use this when using SubProcessVecEnv") 79 | 80 | # behavior cloning 81 | mask_meta_bc_data = Proto(False, help='masking the state space for one-shot imitation baseline') 82 | # bc_eval_timesteps = Proto(100, help="number of timesteps for evaluation") 83 | episode_subsample = Proto(1, help='the subsampling ratio for episodic training dataset. Active under episode mode') 84 | sample_limit = Proto(None, help='the number of timesteps uses in behavior cloning algorithm.') 85 | k_fold = Proto(5, help='the k-fold cross validation') 86 | 87 | env_max_timesteps = Proto(0, help="max_steps for each episode, used to set env._max_steps parameter. 0 to use " 88 | "gym default.") 89 | single_sampling = 0 # type: "flag for running a single sampling step. 
1 ON, 0 OFF" 90 | baseline = Proto('linear', help="using the critic as the baseline") 91 | use_gae = Proto(True, help="flag to turn GAE on and off") 92 | # GAE runner options 93 | gamma = Proto(0.995, help="GAE gamma") 94 | lam = Proto(0.97, help="GAE lambda") 95 | # Imperfect Demonstration Options 96 | # imperfect_demo = Proto(None, help='flag to turn on the systematic noise for the imperfect demonstration') 97 | # demo_offset_abs = Proto(None, help='size of the systematic offset to the goal position in expert demo') 98 | # demo_noise_scale = Proto(None, help='scale of the noise added to the goal position in expert demo') 99 | 100 | # MAML Options 101 | first_order = Proto(False, help="Whether to stop gradient calculation during meta-gradient calculation") 102 | alpha = 0.05 # type: "worker learning rate. use 0.1 for first step, 0.05 afterward ref:cbfinn" 103 | meta_sgd = Proto(None, help='One of [None, True, "full"]. When full learns alpha same shape as tensors.') 104 | beta = 0.01 # type: "meta learning rate" 105 | inner_alg = "VPG" # type: '"PPO" or "VPG", "rl_algs.VPG" or "rl_algs.PPO" for rl_algs baselines' 106 | learned_loss_type = None 107 | inner_optimizer = "SGD" # type: '"AdamW", "Adam", or "SGD"' 108 | meta_alg = "PPO" # type: "PPO or TRPO, TRPO is not yet implemented." 109 | meta_optimizer = "AdamW" # type: '"AdamW", "Adam" or "SGD"' 110 | activation = "tanh" 111 | n_layers = 4 # type: "the number of hidden layers for the policy network. Sometimes, bigger, is better" 112 | hidden_size = 64 # type: "hidden size for the MLP policy" 113 | 114 | # Model options 115 | use_k_index = Proto(False, help="whether to wrap k_index around the environment. Helps for the value baseline") 116 | normalize_env = False # type: "normalize the environment" 117 | vf_coef = 0.5 # type: "loss weighing coefficient for the value function loss. with the VPG loss being 1.0" 118 | ent_coef = 0.01 # type: "PPO entropy coefficient" 119 | inner_max_grad_norm = 1.0 # type: "PPO maximum gradient norm" 120 | meta_max_grad_norm = 1.0 # type: "PPO maximum gradient norm" 121 | inner_max_grad_clip = Proto(None, help="maximum gradient clip") 122 | meta_max_grad_clip = Proto(None, help="maximum gradient clip") 123 | clip_range = Proto(0.2, help="PPO clip_range parameter") 124 | 125 | # policy parameters 126 | init_logstd = Proto(0, help="initial log standard deviation of the gaussian policy") 127 | control_variance = Proto(False, help='flag for fixing the variance of the policy for the inner worker. Helps ' 128 | 'prevent inner adaptation from gaining too much from reducing variance.') 129 | fix_meta_variance = Proto(False, help="flag for fixing the meta runner's variance.") 130 | std_l2_coef = Proto(0, help="the regularization coefficient for the standard deviation") 131 | 132 | # Grid World config parameters 133 | change_colors = 0 # type: "shuffle colors of the board game" 134 | change_dynamics = 0 # type: 'shuffle control actions (up down, left right) of the game' 135 | 136 | 137 | @cli_parse 138 | class Reporting: 139 | report_mean = False # type: "plot the mean instead of the total reward per episode" 140 | log_device_placement = False 141 | 142 | 143 | @cli_parse 144 | class DEBUG: 145 | """To debug: 146 | Set debug_params = 1, 147 | set debug_apply_gradient = 1. 148 | Then the gradient ratios between the worker and the meta runner should be print out, and they should be 1. 149 | Otherwise, the runner model is diverging from the meta network. 
150 | """ 151 | no_weight_reset = Proto(0, help="flag to turn off the caching and resetting the weights") 152 | no_task_resample = Proto(0, help="by-pass task re-sample") 153 | -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/README.md: -------------------------------------------------------------------------------- 1 | # Custom Vendor (Patches) 2 | 3 | This module patches OpenAI `gym` and `al_algs`. Specifically, it 4 | 5 | - adds various gym tasks that are used in the `maml-rl` experiments 6 | - `HalfCheetahGoalVel-v0`: This is the Cheetah task with a velocity goal. 7 | 8 | - patches the `al_algs` `Wrapper` class, which was implemented disregarding wrappee method passing. 9 | - same applying to `SubprocVecEnv` and it's `worker` method. 10 | 11 | The reason to monkey patch is so that these modifications can stay at one place instead of buried in the forked source code of these large libraries. This makes it easier to merge back upstream. -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/__init__.py: -------------------------------------------------------------------------------- 1 | from .patches import * 2 | # from .krazy_worlds.krazy_world_envs import * 3 | from .maze_env import * 4 | from .half_cheetah_goal_velocity import * 5 | from .half_cheetah_goal_direction import * 6 | 7 | IS_PATCHED = True 8 | -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/half_cheetah_goal_direction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs import register 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | 7 | class Controls: 8 | goal_direction = 1 9 | 10 | def sample(self, goal_direction=None): 11 | if goal_direction is not None: 12 | self.goal_direction = goal_direction 13 | else: 14 | self.goal_direction = 1 if np.random.rand() > 0.5 else -1 15 | 16 | 17 | class HalfCheetahGoalDirEnv(mujoco_env.MujocoEnv, utils.EzPickle): 18 | """ 19 | Half cheetah environment with a randomly generated goal path. 20 | """ 21 | 22 | def __init__(self): 23 | self.controls = Controls() 24 | # call super init after initializing the variables. 25 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 26 | utils.EzPickle.__init__(self) 27 | 28 | def step(self, action): 29 | # xposbefore = self.model.data.qpos[0, 0] 30 | # change model api to work with Mujoco1.5 31 | xposbefore = self.data.qpos[0] 32 | self.do_simulation(action, self.frame_skip) 33 | # xposafter = self.model.data.qpos[0, 0] 34 | xposafter = self.data.qpos[0] 35 | ob = self._get_obs() 36 | reward_ctrl = - 1e-1 * 0.5 * np.square(action).sum() # add factor of 0.5, ref cbfinn. 
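# Sign convention for the lines below: reward = reward_ctrl - goal_direction * velocity,
# so a task with goal_direction = +1 is maximized by running in the -x direction and
# goal_direction = -1 by running in +x; reward_ctrl is always a non-positive control penalty.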
37 | velocity = (xposafter - xposbefore) / self.dt 38 | cost_run = self.controls.goal_direction * velocity 39 | reward = reward_ctrl - cost_run 40 | done = False 41 | return ob, reward, done, dict(cost_run=cost_run, reward_ctrl=reward_ctrl) 42 | 43 | def _get_obs(self): 44 | return np.concatenate([ 45 | # self.model.data.qpos.flat[1:], 46 | self.data.qpos[1:], 47 | # self.model.data.qvel.flat, 48 | self.data.qvel, 49 | ]) 50 | 51 | def set_goal_direction(self, goal_direction=None): 52 | self.controls.sample(goal_direction=goal_direction) 53 | 54 | def get_goal_direction(self): # only for debugging 55 | return self.controls.goal_direction 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 59 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 60 | self.set_state(qpos, qvel) 61 | return self._get_obs() 62 | 63 | def viewer_setup(self): 64 | self.viewer.cam.distance = self.model.stat.extent * 0.5 65 | 66 | 67 | register( 68 | id='HalfCheetahGoalDir-v0', 69 | # todo: use module.sub_module:ClassName syntax to work with rcall and cloudpickle. 70 | # entry_point=lambda: HalfCheetahGoalVelEnv(), 71 | entry_point="e_maml_tf.custom_vendor.half_cheetah_goal_direction:HalfCheetahGoalDirEnv", 72 | kwargs={}, 73 | max_episode_steps=200, 74 | reward_threshold=4800.0, 75 | ) 76 | -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/half_cheetah_goal_velocity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs import register 4 | from gym.envs.mujoco import mujoco_env 5 | from gym.utils import seeding 6 | 7 | 8 | class Controls: 9 | goal_velocity = 0.5 10 | low = 0.0 11 | high = 2.0 12 | 13 | def sample(self, goal_velocity=None): 14 | if goal_velocity: 15 | self.goal_velocity = goal_velocity 16 | else: 17 | self.goal_velocity = np.random.uniform(low=self.low, high=self.high) 18 | 19 | 20 | class HalfCheetahGoalVelEnv(mujoco_env.MujocoEnv, utils.EzPickle): 21 | """ 22 | Half cheetah environment with a randomly generated goal path. 23 | """ 24 | 25 | def __init__(self): 26 | self.controls = Controls() 27 | # call super init after initializing the variables. 28 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 29 | utils.EzPickle.__init__(self) 30 | 31 | def seed(self, seed=None): 32 | self.np_random, seed = seeding.np_random(seed) 33 | return [seed] 34 | 35 | def step(self, action): 36 | # xposbefore = self.model.data.qpos[0, 0] 37 | # change model api to work with Mujoco1.5 38 | xposbefore = self.data.qpos[0] 39 | self.do_simulation(action, self.frame_skip) 40 | # xposafter = self.model.data.qpos[0, 0] 41 | xposafter = self.data.qpos[0] 42 | ob = self._get_obs() 43 | reward_ctrl = - 1e-1 * 0.5 * np.square(action).sum() # add factor of 0.5, ref cbfinn. 
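# The lines below penalize the absolute gap between the instantaneous forward velocity
# and the sampled goal velocity (uniform in [0, 2] by default, see Controls.sample),
# so the running reward peaks when the cheetah holds exactly the target speed.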
44 | velocity = (xposafter - xposbefore) / self.dt 45 | cost_run = abs(velocity - self.controls.goal_velocity) 46 | reward = reward_ctrl - cost_run 47 | done = False 48 | return ob, reward, done, dict(cost_run=cost_run, reward_ctrl=reward_ctrl) 49 | 50 | def _get_obs(self): 51 | return np.concatenate([ 52 | # self.model.data.qpos.flat[1:], 53 | self.data.qpos[1:], 54 | # self.model.data.qvel.flat, 55 | self.data.qvel, 56 | ]) 57 | 58 | def set_goal_velocity(self, goal_velocity=None): 59 | # print('***** goal velocity **********>>', goal_velocity) 60 | self.controls.sample(goal_velocity=goal_velocity) 61 | 62 | def get_goal_velocity(self): # only for debugging 63 | return self.controls.goal_velocity 64 | 65 | def reset_model(self): 66 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 67 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 68 | self.set_state(qpos, qvel) 69 | return self._get_obs() 70 | 71 | def viewer_setup(self): 72 | self.viewer.cam.distance = self.model.stat.extent * 0.5 73 | 74 | 75 | register( 76 | id='HalfCheetahGoalVel-v0', 77 | # todo: use module.sub_module:ClassName syntax to work with rcall and cloudpickle. 78 | # entry_point=lambda: HalfCheetahGoalVelEnv(), 79 | entry_point="e_maml_tf.custom_vendor.half_cheetah_goal_velocity:HalfCheetahGoalVelEnv", 80 | kwargs={}, 81 | max_episode_steps=200, 82 | reward_threshold=4800.0, 83 | ) 84 | -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/maze_env.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import os 3 | from glob import glob 4 | 5 | import numpy as np 6 | from gym import spaces, Env, utils 7 | from gym.envs import register 8 | 9 | 10 | class MazeEnv(Env, utils.EzPickle): 11 | _action_set = np.array([[0, 1], [0, -1], [1, 0], [-1, 0]], np.int32) 12 | 13 | def _seed(self, seed): 14 | self.task_rng = np.random.RandomState(seed) 15 | 16 | def __init__(self, batch_size, path, n_episodes=5, episode_horizon=12, seed=69, num_envs=None): 17 | self.action_space = spaces.Discrete(4) 18 | self.observation_space = spaces.Box(-1, 1, 15) # new(1), rew(1), onehot act(4), obs(9) 19 | self._mazes = np.load(path)['arr_0'] 20 | self.num_envs = num_envs 21 | self.reward_range = [-100, 100] 22 | self.metadata = dict() 23 | 24 | # maze id (1), ep count (1), time count (1), cur loc (2), goal(2), start(2) 25 | self._state = np.zeros([batch_size, 9], np.int32) 26 | self._alive_cost = 1.0 / episode_horizon 27 | self.batch_size, self.n_episodes, self.episode_horizon = batch_size, n_episodes, episode_horizon 28 | self._state_for_reset = copy.deepcopy(self._state) 29 | # self.init_pos_rng = random.Random(init_pos_seed) 30 | self._seed(seed) 31 | self.reset_task() 32 | 33 | utils.EzPickle.__init__(self) 34 | 35 | def _close(self): 36 | pass 37 | 38 | def _reset(self, dones=None): 39 | if dones is None: 40 | dones = np.ones(self.batch_size, np.bool) 41 | 42 | batch_size = np.sum(dones) 43 | self._state = copy.deepcopy(self._state_for_reset) 44 | obs = np.zeros((batch_size,) + self.observation_space.shape, np.float32) 45 | obs[:, 0] = 1 46 | obs[:, 6:] = self._get_obs()[dones] 47 | return obs.squeeze() 48 | 49 | def reset_task(self, dones=None): 50 | if dones is None: 51 | dones = np.ones(self.batch_size, np.bool) 52 | 53 | batch_size = np.sum(dones) 54 | maze_idx = self.task_rng.randint(len(self._mazes), size=batch_size) 55 | # maze_idx = self.mazes_fixed 56 | starts, goals = [], [] 57 
| for i in maze_idx: 58 | locs = list(zip(*np.where(~self._mazes[i]))) 59 | # starts.append(self.starts_fixed) 60 | # goals.append(self.goals_fixed) 61 | starts.append(locs[self.task_rng.randint(len(locs))]) 62 | goals.append(locs[self.task_rng.randint(len(locs))]) 63 | 64 | self._state[dones, 0] = maze_idx 65 | self._state[dones, 1:3] = 0 66 | self._state[dones, 3:5] = np.array(starts) 67 | self._state[dones, 5:7] = np.array(goals) 68 | self._state[dones, 7:9] = np.array(starts) 69 | self._state_for_reset = copy.deepcopy(self._state) 70 | 71 | obs = np.zeros((batch_size,) + self.observation_space.shape, np.float32) 72 | obs[:, 0] = 1 73 | obs[:, 6:] = self._get_obs()[dones] 74 | return obs.squeeze() 75 | 76 | def _step(self, actions): 77 | t = self._state[:, 2] 78 | next_loc = self._state[:, 3:5] + self._action_set[actions] 79 | hit_wall = self._mazes[self._state[:, 0], next_loc[:, 0], next_loc[:, 1]] 80 | 81 | self._state[~hit_wall, 3:5] = next_loc[~hit_wall] 82 | t[:] += 1 83 | 84 | at_goal = np.equal(self._state[:, 3:5], self._state[:, 5:7]).all(1) 85 | finished_episode = np.equal(t, self.episode_horizon) | at_goal 86 | t[finished_episode] = 0 87 | self._state[finished_episode, 1] += 1 88 | self._state[finished_episode, 3:5] = self._state[finished_episode, 7:9] 89 | 90 | rewards = (1 + self._alive_cost) * at_goal - 1e-3 * hit_wall - self._alive_cost 91 | dones = np.equal(self._state[:, 1], self.n_episodes) 92 | 93 | obs = np.zeros((self.batch_size,) + self.observation_space.shape, np.float32) 94 | obs[:, 0] = finished_episode 95 | obs[:, 1] = rewards 96 | obs[np.arange(1), 2 + actions] = 1.0 97 | obs[:, 6:] = self._get_obs() 98 | return obs.squeeze(), rewards.squeeze(), dones.squeeze(), dict() 99 | 100 | def _get_obs(self): 101 | x, y = self._state[:, 3:5].T 102 | dx, dy = np.meshgrid(np.arange(-1, 2), np.arange(-1, 2), indexing='ij') 103 | xi, yi = x[:, None, None] + dx, y[:, None, None] + dy 104 | mi = self._state[:, :1, None] 105 | obs = self._mazes[mi, xi, yi].reshape((-1, 9)) 106 | if obs[:, 4].any(): 107 | raise ValueError 108 | return obs 109 | 110 | 111 | MAZE_DATA_PATH = glob('../../krazy_grid_world/maze_data/*.npz') # use as assertion base 112 | directory = os.path.dirname(__file__) 113 | MAZES = { 114 | "Maze10-v0": os.path.join(directory, '../../krazy_grid_world/maze_data/mazes_10k_10x10.npz'), 115 | "Maze20-v0": os.path.join(directory, '../../krazy_grid_world/maze_data/mazes_10k_20x20.npz'), 116 | "MazeTest-v0": os.path.join(directory, '../../krazy_grid_world/maze_data/mazes_test_10k_20x20.npz') 117 | } 118 | for env_id, path in MAZES.items(): 119 | register( 120 | env_id, 121 | entry_point="custom_vendor.maze_env:MazeEnv", 122 | kwargs=dict(path=path, batch_size=1, n_episodes=1, episode_horizon=12), 123 | max_episode_steps=12, 124 | reward_threshold=50.0 125 | ) 126 | -------------------------------------------------------------------------------- /e_maml_tf/custom_vendor/patches.py: -------------------------------------------------------------------------------- 1 | import gym 2 | gym.logger.set_level(40) # set logging level to avoid annoying warning. 
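# Overview of the monkey patches applied below (they take effect on import via
# e_maml_tf.custom_vendor.__init__):
#   1. gym.core.Wrapper.__getattribute__ falls back to the wrapped env, so custom methods
#      such as set_goal_direction() stay reachable through TimeLimit and other wrappers.
#   2. SubprocVecEnv.__getattr__ plus a replacement worker forward arbitrary method calls
#      to every subprocess env and stack the per-env results.
#   3. MujocoEnv.close tolerates viewers that fail while shutting down.
#   4. DiagGaussianPdType.pdfromlatent returns the action mean alongside the distribution.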
3 | 4 | 5 | def __getattribute__(self, attr_name): 6 | try: 7 | return object.__getattribute__(self, attr_name) 8 | except AttributeError: 9 | return object.__getattribute__(self.env, attr_name) 10 | 11 | 12 | gym.core.Wrapper.__getattribute__ = __getattribute__ 13 | 14 | ### subproc_vec_env patches 15 | import numpy as np 16 | import baselines.common.vec_env.subproc_vec_env as subproc 17 | 18 | 19 | def __getattr__(self, attr_name): 20 | if attr_name in dir(self): 21 | return object.__getattribute__(self, attr_name) 22 | else: 23 | def remote_exec(*args, **kwargs): 24 | for remote in self.remotes: 25 | remote.send((attr_name, dict(args=args, kwargs=kwargs))) 26 | return np.stack([remote.recv() for remote in self.remotes]) 27 | 28 | return remote_exec 29 | 30 | 31 | subproc.SubprocVecEnv.__getattr__ = __getattr__ 32 | 33 | 34 | def worker(remote, parent_remote, env_fn_wrapper): 35 | parent_remote.close() 36 | env = env_fn_wrapper.x() 37 | while True: 38 | cmd, data = remote.recv() 39 | if cmd == 'step': 40 | ob, reward, done, info = env.step(data) 41 | if done: 42 | ob = env.reset() 43 | remote.send((ob, reward, done, info)) 44 | elif cmd == 'reset': 45 | ob = env.reset() 46 | remote.send(ob) 47 | elif cmd == 'reset_task': 48 | ob = env.reset_task() 49 | remote.send(ob) 50 | elif cmd == 'close': 51 | remote.close() 52 | break 53 | elif cmd == 'get_spaces': 54 | remote.send((env.observation_space, env.action_space)) 55 | else: 56 | # todo: to distinguish between a functional call and a getitem, this needs some more thought 57 | remote.send(getattr(env, cmd)(*data['args'], **data['kwargs'])) 58 | 59 | 60 | subproc.worker = worker 61 | 62 | from gym.envs.mujoco.mujoco_env import MujocoEnv 63 | 64 | 65 | def close(self): 66 | if self.viewer is not None: 67 | try: 68 | self.viewer.finish() 69 | except: 70 | pass 71 | self.viewer = None 72 | 73 | 74 | MujocoEnv.close = close 75 | 76 | from baselines.common import distributions 77 | 78 | 79 | def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0): 80 | import tensorflow as tf 81 | import numpy as np 82 | import baselines.common.tf_util as U 83 | from baselines.a2c.utils import fc 84 | mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias) 85 | logstd = tf.get_variable(name='logstd', shape=[1, self.size], initializer=tf.zeros_initializer()) 86 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 87 | return self.pdfromflat(pdparam), mean 88 | 89 | 90 | distributions.DiagGaussianPdType.pdfromlatent = pdfromlatent 91 | -------------------------------------------------------------------------------- /e_maml_tf/distributions.py: -------------------------------------------------------------------------------- 1 | from baselines.common.distributions import Pd, DiagGaussianPd 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | 6 | class SquashedDiagGaussianPd(Pd): 7 | def __init__(self, flat): 8 | self.flat = flat 9 | mean, logstd = tf.split(axis=len(flat.shape) - 1, num_or_size_splits=2, value=flat) 10 | self.mean = tf.tanh(mean) 11 | self.logstd = logstd 12 | self.std = tf.exp(logstd) 13 | 14 | def flatparam(self): 15 | return self.flat 16 | 17 | def mode(self): 18 | return self.mean 19 | 20 | def neglogp(self, x): 21 | _ = 0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std), axis=-1) \ 22 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 23 | + tf.reduce_sum(self.logstd, axis=-1) 24 | _ += tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - self.sample() ** 2, l=0, u=1) + 1e-6), 
axis=1) 25 | return _ 26 | 27 | def kl(self, other): 28 | assert isinstance(other, DiagGaussianPd) 29 | return tf.reduce_sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / ( 30 | 2.0 * tf.square(other.std)) - 0.5, axis=-1) 31 | 32 | def entropy(self): 33 | return tf.reduce_sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 34 | 35 | def sample(self): 36 | return tf.tanh(self.mean + self.std * tf.random_normal(tf.shape(self.mean))) 37 | 38 | @classmethod 39 | def fromflat(cls, flat): 40 | return cls(flat) 41 | -------------------------------------------------------------------------------- /e_maml_tf/e_maml_ge.py: -------------------------------------------------------------------------------- 1 | """stands for 'correction-MAML. Could also argue complete-maml. Whatever.""" 2 | from collections import Sequence 3 | from numbers import Number 4 | from typing import Any 5 | 6 | import matplotlib 7 | from tqdm import trange 8 | import tensorflow as tf 9 | 10 | from e_maml_tf.ge_utils import get_scope_name, stem 11 | from .config import G 12 | from e_maml_tf.algos.vpg import Inputs as VPGInputs, VPG, Optimize as VPG_Optimize 13 | from e_maml_tf.algos.ppo2 import Inputs as PPOInputs, PPO, Optimize as PPO_Optimize 14 | from e_maml_tf.algos.cpi import Inputs as CPIInputs, CPI, Optimize as CPI_Optimize 15 | from e_maml_tf.algos.bc import Inputs as BCInputs, BC, Optimize as BC_Optimize 16 | from e_maml_tf.algos.bc_learned_loss import Inputs as BCLearnedLossInputs, BCLearnedLoss 17 | from .ge_utils import defaultlist, make_with_custom_variables, GradientSum, Cache, var_map 18 | 19 | matplotlib.use("Agg") 20 | 21 | import baselines.common.tf_util as U 22 | from .ge_policies import MlpPolicy 23 | 24 | ALLOWED_ALGS = ('VPG', 'PPO', 'CPI', "BC", "BCLearnedLoss") 25 | 26 | 27 | class Meta: 28 | optim = None 29 | 30 | def __init__(self, *, scope_name, act_space, ob_shape, algo, reuse: Any = False, trainables=None, optimizer=None, 31 | add_loss=None, loss_only=False, lr_rank=None, max_grad_norm=None, max_grad_clip=None, 32 | fix_variance=False): 33 | """ 34 | Meta Graph Constructor 35 | 36 | :param scope_name: 37 | :param act_space: 38 | :param ob_shape: 39 | :param algo: 40 | :param reuse: 41 | :param trainables: 42 | :param optimizer: 43 | :param lr_rank: One of [None, 0, 1, 2] corresponding to [(), 'scalar', 'simple', "full"] learned learning rate. 
44 | :param max_grad_norm: 45 | :param max_grad_clip: 46 | :param fix_variance: 47 | """ 48 | assert algo in ALLOWED_ALGS, "model algorithm need to be one of {}".format(ALLOWED_ALGS) 49 | with tf.variable_scope(scope_name, reuse=reuse): 50 | obs = tf.placeholder(dtype=tf.float32, shape=ob_shape, name='obs') # obs 51 | if algo == "PPO": 52 | self.inputs = inputs = PPOInputs(action_space=act_space, value_baseline=(G.baseline == "critic")) 53 | Optimize = PPO_Optimize 54 | elif algo == "VPG": 55 | self.inputs = inputs = VPGInputs(action_space=act_space, value_baseline=(G.baseline == "critic")) 56 | Optimize = VPG_Optimize 57 | elif algo == "CPI": 58 | self.inputs = inputs = CPIInputs(action_space=act_space, value_baseline=(G.baseline == "critic")) 59 | Optimize = CPI_Optimize 60 | elif algo == "BC": 61 | self.inputs = inputs = BCInputs(action_space=act_space) 62 | Optimize = BC_Optimize 63 | elif algo == "BCLearnedLoss": 64 | self.inputs = inputs = BCLearnedLossInputs(action_space=act_space, type=G.learned_loss_type) 65 | Optimize = BC_Optimize 66 | else: 67 | raise NotImplementedError( 68 | 'Only supports PPO, VPG, CPI, BC and BC with Learned Loss (BCLearnedLoss)') 69 | inputs.X = obs # https://github.com/tianheyu927/mil/blob/master/mil.py#L218 70 | bias_transformation = tf.get_variable('input_bias', [1, G.bias_dim], initializer=tf.zeros_initializer()) 71 | batch_n = tf.shape(obs)[0] 72 | trans_input = tf.tile(bias_transformation, [batch_n, 1]) 73 | self.policy = policy = MlpPolicy( 74 | ac_space=act_space, hidden_size=G.hidden_size, n_layers=G.n_layers, 75 | activation=G.activation, value_baseline=(G.baseline == "critic"), 76 | reuse=reuse, X=tf.concat(values=(obs, trans_input), axis=1), X_placeholder=obs, 77 | init_logstd=G.init_logstd, fix_variance=fix_variance) 78 | 79 | # note that policy.trainables are the original trainable parameters, not the mocked variables. 80 | # todo: concatenate policy.trainable with local trainable (bias_transformation) 81 | self.trainables = tf.trainable_variables() if trainables is None else trainables 82 | 83 | ext_loss = add_loss(inputs.ADV) if callable(add_loss) else None 84 | if algo == "PPO": 85 | self.model = PPO(inputs=inputs, policy=policy, vf_coef=G.vf_coef, ent_coef=G.ent_coef) 86 | elif algo == "VPG": 87 | self.model = VPG(inputs=inputs, policy=policy, vf_coef=G.vf_coef) 88 | elif algo == "CPI": 89 | self.model = CPI(inputs=inputs, policy=policy, vf_coef=G.vf_coef, ent_coef=G.ent_coef) 90 | elif algo == "BC": 91 | self.model = BC(inputs=inputs, policy=policy) 92 | elif algo == "BCLearnedLoss": 93 | self.model = BCLearnedLoss(inputs=inputs, policy=policy, type=G.learned_loss_type) 94 | 95 | self.loss = self.model.loss if ext_loss is None else (self.model.loss + ext_loss) 96 | 97 | if not loss_only: 98 | if lr_rank == 0: 99 | inputs.LR = lr = tf.placeholder(tf.float32, shape=[], name="LR") 100 | elif lr_rank == 1: 101 | inputs.LR = lr = tf.placeholder(tf.float32, shape=(len(self.trainables),), name="LR") 102 | elif lr_rank == 2: 103 | inputs.LR = lr = [tf.placeholder(tf.float32, shape=t.shape, name=f"LR_{stem(t, 2)}") 104 | for t in self.trainables] 105 | elif lr_rank is None: 106 | lr = None 107 | else: 108 | raise NotImplementedError(f"lr_rank = {lr_rank} is not supported. 
Check for programming error.") 109 | self.optim = Optimize(lr=lr, loss=self.loss, reports=self.model.reports, 110 | trainables=self.trainables, max_grad_norm=max_grad_norm, 111 | max_grad_clip=max_grad_clip, optimizer=optimizer) 112 | 113 | 114 | def _mean(x, axis=None, keepdims=False): 115 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims=keepdims) 116 | 117 | 118 | def cmaml_loss(neglogpacs, advantage): 119 | mean_adv = _mean(advantage) 120 | # we attribute adv to all workers in the style of DICE 121 | exploration_term = _mean(neglogpacs) * mean_adv 122 | return exploration_term * G.e_maml_lambda 123 | 124 | 125 | class SingleTask: 126 | def __init__(self, act_space, ob_shape, trainable_map, meta_trainable_map=None, lr=None): 127 | # no need to go beyond despite of large G.eval_grad_steps, b/c RL samples using runner policy. 128 | 129 | if meta_trainable_map is None: 130 | meta_trainable_map = trainable_map 131 | 132 | self.workers = defaultlist(None) 133 | self.metas = defaultlist(None) 134 | 135 | params = defaultlist(None) 136 | params[0] = meta_trainable_map.copy() 137 | params[0].update(trainable_map) 138 | 139 | import gym 140 | assert type(act_space) is gym.spaces.Box 141 | act_dim, *_ = act_space.shape 142 | 143 | for k in range(G.n_grad_steps + 1): 144 | if k < G.n_grad_steps: # 0 - 9, 145 | 146 | self.workers[k] = worker = make_with_custom_variables( 147 | lambda: Meta(scope_name=f'inner_{k}_grad_network', 148 | act_space=act_space, ob_shape=ob_shape, algo=G.inner_alg, 149 | # do NOT pass in learning rate to inhibit the Meta.optimize operator. 150 | optimizer=G.inner_optimizer, reuse=True, trainables=list(params[k].values()), 151 | max_grad_norm=G.inner_max_grad_norm, max_grad_clip=G.inner_max_grad_clip, 152 | fix_variance=True 153 | ), # pass in the trainable_map for proper gradient 154 | params[k], f'{get_scope_name()}/inner_{k}_grad_network/' 155 | ) 156 | 157 | with tf.variable_scope(f'SGD_grad_{k}'): 158 | if (isinstance(lr, Sequence) and len(lr)) or (hasattr(lr, 'shape') and len(lr.shape)): 159 | learn_rates = lr[k] 160 | else: 161 | worker.inputs.LR = lr # this is important because this is needed by the feed_dict 162 | learn_rates = [lr] * len(worker.optim.grads) 163 | params[k + 1] = meta_trainable_map.copy() 164 | if G.first_order: 165 | params[k + 1].update({k: worker.optim.apply_grad(lr=lr, grad=tf.stop_gradient(g), var=v) 166 | for g, lr, (k, v) in 167 | zip(learn_rates, worker.optim.grads, params[k].items())}) 168 | else: 169 | params[k + 1].update({k: worker.optim.apply_grad(lr=lr, grad=g, var=v) 170 | for g, lr, (k, v) in 171 | zip(learn_rates, worker.optim.grads, params[k].items())}) 172 | 173 | if k == G.n_grad_steps: # 10 or 1. 174 | add_loss = None if G.run_mode != 'e-maml' \ 175 | else lambda ADV: cmaml_loss([w.model.neglogpac for w in self.workers], ADV) 176 | self.meta = make_with_custom_variables( 177 | lambda: Meta(scope_name="meta_network", act_space=act_space, ob_shape=ob_shape, 178 | algo=G.meta_alg, reuse=True, add_loss=add_loss, loss_only=True, ) 179 | , params[k], f'{get_scope_name()}/meta_network/' 180 | ) 181 | 182 | # Expose as non-public API for debugging purposes 183 | self._params = params 184 | 185 | 186 | def assert_match(l1, l2): 187 | assert len(l1) > 0 188 | for i, (a, b) in enumerate(zip(l1, l2)): 189 | assert a == b, "existing items has to be the same." 190 | return l1[i + 1:] if len(l1) > len(l2) else l2[i + 1:] 191 | 192 | 193 | # Algorithm Summary 194 | # 1. 
[sample] with pi(theta) `run_episode` 195 | # 2. compute policy gradient (vanilla) 196 | # 3. apply gradient to get \theta' using SGD 197 | # 4. [sample] with pi(theta') `run_episode` 198 | # 5. use PPO, compute meta gradient 199 | # 6. sum up the PPO gradient from multiple tasks and average 200 | # 6. apply this gradient 201 | class E_MAML: 202 | gradient_sum = None 203 | alpha = None 204 | 205 | def __init__(self, ob_space, act_space): 206 | """ 207 | Usage: 208 | self.env = env 209 | ob_shape = (None,) + self.env.observation_space.shape 210 | """ 211 | from ml_logger import logger 212 | logger.upload_file(__file__) 213 | 214 | ob_shape = (None,) + ob_space.shape 215 | 216 | import gym 217 | assert type(act_space) is gym.spaces.Box 218 | act_dim, *_ = act_space.shape 219 | 220 | if G.meta_sgd == 'full': 221 | lr_rank = 2 222 | elif G.meta_sgd: 223 | lr_rank = 1 224 | else: 225 | lr_rank = 0 226 | # Meta holds policy, inner optimizer. Also creates an input.LR placeholder. 227 | self.runner = Meta(scope_name='runner', act_space=act_space, ob_shape=ob_shape, algo=G.inner_alg, 228 | lr_rank=lr_rank, optimizer=G.inner_optimizer, max_grad_norm=G.inner_max_grad_norm, 229 | max_grad_clip=G.inner_max_grad_clip, fix_variance=G.control_variance) 230 | 231 | trainables = self.runner.trainables 232 | runner_var_map = var_map(trainables, 'runner/') 233 | # note: the point of AUTO_REUSE is: 234 | # note: if reuse=True, gives error when no prior is available. Otherwise always creates new. 235 | # note: This yaw, only creates new when old is not available. 236 | self.meta_runner = Meta(scope_name="runner", act_space=act_space, ob_shape=ob_shape, algo=G.meta_alg, 237 | reuse=tf.AUTO_REUSE, loss_only=True, fix_variance=G.fix_meta_variance) 238 | meta_trainables = self.meta_runner.trainables 239 | meta_runner_var_map = var_map(meta_trainables, 'runner/') 240 | # meta_trainables = assert_match(trainables, meta_trainables) 241 | 242 | self.beta = tf.placeholder(tf.float32, [], name="beta") 243 | 244 | print(">>>>>>>>>>> Constructing Meta Graph <<<<<<<<<<<") 245 | # todo: we can do multi-GPU placement of the graph here. 246 | self.graphs = [] 247 | assert G.n_graphs == 1 or G.n_graphs == G.n_tasks, "graph number is 1 or equal to the number of tasks" 248 | 249 | if G.meta_sgd: 250 | assert isinstance(G.alpha, Number), "alpha need to be a scalar." 251 | self.alpha = [] # has to be per-layer per-block. Bias and weights require different scales. 252 | for k in range(G.n_grad_steps): 253 | with tf.variable_scope(f'learned_alpha_{k}'): 254 | self.alpha.append([ 255 | tf.get_variable(f'alpha_{stem(t.name, 2)}', shape=t.shape if G.meta_sgd == "full" else (), 256 | initializer=tf.constant_initializer(G.alpha)) 257 | for t in trainables 258 | ]) 259 | else: 260 | self.alpha = self.runner.inputs.LR 261 | for t in trange(G.n_graphs): 262 | with tf.variable_scope(f"graph_{t}"): 263 | # note: should use different learning rate for each gradient step 264 | task_graph = SingleTask(act_space=act_space, ob_shape=ob_shape, trainable_map=runner_var_map, 265 | meta_trainable_map=meta_runner_var_map, lr=self.alpha) 266 | self.graphs.append(task_graph) 267 | 268 | all_trainables = tf.trainable_variables() # might be controlled variables in the meta loop 269 | 270 | # Only do this after the meta graph has finished using policy.trainables 271 | # Note: stateful operators for saving to a cache and loading from it. Only used to reset runner 272 | # Note: Slots are not supported. Only weights. 
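# A sketch of the intended call pattern for the cache helpers defined just below (the
# actual call sites live in trainer.py and are not shown here, so treat this as an
# assumption about usage rather than a verbatim excerpt):
#     e_maml.save_weight_cache()   # snapshot the runner weights before inner adaptation
#     ... inner gradient steps / sampling with the adapted runner ...
#     e_maml.load_weight_cache()   # reset the runner to the cached weights for the next task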
273 | # fixit: all_variables might not be needed. Only that of the runner need to be cached. 274 | with tf.variable_scope("weight_cache"): 275 | self.cache = Cache(all_trainables) 276 | self.save_weight_cache = U.function([], [self.cache.save]) 277 | self.load_weight_cache = U.function([], [self.cache.load]) 278 | 279 | # Now construct the meta optimizers 280 | with tf.variable_scope('meta_optimizer'): 281 | # call gradient_sum.set_op first, then add_op. Call k times in-total. 282 | self.meta_grads = tf.gradients(tf.reduce_mean([task_graph.meta.loss for task_graph in self.graphs]), 283 | all_trainables) 284 | if G.n_graphs == 1: 285 | self.gradient_sum = GradientSum(all_trainables, self.meta_grads) 286 | grads = [c / G.n_tasks for c in self.gradient_sum.cache] 287 | else: 288 | grads = self.meta_grads 289 | 290 | if G.meta_max_grad_norm: # allow 0 to be by-pass 291 | grads = [None if g is None else 292 | g * tf.stop_gradient(G.meta_max_grad_norm / tf.maximum(G.meta_max_grad_norm, tf.norm(g))) 293 | for g in grads] 294 | 295 | # do NOT apply gradient norm here. 296 | if G.meta_optimizer == "Adam": 297 | Optim, kwargs = tf.train.AdamOptimizer, {} 298 | elif G.meta_optimizer == "AdamW": 299 | Optim, kwargs = tf.contrib.opt.AdamWOptimizer, dict(weight_decay=0.0001) 300 | elif G.meta_optimizer == "SGD": 301 | Optim, kwargs = tf.train.GradientDescentOptimizer, {} 302 | else: 303 | raise NotImplemented(f"{G.meta_optimizer} as a meta optimizer is not implemented.") 304 | 305 | # Uses a different optimizer (with slots) for each step in the meta update. 306 | self.meta_update_ops = defaultlist(None) 307 | self.meta_optimizers = defaultlist(None) 308 | for i in range(1 if G.reuse_meta_optimizer else G.meta_n_grad_steps): 309 | self.meta_optimizers[i] = Optim(learning_rate=self.beta, **kwargs) 310 | self.meta_update_ops[i] = self.meta_optimizers[i].apply_gradients(zip(grads, all_trainables)) 311 | 312 | self.meta_reporting_keys = self.graphs[0].meta.model.reports.keys() 313 | self.meta_reporting = self.graphs[0].meta.model.reports.values() if G.n_graphs == 1 else \ 314 | [tf.reduce_mean(_) for _ in zip(*[graph.meta.model.reports.values() for graph in self.graphs])] 315 | -------------------------------------------------------------------------------- /e_maml_tf/ge_policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym import spaces 4 | 5 | from baselines.a2c.utils import ortho_init 6 | from baselines.common.distributions import make_pdtype 7 | 8 | 9 | def fc(x, scope, nh, act=tf.nn.relu, init_scale=1.0): 10 | with tf.variable_scope(scope): 11 | nin = x.get_shape()[-1].value # can take batched or individual tensors. 12 | w = tf.get_variable("w", [nin, nh], initializer=ortho_init(init_scale)) 13 | b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(0.0)) 14 | z = tf.matmul(x, w) + b 15 | h = act(z) 16 | return h 17 | 18 | 19 | LOG_STD_MAX = 10 20 | LOG_STD_MIN = -10 21 | 22 | 23 | class MlpPolicy: 24 | vf = None 25 | 26 | def __repr__(self): 27 | return f"{self.__class__} {self.name}" 28 | 29 | # noinspection PyPep8Naming 30 | def __init__(self, ac_space, X, hidden_size, n_layers=2, activation="tanh", value_baseline=False, 31 | scope='MlpPolicy', reuse=False, X_placeholder=None, fix_variance=False, init_logstd=None): 32 | """ 33 | Gaussian Policy. The variance is learned as parameters. You can also pass in the logstd from the outside. 
34 | 35 | __init__: Construct the graph for the MLP policy. 36 | 37 | :param ac_space: action space, one of `gym.spaces.Box` 38 | :param X: Tensor or input placeholder for the observation 39 | :param hidden_size: size of hidden layers in network 40 | :param activation: one of 'reLU', 'tanh' 41 | :param scope: str, name of variable scope. 42 | :param reuse: 43 | :param value_baseline: bool flag whether compute a value baseline 44 | :param X_placeholder: 45 | :param fix_variance: 46 | :param init_logstd: 47 | """ 48 | assert n_layers >= 2, f"hey, what's going on with this puny {n_layers}-layer network? " \ 49 | f"--Ge (your friendly lab-mate)" 50 | if isinstance(scope, tf.VariableScope): 51 | self.scope_name = scope.name 52 | else: 53 | self.scope_name = scope 54 | self.name = (self.scope_name + "_reuse") if reuse else self.scope_name 55 | 56 | self.X_ph = X if X_placeholder is None else X_placeholder 57 | 58 | # done: this only applies to Discrete action space. Need to make more general. 59 | # now it works for both discrete action and gaussian policies. 60 | if isinstance(ac_space, spaces.Discrete): 61 | act_dim = ac_space.n 62 | else: 63 | act_dim, *_ = ac_space.shape 64 | 65 | if activation == 'tanh': 66 | act = tf.tanh 67 | elif activation == "relu": 68 | act = tf.nn.relu 69 | else: 70 | raise TypeError(f"{activation} is not available in this MLP.") 71 | with tf.variable_scope(scope, reuse=reuse): 72 | h_ = X 73 | for i in range(1, n_layers + 1): # there is no off-by-one error here --Ge. 74 | h_ = fc(h_, f'pi_fc_{i}', nh=hidden_size, init_scale=np.sqrt(2), act=act) 75 | # a_ = fc(h_, f'pi_attn_{i}', nh=h_.shape[1], init_scale=np.sqrt(2), act=tf.math.sigmoid) 76 | # h_ = fc(h_ * a_, f'pi_fc_{i}', nh=hidden_size, init_scale=np.sqrt(2), act=act) 77 | mu = fc(h_, 'pi', act_dim, act=lambda x: x, init_scale=0.01) 78 | # _ = fc(h2, 'pi', act_dim, act=tf.tanh, init_scale=0.01) 79 | # mu = ac_space.low + 0.5 * (ac_space.high - ac_space.low) * (_ + 1) 80 | 81 | self.h_ = h_ # used for learned loss 82 | 83 | # assert (not G.vf_coef) ^ (G.baseline == "critic"), "These two can not be true or false at the same time." 84 | if value_baseline: 85 | # todo: conditionally declare these only when used 86 | # h1 = fc(X, 'vf_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act) 87 | # h2 = fc(h1, 'vf_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act) 88 | self.vf = fc(self.h_, 'vf', 1, act=lambda x: x)[:, 0] 89 | 90 | if isinstance(ac_space, spaces.Box): # gaussian policy requires logstd 91 | shape = tf.shape(mu)[0] 92 | if fix_variance: 93 | _ = tf.ones(shape=[1, act_dim], name="unit_logstd") * (init_logstd or 0) 94 | logstd = tf.tile(_, [shape, 1]) 95 | elif init_logstd is not None: 96 | _ = tf.get_variable(name="logstd", shape=[1, act_dim], 97 | initializer=tf.constant_initializer(init_logstd)) 98 | # todo: clip logstd to limit the range. 99 | logstd = tf.tile(_, [shape, 1]) 100 | else: 101 | # use variance network when no initial logstd is given. 102 | # _ = fc(X, 'logstd_fc1', nh=hidden_size, init_scale=np.sqrt(2), act=act) 103 | # _ = fc(_, 'logstd_fc2', nh=hidden_size, init_scale=np.sqrt(2), act=act) 104 | 105 | # note: this doesn't work. Really need to bound the variance. 
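# One simple way to bound it (a sketch only, not used in this code path) is to clip the
# raw output into the [LOG_STD_MIN, LOG_STD_MAX] range defined at the top of this file:
#     logstd = tf.clip_by_value(
#         fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01),
#         LOG_STD_MIN, LOG_STD_MAX)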
106 | # logstd = 1 + fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01) 107 | logstd = fc(self.h_, 'logstd', act_dim, act=lambda x: x, init_scale=0.01) 108 | # logstd = fc(self.h2, 'logstd', act_dim, act=tf.tanh, init_scale=0.01) 109 | # logstd = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (logstd + 1) 110 | 111 | # GaussianPd takes 2 * [act_length] b/c of the logstd concatenation. 112 | ac = tf.concat([mu, logstd], axis=1) 113 | # A much simpler way is to multiply _logstd with a zero tensor shaped as mu. 114 | # [mu, mu * 0 + _logstd] 115 | else: 116 | raise NotImplemented('Discrete action space is not implemented!') 117 | 118 | # list of parameters is fixed at graph time. 119 | # todo: Only gets trainables that are newly created by the current policy function. 120 | # self.trainables = tf.trainable_variables() 121 | 122 | # placeholders = placeholders_from_variables(self.trainables) 123 | # self._assign_placeholder_dict = {t.name: p for t, p in zip(self.trainables, placeholders)} 124 | # self._assign_op = tf.group(*[v.assign(p) for v, p in zip(self.trainables, placeholders)]) 125 | 126 | with tf.variable_scope("Gaussian_Action"): 127 | self.pdtype = make_pdtype(ac_space) 128 | self.pd = self.pdtype.pdfromflat(ac) 129 | 130 | self.a = a = self.pd.sample() 131 | self.mu = self.pd.mode() 132 | self.neglogpac = self.pd.neglogp(a) 133 | 134 | @property 135 | def trainables(self): 136 | raise DeprecationWarning("deprecated b/c bias transform.") 137 | 138 | @property 139 | def state_dict(self): 140 | # todo: should make the tensor names scoped locally. 141 | return {t.name: v for t, v in zip(self.trainables, tf.get_default_session().run(self.trainables))} 142 | 143 | # def load_from_state_dict(self, state_dict): 144 | # # todo: this adds new assign ops each time, and causes the graph to grow. 145 | # feed_dict = {self._assign_placeholder_dict[t.name]: state_dict[t.name] for t in self.trainables} 146 | # return tf.get_default_session().run(self._assign_op, feed_dict=feed_dict) 147 | 148 | def step(self, ob, soft, feed_dict=None): 149 | if feed_dict: 150 | feed_dict.update({self.X_ph: ob}) 151 | else: 152 | feed_dict = {self.X_ph: ob} 153 | sess = tf.get_default_session() 154 | if self.vf is None: 155 | ts = [self.a if soft else self.mu, self.neglogpac] 156 | return sess.run(ts, feed_dict=feed_dict) 157 | else: 158 | ts = [self.a if soft else self.mu, self.vf, self.neglogpac] 159 | return sess.run(ts, feed_dict=feed_dict) 160 | 161 | act = step 162 | 163 | def value(self, ob, feed_dict=None): 164 | if feed_dict: 165 | feed_dict.update({self.X_ph: ob}) 166 | else: 167 | feed_dict = {self.X_ph: ob} 168 | sess = tf.get_default_session() 169 | return sess.run(self.vf, feed_dict=feed_dict) 170 | -------------------------------------------------------------------------------- /e_maml_tf/ge_utils.py: -------------------------------------------------------------------------------- 1 | import mock 2 | import tensorflow as tf 3 | from typing import Callable, Any, List, TypeVar 4 | 5 | 6 | def probe_var(*variables): 7 | return tf.get_default_session().run(variables) 8 | 9 | 10 | def as_dict(c): 11 | return {k: v for k, v in vars(c).items() if k[0] != "_"} 12 | 13 | 14 | def var_like(var, trainable=False): 15 | name, dtype, shape = var.name, var.dtype, tuple(var.get_shape().as_list()) 16 | new_name = name.split(':')[0] 17 | # note: assuming that you are using a variable scope for this declaration. 
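# note: the `trainable` argument in the signature above is not forwarded to tf.Variable,
# which defaults to trainable=True, so the copies created by Cache and GradientSum below
# also show up in tf.trainable_variables(). E_MAML collects its trainable list before
# building the cache, so this is benign there, but keep it in mind elsewhere.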
18 | new_var = tf.Variable(initial_value=tf.zeros(shape, dtype), name=new_name) 19 | # print(f"declaring variable like {name} w/ new name: {new_var.name}") 20 | return new_var 21 | 22 | 23 | def placeholders_from_variables(var, name=None): 24 | """Returns a nested collection of TensorFlow placeholders that match shapes 25 | and dtypes of the given nested collection of variables. 26 | Arguments: 27 | ---------- 28 | var: Nested collection of variables. 29 | name: Placeholder name. 30 | Returns: 31 | -------- 32 | Nested collection (same structure as `var`) of TensorFlow placeholders. 33 | """ 34 | if isinstance(var, list) or isinstance(var, tuple): 35 | result = [placeholders_from_variables(v, name) for v in var] 36 | if isinstance(var, tuple): 37 | return tuple(result) 38 | return result 39 | else: 40 | dtype, shape = var.dtype, tuple(var.get_shape().as_list()) 41 | return tf.placeholder(dtype=dtype, shape=shape, name=name) 42 | 43 | 44 | def wrap_variable_creation(func, custom_getter): 45 | """Provides a custom getter for all variable creations.""" 46 | original_get_variable = tf.get_variable 47 | 48 | def custom_get_variable(*args, **kwargs): 49 | if hasattr(kwargs, "custom_getter"): 50 | raise AttributeError("Custom getters are not supported for optimizee variables.") 51 | return original_get_variable(*args, custom_getter=custom_getter, **kwargs) 52 | 53 | # Mock the get_variable method. 54 | with mock.patch("tensorflow.get_variable", custom_get_variable): 55 | return func() 56 | 57 | 58 | def get_var_name(string): 59 | return string.split(':')[0] 60 | 61 | 62 | def var_map(variables, root_scope_name): 63 | """ 64 | only returns those that starts with the root_scope_name. 65 | 66 | :param variables: 67 | :param root_scope_name: 68 | :return: 69 | """ 70 | return {get_var_name(v.name)[len(root_scope_name):]: v for v in variables if v.name.startswith(root_scope_name)} 71 | 72 | 73 | def get_scope_name(): 74 | return tf.get_default_graph().get_name_scope() 75 | 76 | 77 | def stem(n, k=1): 78 | """ 79 | Allow using k > 1 to leave a longer segment of the bread crum 80 | 81 | Example Variable(output Tensor) Names: 82 | ``` 83 | runner/input_bias:0 84 | runner/MlpPolicy/pi_fc1/w:0 85 | runner/MlpPolicy/pi_fc1/b:0 86 | runner/MlpPolicy/pi_fc2/w:0 87 | ``` 88 | 89 | stem(tensor.name, 2) should give us 90 | 91 | ``` 92 | runner/input_bias 93 | pi_fc1/w 94 | pi_fc1/b 95 | pi_fc2/w 96 | ``` 97 | 98 | 99 | :param n: 100 | :param k: 101 | :return: 102 | """ 103 | return "/".join(n.split(":")[0].split('/')[-k:]) 104 | 105 | 106 | T = TypeVar('T') 107 | 108 | 109 | def make_with_custom_variables(func: Callable[[Any], T], variable_map, root_name_space="") -> T: 110 | """Calls func and replaces any trainable variables. 111 | This returns the output of func, but whenever `get_variable` is called it 112 | will replace any trainable variables with the tensors in `variables`, in the 113 | same order. Non-trainable variables will re-use any variables already 114 | created. 115 | Arguments: 116 | ---------- 117 | func: Function to be called. 118 | variables: A list of tensors replacing the trainable variables. 119 | Returns: 120 | -------- 121 | The return value of func is returned. 
122 | """ 123 | 124 | def custom_getter(getter, name, **kwargs): 125 | nonlocal variable_map 126 | postfix = name[len(root_name_space):] 127 | return variable_map[postfix] 128 | 129 | return wrap_variable_creation(func, custom_getter) 130 | 131 | 132 | # noinspection PyPep8Naming 133 | class defaultlist(): 134 | """allow using -1, -2 index to query from the end of the list, which is not possible with `defaultdict`. """ 135 | 136 | def __init__(self, default_factory): 137 | self.data = list() 138 | self.default_factory = default_factory if callable(default_factory) else lambda: default_factory 139 | 140 | def __setitem__(self, key, value): 141 | try: 142 | self.data[key] = value 143 | except IndexError: 144 | self.data.extend([self.default_factory()] * (key + 1 - len(self.data))) 145 | self.data[key] = value 146 | 147 | def __getitem__(self, item): 148 | return self.data[item] 149 | 150 | def __setstate__(self, state): 151 | raise NotImplementedError('need to be implemented for remote execution.') 152 | 153 | def __getstate__(self): 154 | raise NotImplementedError('need to be implemented for remote execution.') 155 | 156 | 157 | class Cache: 158 | def __init__(self, variables): 159 | """ 160 | creates a variable flip-flop in-memory. 161 | 162 | :param variables: 163 | :return: save_op, load_op, cache array 164 | """ 165 | self.cache = [var_like(v) for v in variables] 166 | self.save = tf.group(*[c.assign(tf.stop_gradient(v)) for c, v in zip(self.cache, variables)]) 167 | self.load = tf.group(*[v.assign(tf.stop_gradient(c)) for c, v in zip(self.cache, variables)]) 168 | 169 | 170 | # goal: try to add the gradients without going through python. 171 | class GradientSum: 172 | def __init__(self, variables, grad_inputs): 173 | """k is the number of gradients you want to sum. 174 | zero this gradient op once every meta iteration. """ 175 | self.cache = [var_like(v) for v in variables] 176 | # call set before calling add op, faster than zeroing out the cache. 
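# Rough accumulation pattern across tasks (a sketch; `grad_sum`, `sess` and `task_feeds`
# are assumed names from the surrounding training loop, not defined in this file):
#     sess.run(grad_sum.set_op, feed_dict=task_feeds[0])   # overwrite the cache with task 0
#     for feeds in task_feeds[1:]:
#         sess.run(grad_sum.add_op, feed_dict=feeds)       # accumulate the remaining tasks
#     # e_maml_ge.E_MAML then divides grad_sum.cache by n_tasks to form the averaged meta gradient.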
177 | self.set_op = tf.group(*[c.assign(tf.stop_gradient(g)) for c, g in zip(self.cache, grad_inputs)]) 178 | self.add_op = tf.group(*[c.assign_add(tf.stop_gradient(g)) for c, g in zip(self.cache, grad_inputs)]) 179 | 180 | 181 | def flatten(arr): 182 | """swap and then flatten axes 0 and 1""" 183 | n_steps, n_envs, *_ = arr.shape 184 | return arr.swapaxes(0, 1).reshape(n_steps * n_envs, *_) 185 | -------------------------------------------------------------------------------- /e_maml_tf/meta_rl_tasks.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from e_maml_tf import config 4 | from e_maml_tf.custom_vendor import IS_PATCHED # GRID_WORLDS 5 | from e_maml_tf.wrappers.subproc_vec_env import SubprocVecEnv 6 | 7 | assert IS_PATCHED, "need to use patch for new env and proper monitor wraps" 8 | 9 | # MAZE_KEYS = MAZES.keys() 10 | # GRID_WORLD_KEYS = GRID_WORLDS.keys() 11 | ALLOWED_ENVS = ["HalfCheetah-v2", 12 | "HalfCheetahGoalVel-v0", 13 | "HalfCheetahGoalDir-v0", 14 | "PointMassQuadrangle-v0", # used to show exploration 15 | "ReacherSingleTask-v1", 16 | "ReacherMultitaskSimple-v1", 17 | "ReacherMultitask-v1", 18 | "PointMass-v0", 19 | "PointMassMultitaskSimple-v0", 20 | "PointMassMultitask-v0", 21 | "SawyerDoorFixedMultitask-v0", 22 | "SawyerDoorMultitask-v0", 23 | "SawyerPointMultitaskSimple-v0", 24 | "SawyerPointMultitask-v0", 25 | "SawyerPickLiftMultitaskSimple-v0", 26 | "SawyerPickLiftMultitask-v0", 27 | "SawyerPickReachMultitaskSimple-v0", 28 | "SawyerPickReachMultitask-v0", 29 | "SawyerPickPlaceMultitaskSimple-v0", 30 | "SawyerPickPlaceMultitask-v0", 31 | "SawyerMixedMultitask-v0", 32 | ] # *MAZE_KEYS, *GRID_WORLD_KEYS 33 | 34 | 35 | class MetaRLTasks: 36 | def __enter__(self): 37 | return self 38 | 39 | def __exit__(self, exc_type, exc_val, exc_tb): 40 | self.envs.close() 41 | 42 | @property 43 | def k_tasks(self): 44 | return self.spec._kwargs['k_tasks'] 45 | 46 | def __init__(self, *, env_name, batch_size, start_seed, log_directory=None, max_steps=None): 47 | """ 48 | use log_directory/{seed}/ to dynamically generate movies with individual seeds. 49 | """ 50 | import gym 51 | gym.logger.set_level(40) # set logging level to avoid annoying warning. 52 | 53 | assert env_name in ALLOWED_ENVS, \ 54 | "environment {} is not supported. Need to be one of {}".format(env_name, ALLOWED_ENVS) 55 | 56 | # keep the env_name for sampling logic. Can be removed if made more general. 57 | self.env_name = env_name 58 | 59 | def make_env(env_seed, env_name, monitor_log_directory=None, wrap=None): 60 | def _f(): 61 | nonlocal max_steps 62 | env = gym.make(env_name) 63 | # Note: gym seed does not allow task_seed. Use constructor instead. 64 | # if self.env_name in GRID_WORLD_KEYS: 65 | # env.seed(seed=(seed, task_seed)) 66 | # else: 67 | env.seed(seed=env_seed) 68 | # fixit: this seems a bit counter-intuitive. Should probably remove. 69 | if max_steps: # 0, None, False are null values. 70 | # see issue #410: https://github.com/openai/gym/issues/410 the TimeLimit wrapper is now used as a 71 | # standard wrapper, and the _max_episode_steps is used inside TimeLimit wrapper for episode step-out 72 | # limit. 73 | # Note: should not override the default when reporting. 
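# An alternative that avoids touching the private attribute (a sketch, not what this code
# does) would be to re-wrap the bare env:
#     env = gym.wrappers.TimeLimit(env.unwrapped, max_episode_steps=max_steps)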
74 | env._max_episode_steps = max_steps 75 | 76 | numpy.random.seed(env_seed) 77 | # deprecation: we can remove this code 78 | if monitor_log_directory is not None: 79 | env = gym.wrappers.Monitor(env, monitor_log_directory.format(seed=env_seed), force=True) 80 | # todo: use bench Montior 81 | # from rl_algs.bench import Monitor 82 | # env = Monitor(env, monitor_log_directory.format(seed=seed), force=True) 83 | if wrap: 84 | env = wrap(env) 85 | return env 86 | 87 | return _f 88 | 89 | from e_maml_tf.wrappers.k_index import k_index 90 | self.envs = SubprocVecEnv( 91 | [make_env(env_seed=start_seed + s, env_name=env_name, monitor_log_directory=log_directory, 92 | wrap=k_index if config.G.use_k_index else None) for s in 93 | range(batch_size)]) 94 | 95 | if config.G.normalize_env: 96 | from e_maml_tf.wrappers.vec_env_normalize import vec_normalize 97 | self.envs = vec_normalize(self.envs) 98 | 99 | # This is used in the reporting logic, to respect the standard reporting for episode length etc.. 100 | self.spec = gym.envs.registry.spec(env_name) 101 | 102 | def sample(self, index=None, identical_batch=True): 103 | """has to set the goals by batch at least once. Otherwise the initial goals are different depending on the 104 | random seed.""" 105 | envs: SubprocVecEnv = self.envs 106 | if self.env_name == "HalfCheetahGoalVel-v0": 107 | new_goal = index or numpy.random.uniform(0, 2.0) 108 | # print('New Goal Velocity: ', new_goal) 109 | envs.call_sync("set_goal_velocity", new_goal if identical_batch else None) 110 | elif self.env_name == "HalfCheetahGoalDir-v0": 111 | new_direction = index or (1 if numpy.random.rand() > 0.5 else -1) 112 | envs.call_sync("set_goal_direction", new_direction if identical_batch else None) 113 | elif index is None: 114 | new_obj_index = numpy.random.randint(0, self.k_tasks) if identical_batch else None 115 | envs.call_sync("sample_task", index=new_obj_index) 116 | else: 117 | envs.call_sync("sample_task", index=index) 118 | 119 | self._task_spec = None 120 | # algorithm always resets, so no need to reset here. 121 | return envs 122 | 123 | _task_spec = None 124 | 125 | @property 126 | def task_spec(self): 127 | if self.env_name.startswith("ReacherMultitask") or \ 128 | self.env_name.startswith("PointMassMultitask") or \ 129 | self.env_name == "ReacherSingleTask-v1" or \ 130 | self.env_name == "PointMass-v0" or \ 131 | self.env_name == "SawyerPointMultitaskSimple-v0" or \ 132 | self.env_name == "SawyerPointMultitask-v0" or \ 133 | self.env_name == "PointMassQuadrangle-v0" or \ 134 | self.env_name == "SawyerPickLiftMultitaskSimple-v0" or \ 135 | self.env_name == "SawyerPickLiftMultitask-v0" or \ 136 | self.env_name == "SawyerPickReachMultitaskSimple-v0" or \ 137 | self.env_name == "SawyerPickReachMultitask-v0" or \ 138 | self.env_name == "SawyerPickPlaceMultitaskSimple-v0" or \ 139 | self.env_name == "SawyerPickPlaceMultitask-v0" or \ 140 | self.env_name == 'SawyerDoorFixedMultitask-v0' or \ 141 | self.env_name == 'SawyerDoorMultitask-v0' or \ 142 | self.env_name == 'SawyerMixedMultitask-v0': 143 | 144 | if self._task_spec: 145 | return self._task_spec 146 | # take just the index from the first env. 
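            # note: `call_sync` returns one value per sub-process; since `sample()` broadcasts the
            # same task index to every env when `identical_batch=True` (the default), the first env's
            # goal index is representative of the whole batch.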
147 | index, *_ = self.envs.call_sync("get_goal_index") 148 | self._task_spec = dict(index=index) 149 | return self._task_spec 150 | raise NotImplemented 151 | 152 | 153 | if __name__ == "__main__": 154 | class TestGlobals: 155 | # env_name = 'HalfCheetah-v1' 156 | # env_name = 'HalfCheetahGoalVel-v0' 157 | env_name = 'PointMassQuadrangle-v0' 158 | # env_name = 'MediumWorld-v0' 159 | n_envs = 10 160 | start_seed = 42 161 | log_directory = '../test_runs/demo_envs/{env_name}' 162 | 163 | 164 | # Example Usages: 165 | tasks = MetaRLTasks(env_name=TestGlobals.env_name, batch_size=TestGlobals.n_envs, start_seed=TestGlobals.start_seed, 166 | # log_directory=TestGlobals.log_directory.format(env_name=TestGlobals.env_name) + "/{seed}", 167 | max_steps=10) 168 | 169 | envs = tasks.sample() 170 | envs.reset() 171 | 172 | if TestGlobals.env_name == "HalfCheetahGoalVel-v0": 173 | goal_velocities = envs.get_goal_velocity() 174 | print('Goal velocities are:', end=" ") 175 | print(', '.join([str(g) for g in goal_velocities])) 176 | assert goal_velocities[0] == goal_velocities[2], "goal_velocities are different" 177 | print('✓', end=" ") 178 | print('They are identical!', end=" ") 179 | # elif TestGlobals.env_name in GRID_WORLDS.keys(): 180 | # # change_colors 181 | # colors = envs.change_colors() 182 | # assert colors[0]['goal'] == colors[1]['goal'], 'goal color should be identical' 183 | # print(colors) 184 | # # change_dynamics 185 | # assert TestGlobals.env_name == "MediumWorld-v0" 186 | # old_dynamics = dynamics = envs.change_dynamics() 187 | # assert dynamics[0]['r'] == dynamics[0]['r'], 'right action should be identical' 188 | # for i in range(2): 189 | # dynamics = envs.change_dynamics() 190 | # assert old_dynamics[0]['r'] != dynamics[0]['r'], 'dynamics should be different' 191 | # old_dynamics = dynamics 192 | # envs.reset_board() 193 | elif TestGlobals.env_name == "HalfCheetah-v1": 194 | envs.reset() 195 | -------------------------------------------------------------------------------- /e_maml_tf/packages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/e-maml/336291d6819a82650d5bcc5f08dd431742897416/e_maml_tf/packages/__init__.py -------------------------------------------------------------------------------- /e_maml_tf/packages/schedules.py: -------------------------------------------------------------------------------- 1 | from collections import Generator 2 | import inspect 3 | from typing import Callable 4 | 5 | 6 | class Schedule(Generator): 7 | def __init__(self, schedule_fn: Callable): 8 | assert callable(schedule_fn), 'need to pass in a real callable.' 
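        # note: the trainer treats hyper-parameters generically -- `G.alpha`, `G.beta`, `G.clip_range`
        # and `G.batch_timesteps` may each be a `Schedule`, detected with `isinstance(x, Schedule)`
        # and advanced once per epoch via `x.send(epoch_ind)`.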
9 | self.schedule_fn = schedule_fn 10 | source = inspect.getsource(self.schedule_fn) 11 | self.repr = "Schedule:\n" + source.rstrip() if len(source.split('\n')[1]) > 1 else source.rstrip() 12 | 13 | def send(self, epoch_ind): 14 | return self.schedule_fn(epoch_ind) 15 | 16 | def throw(self, *args): 17 | raise StopIteration(*args) 18 | 19 | def __repr__(self): 20 | return self.repr 21 | 22 | def __str__(self): 23 | return self.__repr__() 24 | 25 | 26 | if __name__ == "__main__": 27 | s = Schedule(lambda i: 10 if i < 10 else 50) 28 | print(s) 29 | 30 | 31 | def longer_schedule(i): 32 | if i < 10: 33 | return 5 34 | elif i < 40: 35 | return 10 36 | return 50 37 | 38 | 39 | s = Schedule(longer_schedule) 40 | print(s) 41 | 42 | assert s.send(1) == 5 43 | assert s.send(20) == 10 44 | assert s.send(50) == 50 45 | 46 | 47 | def dilated_delta(n, k): 48 | """Dilated Delta Schedule function 49 | 50 | returns a dilated delta function, starting with 0 and increasing, with double 51 | of the duty cycle after each cycle. 52 | 53 | :param n: total number of steps 54 | :param k: number of cycles 55 | :return: value between 0 and 1, in floats 56 | """ 57 | import numpy as np 58 | import math 59 | 60 | ints = 2 ** np.arange(k) 61 | ends = ints * 2 - 1 62 | si = np.concatenate([[(e - i, i)] * i for i, e in zip(ints, ends)]) 63 | schedule_ratio = n / len(si) 64 | 65 | def dilated_delta_fn(ep): 66 | i = math.floor(ep / schedule_ratio) 67 | s, i = si[i] 68 | return (ep - schedule_ratio * s) / (schedule_ratio * i) 69 | 70 | return dilated_delta_fn 71 | 72 | 73 | if __name__ == "__main__": 74 | import matplotlib.pyplot as plt 75 | 76 | delta_anneal_fn = dilated_delta(1500, 5) 77 | betas = [delta_anneal_fn(i) for i in range(1500)] 78 | 79 | plt.figure(figsize=(4, 2)) 80 | plt.title('dilated delta factory') 81 | plt.plot(betas) 82 | plt.show() 83 | 84 | 85 | class DeltaAneal(Schedule): 86 | def __init__(self, min, max, n, k): 87 | """Delta Anneal Scheduler, 88 | 89 | Starting from max, goes down linearly to min, then repeat with 90 | 91 | :param min: minimum of the parameter, to which the schedule converges to 92 | :param max: maximum of the parameter, that the schedule starts with 93 | :param n: the total number of epochs for this schedule 94 | :param k: the number of dilated cycles. 95 | :return: A dilated delta annealing schedule generator g, call g.send(ep) for the parameter value. 96 | """ 97 | delta_fn = dilated_delta(n, k) 98 | super().__init__(lambda ep: max - (max - min) * delta_fn(ep)) 99 | self.repr = f"DeltaAnneal(min={min}, max={max}, n={n}, k={k})" 100 | 101 | 102 | if __name__ == "__main__": 103 | s = DeltaAneal(0.04, 0.1, 1500, 4) 104 | import numpy as np 105 | import matplotlib.pyplot as plt 106 | 107 | plt.figure(figsize=(6, 2)) 108 | plt.title(f'{s}') 109 | plt.plot([s.send(x) for x in range(1500)]) 110 | plt.ylim(-0.1, 0.2) 111 | plt.show() 112 | 113 | 114 | class CosineAnneal(Schedule): 115 | def __init__(self, min, max, n, k): 116 | """Cosine Anneal Scheduler, 117 | 118 | Starting from max, goes down as a cosine function to min, then repeat with 119 | 120 | :param min: minimum of the parameter, to which the schedule converges to 121 | :param max: maximum of the parameter, that the schedule starts with 122 | :param n: the total number of epochs for this schedule 123 | :param k: the number of dilated cycles. 124 | :return: A dilated delta annealing schedule generator g, call g.send(ep) for the parameter value. 
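
        The schedule evaluates to `min + (max - min) * 0.5 * (1 + cos(pi * delta(ep)))`, where
        `delta(ep)` is the dilated-delta phase in [0, 1); each cycle therefore decays from `max`
        to `min` along a half cosine before the next, longer cycle restarts.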
125 | """ 126 | import numpy as np 127 | delta_fn = dilated_delta(n, k) 128 | super().__init__(lambda ep: min + (max - min) * 0.5 * (1 + np.cos(np.pi * delta_fn(ep)))) 129 | self.repr = f"CosineAnneal(min={min}, max={max}, n={n}, k={k})" 130 | 131 | 132 | if __name__ == "__main__": 133 | s = CosineAnneal(0.04, 0.1, 1500, 4) 134 | import numpy as np 135 | import matplotlib.pyplot as plt 136 | 137 | plt.figure(figsize=(6, 2)) 138 | plt.title(f'{s}') 139 | plt.plot([s.send(x) for x in range(1500)]) 140 | plt.ylim(-0.1, 0.2) 141 | plt.show() 142 | 143 | 144 | # test that the instance detection still works 145 | if __name__ == "__main__": 146 | s = CosineAnneal(0.04, 0.1, 1500, 4) 147 | assert isinstance(s, Schedule), "CosineAnneal is an instance of Schedule" 148 | print('passed the `isinstance` test!!') 149 | -------------------------------------------------------------------------------- /e_maml_tf/sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 3 | 4 | import numpy as np 5 | 6 | from e_maml_tf.ge_policies import MlpPolicy 7 | 8 | 9 | def path_gen_fn(env: SubprocVecEnv, policy: MlpPolicy, start_reset=False, soft=True): 10 | """ 11 | Generator function for the path data. This one outputs the log-likelihood, value and baseline. 12 | 13 | Usage: 14 | 15 | | s = path_gen_fn(...) 16 | | timesteps = 100 17 | | paths = s.send(timesteps) 18 | | 19 | | assert "acs" in paths 20 | | assert "obs" in paths 21 | 22 | :param env: A parallel env, with first index being the batch 23 | :param policy: has the signature `act`, returns batched actions for each observation (in the batch) 24 | :param gamma: the gamma parameter for the GAE 25 | :param lam: the lambda parameter for the GAE 26 | :param start_reset: boolean flag for resting on each generator start. 27 | :param soft: 28 | :param _render: 29 | :return: dimension is Size(timesteps, n_envs, feature_size) 30 | """ 31 | # todo: use a default dict for these data collection. Much cleaner. 32 | 33 | timesteps = yield 34 | obs, dones = env.reset(), [False] * env.num_envs 35 | paths = defaultdict(list) 36 | while True: 37 | paths.clear() 38 | # do NOT use this if environment is parallel env. 39 | if start_reset: # note: mostly useless. 40 | obs, dones = env.reset(), [False] * env.num_envs 41 | for _ in range(timesteps): 42 | paths['obs'].append(obs.copy()) 43 | if policy.vf is None: 44 | actions, neglogpacs = policy.act(obs, soft) 45 | else: 46 | actions, values, neglogpacs = policy.act(obs, soft) 47 | paths['values'].append(values) 48 | paths['acs'].append(actions.copy()) 49 | paths['neglogpacs'].append(neglogpacs) 50 | obs, rewards, dones, info = env.step(actions) 51 | 52 | paths['rewards'].append(rewards) 53 | paths['dones'].append(dones) 54 | 55 | # In multiworld, `info` contains the entire observation. Processing these 56 | # will take way too much time. So we don't do that. 57 | _suc = [_['success'] for _ in info if 'success' in _] 58 | if _suc: 59 | paths['info.successes'].append(_suc) 60 | _dist = [_['dist'] for _ in info if 'dist' in _] 61 | if _dist: 62 | paths['info.dists'].append(_dist) 63 | 64 | 65 | # The TimeLimit env wrapper "dones" the env when time limit 66 | # has been reached. This is technically not correct. 67 | # if has vf and not done. Discounted infinite horizon. 
68 | done_mask = 1 - dones 69 | if policy.vf is not None and done_mask.all(): # bootstrap from the (k + 1)th value 70 | paths['last_values'] = policy.value(obs) * done_mask 71 | 72 | timesteps = yield {k: np.array(v) for k, v in paths.items()} 73 | # now, this is missing bunch of stuff, return for example. 74 | 75 | 76 | def paths_reshape(paths, horizon): 77 | """ 78 | reshapes the trajectories in the path. Used to split paths data with multiple 79 | rollouts in a single env into k independent rollout vectors. This is needed 80 | for fitting the linear feature baseline. 81 | 82 | | n -> timesteps, k -> rollouts, c -> features. 83 | 84 | :param paths: dict('acs', 'obs', ...) 85 | :param horizon: int, the horizon we want to chop the paths dict into 86 | :return: 87 | """ 88 | _ = paths.copy() 89 | for key, d in _.items(): 90 | if not isinstance(d, np.ndarray) or len(d.shape) < 2: 91 | continue # I prefer explicitness, but this requires less maintenance 92 | n, k, *c = d.shape # *c accommodate rank-2 rewards/returns tensors 93 | _[key] = d.swapaxes(0, 1) \ 94 | .reshape(n * k // horizon, horizon, *c) \ 95 | .swapaxes(0, 1) 96 | return _ 97 | 98 | 99 | def mc(paths, gamma=None): 100 | rewards = paths['rewards'] 101 | dones = paths['dones'] # not used 102 | returns = np.zeros_like(rewards) 103 | value_so_far = paths['last_values'] if 'last_values' in paths else np.zeros_like(rewards[-1]) 104 | for step in range(len(returns) - 1, -1, -1): 105 | done_mask = 1 - dones[step] 106 | value_so_far = rewards[step] + gamma * value_so_far * done_mask 107 | returns[step] = value_so_far 108 | return returns 109 | 110 | 111 | from e_maml_tf.value_baselines.linear_feature_baseline import LinearFeatureBaseline 112 | 113 | 114 | def value_baseline(paths, m: LinearFeatureBaseline = None): 115 | m = m or LinearFeatureBaseline() 116 | m.fit(paths['obs'], paths['rewards'], paths['returns']) 117 | return m.predict(paths['obs'], paths['rewards']) 118 | 119 | 120 | def gae(paths, gamma, lam): 121 | assert 'values' in paths, 'paths data need to contain value estimates.' 122 | gl = gamma * lam 123 | rewards = paths['rewards'] 124 | dones = paths['dones'] 125 | values = paths['values'] 126 | last_values = paths['last_values'] if 'last_values' in paths else np.zeros_like(rewards[-1]) 127 | gae = np.zeros_like(rewards) 128 | last_gae = 0 129 | l = len(rewards) 130 | for step in range(l - 1, -1, -1): 131 | done_mask = 1 - dones[step] 132 | delta = rewards[step] + gamma * (last_values if step == l - 1 else values[step]) * done_mask - values[step] 133 | last_gae = delta + gl * last_gae * done_mask 134 | gae[step] = last_gae 135 | return gae 136 | 137 | 138 | linear_baseline_model = LinearFeatureBaseline() 139 | 140 | 141 | def paths_process(paths, baseline, horizon, gamma=None, use_gae=None, lam=None, **_): 142 | """ 143 | Master RL sample Processor, with GAE configurations and value baseline. 144 | 145 | :param paths: 146 | :param baseline: 147 | :param use_gae: 148 | :param gamma: 149 | :param lam: 150 | :return: 151 | """ 152 | _ = paths.copy() 153 | _['returns'] = mc(_, gamma=gamma) 154 | # fixit: this is wrong. Need to fix 155 | if horizon: 156 | _ = paths_reshape(_, horizon) # Need to reshape by rollout for the fitted linearFeatureBaseline. 157 | if baseline == 'linear': 158 | assert 'values' not in _, '_ should not contain value estimates when ' \ 159 | 'using the linear feature baseline. LFB Overwrites original estimate.' 160 | # todo: use a single baseline model instance to save on speed. 
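        # note: after the optional reshape above, `_` holds monte-carlo returns laid out per rollout
        # (horizon, n_rollouts, ...); the linear feature baseline is fit on those returns and its
        # predictions are stored as `values`, which `gae()` below consumes in place of a learned
        # critic. Passing the module-level `linear_baseline_model` appears to already address the
        # todo above.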
161 | _['values'] = value_baseline(_, m=linear_baseline_model) 162 | if use_gae: 163 | _['advs'] = gae(_, gamma=gamma, lam=lam) 164 | return _ 165 | 166 | 167 | if __name__ == "__main__": 168 | import gym 169 | from e_maml_tf.custom_vendor import IS_PATCHED 170 | 171 | make_env = lambda: gym.make('PointMass-v0') 172 | envs = SubprocVecEnv([make_env for i in range(1)]) 173 | print(envs) 174 | envs.reset() 175 | policy_stub = lambda: None 176 | policy_stub.vf = None 177 | policy_stub.value = None 178 | # policy_stub.act = lambda obs, _: [np.random.rand(1, 2), np.zeros(1)] 179 | policy_stub.act = lambda obs, _: [obs[:, -2:] - 0.5 * obs[:, 2:4], np.zeros(1)] 180 | path_gen = path_gen_fn(envs, policy_stub, start_reset=True) 181 | next(path_gen) 182 | timesteps = 1000 183 | paths = path_gen.send(timesteps) 184 | 185 | # Usage Example: Using GAE 186 | gamma, lam = 0.995, 0.99 187 | paths['returns'] = mc(paths, gamma=gamma) 188 | paths = paths_reshape(paths, 50) # Need to reshape by rollout for the fitted linearFeatureBaseline. 189 | if "values" not in paths: # use linear baseline when critic is not available. 190 | paths['values'] = value_baseline(paths) 191 | phi = gae(paths, gamma=gamma, lam=lam) 192 | 193 | # Usage Example: Not Using GAE 194 | # gamma, lam = 0.995, 0.99 195 | # paths['returns'] = mc(paths, gamma=gamma) 196 | # phi = paths['returns'] 197 | 198 | # plot the results 199 | import matplotlib.pyplot as plt 200 | 201 | plt.plot(paths['rewards'], color='green') 202 | plt.plot(paths['returns'], color='red') 203 | plt.plot(paths['values'], color='gray') 204 | plt.plot(phi, color='blue') 205 | plt.show() 206 | 207 | exit() 208 | 209 | 210 | def _deprecated_ppo2_gae(): 211 | # from OpenAI.baselines.ppo2 212 | # compute returns from path. 213 | # 0. compute rewards 214 | # 1. compute adv (GAE) 215 | # 2. 
compute regular adv (no GAE) 216 | """ 217 | rewards = r + \gamma * V(s_{t + 1}) 218 | """ 219 | advs = np.zeros_like(paths['rewards']) 220 | 221 | # discount/bootstrap off value fn 222 | _advs = np.zeros_like(paths['rewards']) 223 | last_gae_lam = 0 224 | n_rollouts = len(_obs) 225 | for t in reversed(range(n_rollouts)): 226 | if t == n_rollouts - 1: 227 | next_non_terminal = 1.0 - dones 228 | next_values = last_values 229 | else: 230 | next_non_terminal = 1.0 - _dones[t + 1] 231 | next_values = _values[t + 1] 232 | delta = _rewards[t] + gamma * next_values * next_non_terminal - _values[t] 233 | _advs[t] = last_gae_lam = delta + gamma * lam * next_non_terminal * last_gae_lam 234 | _returns = _advs + _values 235 | 236 | # return dimension is Size(timesteps, n_envs, feature_size) 237 | timesteps = yield dict(obs=_obs, acs=_actions, rewards=_rewards, dones=_dones, 238 | returns=_returns, 239 | values=_values, neglogpacs=_neglogpacs, ep_info=ep_info) 240 | 241 | 242 | def _deprecated_gae_old(paths, gamma, lam): 243 | """ 244 | Compute advantage with GAE(lambda) 245 | """ 246 | # last element is only used for last vtarg, but we already zeroed it if last new = 1 247 | new = np.append(paths["new"], 0) 248 | vpred = np.append(paths["vpred"], paths["nextvpred"]) 249 | T = len(paths["rew"]) 250 | paths["adv"] = gaelam = np.empty(T, 'float32') 251 | rew = paths["rew"] 252 | lastgaelam = 0 253 | for t in reversed(range(T)): 254 | nonterminal = 1 - new[t + 1] 255 | delta = rew[t] + gamma * vpred[t + 1] * nonterminal - vpred[t] 256 | gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam 257 | 258 | paths["tdlamret"] = paths["adv"] + paths["vpred"] 259 | -------------------------------------------------------------------------------- /e_maml_tf/sampling_utils.py: -------------------------------------------------------------------------------- 1 | from numbers import Number 2 | 3 | 4 | def is_scalar(n): 5 | return (hasattr(n, 'shape') and len(n.shape) < 1) or isinstance(n, Number) 6 | 7 | 8 | def batchify(paths, batch_size, n, shuffle): 9 | """ 10 | 11 | :param paths: 12 | :param batch_size: 13 | :param n: length of the 14 | :param shuffle: boolean flag to shuffle the batch. 15 | :return: 16 | """ 17 | import numpy as np 18 | shuffled_inds = np.random.randn(n).argsort() 19 | for i in range(n // batch_size): 20 | start = i * batch_size 21 | end = start + batch_size 22 | yield { 23 | k: v if is_scalar(v) else v[shuffled_inds[start:end] if shuffle else range(start, end)] 24 | for k, v in paths.items() 25 | } 26 | -------------------------------------------------------------------------------- /e_maml_tf/train.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from ml_logger import logger 3 | 4 | from e_maml_tf.meta_rl_tasks import MetaRLTasks 5 | from e_maml_tf import config 6 | from e_maml_tf.e_maml_ge import E_MAML 7 | from e_maml_tf.trainer import train_maml 8 | 9 | 10 | def run_e_maml(_G=None, _DEBUG=None): 11 | import baselines.common.tf_util as U 12 | if _G is not None: 13 | config.G.update(_G) 14 | if _DEBUG is not None: 15 | config.DEBUG.update(_DEBUG) 16 | 17 | # todo: let's take the control of the log director away from the train script. It should all be set from outside. 18 | # done: now this is set in the runner thunk. 
19 | # logger.configure(log_directory=config.RUN.log_dir, prefix=config.RUN.log_prefix) 20 | logger.log_params( 21 | G=vars(config.G), 22 | Reporting=vars(config.Reporting), 23 | DEBUG=vars(config.DEBUG) 24 | ) 25 | logger.upload_file(__file__) 26 | 27 | tasks = MetaRLTasks(env_name=config.G.env_name, batch_size=config.G.n_parallel_envs, 28 | start_seed=config.G.start_seed, 29 | log_directory=(config.RUN.log_directory + "/{seed}") if config.G.render else None, 30 | max_steps=config.G.env_max_timesteps) 31 | 32 | # sess_config = tf.ConfigProto(log_device_placement=config.Reporting.log_device_placement) 33 | # with tf.Session(config=sess_config), tf.device('/gpu:0'), tasks: 34 | graph = tf.Graph() 35 | with graph.as_default(), U.make_session(num_cpu=config.G.n_cpu), tasks: 36 | maml = E_MAML(ob_space=tasks.envs.observation_space, act_space=tasks.envs.action_space) 37 | 38 | U.initialize() 39 | 40 | import gym 41 | from rl.helpers import unbatch_policy, render_gen_fn 42 | 43 | eval_env = gym.make(config.G.env_name) 44 | 45 | if config.G.use_k_index: 46 | from e_maml_tf.wrappers.k_index import k_index 47 | eval_env = k_index(eval_env) 48 | 49 | _policy = unbatch_policy(maml.runner.policy) 50 | 51 | # todo: use batch-mode to accelerate rendering. 52 | rend_gen = render_gen_fn(_policy, eval_env, stochastic=False, width=640, height=480, reset_on_done=True) 53 | 54 | _ep_ind, _hook_cache = None, {} 55 | train_iter = train_maml(n_tasks=config.G.n_tasks, tasks=tasks, maml=maml) 56 | while True: 57 | try: 58 | status, epoch, task_spec, *_ = next(train_iter) 59 | 60 | t_id = task_spec['index'] 61 | if epoch != _ep_ind: 62 | _hook_cache.clear() 63 | _ep_ind = epoch 64 | 65 | if status.startswith('grad-') and status.endswith('movie'): 66 | k, = _ 67 | hook = f"{config.G.env_name}_{epoch:04d}_k({k})_t({t_id})" 68 | if hook in _hook_cache: 69 | continue 70 | _hook_cache[hook] = True 71 | 72 | eval_env.sample_task(**task_spec) 73 | movie = [next(rend_gen) for _ in range(config.G.movie_timesteps)] 74 | logger.log_video(movie, "videos/" + hook + ".mp4", fps=30) 75 | del movie 76 | # samples = [next(sample_gen) for _ in range(config.G.movie_timesteps)] 77 | # logger.log_data(samples, hook + ".pkl") 78 | 79 | hook = f"{config.G.env_name}_{epoch:04d}_t({t_id})" 80 | if status == 'post-update-movie' and hook not in _hook_cache: 81 | _hook_cache[hook] = True 82 | eval_env.sample_task(**task_spec) 83 | movie = [next(rend_gen) for _ in range(config.G.movie_timesteps)] 84 | logger.log_video(movie, "videos/" + hook + ".mp4", fps=30) 85 | del movie 86 | # samples = [next(sample_gen) for _ in range(config.G.movie_timesteps)] 87 | # logger.log_data(samples, hook + ".pkl") 88 | 89 | except StopIteration: 90 | break 91 | logger.flush() 92 | 93 | tf.reset_default_graph() 94 | 95 | 96 | def launch(**_G): 97 | import traceback 98 | import os 99 | os.environ['CUDA_VISIBLE_DEVICES'] = str(3) 100 | 101 | try: 102 | config.config_run(**_G) 103 | run_e_maml(_G) 104 | except Exception as e: 105 | tb = traceback.format_exc() 106 | logger.log_line(tb) 107 | raise e 108 | 109 | 110 | if __name__ == '__main__': 111 | config.RUN.log_prefix = "alpha-0-check" 112 | launch() 113 | -------------------------------------------------------------------------------- /e_maml_tf/trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from collections import defaultdict 3 | from ml_logger import logger, metrify 4 | from e_maml_tf.meta_rl_tasks import MetaRLTasks 5 | from 
termcolor import colored 6 | 7 | from e_maml_tf.ge_utils import stem 8 | from e_maml_tf.packages.schedules import Schedule 9 | from e_maml_tf.sampler import path_gen_fn, paths_process 10 | from .e_maml_ge import E_MAML 11 | from .config import G, DEBUG, Reporting 12 | import numpy as np 13 | from e_maml_tf.algos import vpg, ppo2, cpi, bc, bc_learned_loss 14 | 15 | 16 | def train_supervised_maml(*, k_tasks=1, maml: E_MAML): 17 | # env used for evaluation purposes only. 18 | if G.meta_sgd: 19 | assert maml.alpha is not None, "Coding Mistake if meta_sgd is trueful but maml.alpha is None." 20 | 21 | assert G.n_tasks >= k_tasks, f"Is this intended? You probably want to have " \ 22 | f"meta-batch({G.n_tasks}) >= k_tasks({k_tasks})." 23 | 24 | sess = tf.get_default_session() 25 | 26 | epoch_ind, pref = -1, "" 27 | while epoch_ind < G.n_epochs: 28 | # for epoch_ind in range(G.n_epochs + 1): 29 | logger.flush() 30 | logger.split() 31 | 32 | is_bc_test = (pref != "test/" and G.eval_interval and epoch_ind % G.eval_interval == 0) 33 | pref = "test/" if is_bc_test else "" 34 | epoch_ind += 0 if is_bc_test else 1 35 | 36 | if G.meta_sgd: 37 | alpha_lr = sess.run(maml.alpha) # only used in the runner. 38 | logger.log(metrics={f"alpha_{i}/{stem(t.name, 2)}": a 39 | for i, a_ in enumerate(alpha_lr) 40 | for t, a in zip(maml.runner.trainables, a_)}, silent=True) 41 | else: 42 | alpha_lr = G.alpha.send(epoch_ind) if isinstance(G.alpha, Schedule) else np.array(G.alpha) 43 | logger.log(alpha=metrify(alpha_lr), epoch=epoch_ind, silent=True) 44 | 45 | beta_lr = G.beta.send(epoch_ind) if isinstance(G.beta, Schedule) else np.array(G.beta) 46 | logger.log(beta=metrify(beta_lr), epoch=epoch_ind, silent=True) 47 | 48 | if G.checkpoint_interval and epoch_ind % G.checkpoint_interval == 0: 49 | yield "pre-update-checkpoint", epoch_ind 50 | 51 | # Compute updates for each task in the batch 52 | # 0. save value of variables 53 | # 1. sample 54 | # 2. gradient descent 55 | # 3. repeat step 1., 2. until all gradient steps are exhausted. 56 | batch_data = defaultdict(list) 57 | 58 | maml.save_weight_cache() 59 | load_ops = [] if DEBUG.no_weight_reset else [maml.cache.load] 60 | 61 | feed_dict = {} 62 | for task_ind in range(k_tasks if is_bc_test else G.n_tasks): 63 | graph_branch = maml.graphs[0] if G.n_graphs == 1 else maml.graphs[task_ind] 64 | if G.n_graphs == 1: 65 | gradient_sum_op = maml.gradient_sum.set_op if task_ind == 0 else maml.gradient_sum.add_op 66 | 67 | """ 68 | In BC mode, we don't have an environment. The sampling is handled here then fed to the sampler. 69 | > task_spec = dict(index=0) 70 | 71 | Here we make the testing more efficient. 72 | """ 73 | if not DEBUG.no_task_resample: 74 | if not is_bc_test: 75 | task_spec = dict(index=np.random.randint(0, k_tasks)) 76 | elif task_ind < k_tasks: 77 | task_spec = dict(index=task_ind % k_tasks) 78 | else: 79 | raise RuntimeError('should never hit here.') 80 | 81 | for k in range(G.n_grad_steps + 1): # 0 - 10 <== last one being the maml policy. 82 | 83 | # for imitation inner loss, we still sample trajectory for evaluation purposes, but 84 | # replace it with the demonstration data for learning 85 | if k < G.n_grad_steps: 86 | p = p if G.single_sampling and k > 0 else \ 87 | bc.sample_demonstration_data(task_spec, key=("eval" if is_bc_test else None)) 88 | elif k == G.n_grad_steps: 89 | # note: use meta bc samples. 90 | p = bc.sample_demonstration_data(task_spec, key="meta") 91 | else: 92 | raise Exception('Implementation error. 
Should never reach this line.') 93 | 94 | _p = {k: v for k, v in p.items() if k != "ep_info"} 95 | 96 | if k < G.n_grad_steps: 97 | # note: under meta-SGD mode, the runner needs the k^th learning rate. 98 | _lr = alpha_lr[k] if G.meta_sgd else alpha_lr 99 | 100 | runner_feed_dict = \ 101 | path_to_feed_dict(inputs=maml.runner.inputs, paths=_p, lr=_lr) 102 | # todo: optimize `maml.meta_runner` if k >= G.n_grad_steps. 103 | loss, *_, __ = maml.runner.optim.run_optimize(feed_dict=runner_feed_dict) 104 | runner_feed_dict.clear() 105 | 106 | for key, value in zip(maml.runner.model.reports.keys(), [loss, *_]): 107 | batch_data[pref + f"grad_{k}_step_{key}"].append(value) 108 | logger.log_key_value(pref + f"task_{task_ind}_grad_{k}_{key}", value, silent=True) 109 | 110 | if loss > G.term_loss_threshold: # todo: make this batch-based instead of on single episode 111 | err = pref + "episode loss blew up:", loss, "terminating training." 112 | logger.log_line(colored(err, "red"), flush=True) 113 | raise RuntimeError('loss is TOO HIGH. Terminating the experiment.') 114 | 115 | # fixit: has bug when using fixed learning rate. Still needs to get learning rate from placeholder 116 | feed_dict.update(path_to_feed_dict(inputs=graph_branch.workers[k].inputs, paths=_p)) 117 | elif k == G.n_grad_steps: 118 | yield_keys = dict( 119 | movie=G.record_movie_interval and epoch_ind >= G.start_movie_after_epoch and 120 | epoch_ind % G.record_movie_interval == 0, 121 | eval=is_bc_test 122 | ) 123 | if np.fromiter(yield_keys.values(), bool).any(): 124 | yield yield_keys, epoch_ind, task_spec 125 | if is_bc_test: 126 | if load_ops: 127 | tf.get_default_session().run(load_ops) 128 | continue # do NOT meta learn from test samples. 129 | 130 | # we don't treat the meta_input the same way even though we could. This is more clear to read. 131 | # note: feed in the learning rate only later. 132 | feed_dict.update(path_to_feed_dict(inputs=graph_branch.meta.inputs, paths=_p)) 133 | 134 | if G.n_graphs == 1: 135 | # load from checkpoint before computing the meta gradient\nrun gradient sum operation 136 | if load_ops: 137 | tf.get_default_session().run(load_ops) 138 | # note: meta reporting should be run here. Not supported for simplicity. (need to reduce across 139 | # note: tasks, and can not be done outside individual task graphs. 140 | if G.meta_sgd is None: 141 | feed_dict[maml.alpha] = alpha_lr 142 | tf.get_default_session().run(gradient_sum_op, feed_dict) 143 | feed_dict.clear() 144 | 145 | if load_ops: 146 | tf.get_default_session().run(load_ops) 147 | 148 | if is_bc_test: 149 | continue # do NOT meta learn from test samples. 150 | 151 | if G.meta_sgd is None: 152 | feed_dict[maml.alpha] = alpha_lr 153 | 154 | if G.n_graphs == 1: 155 | assert G.meta_n_grad_steps == 1, "ERROR: Can only run 1 meta gradient step with a single graph." 156 | # note: remove meta reporting b/c meta report should be in each task in this case. 157 | tf.get_default_session().run(maml.meta_update_ops[0], {maml.beta: beta_lr}) 158 | else: 159 | assert feed_dict, "ERROR: It is likely that you jumped here from L:178." 
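                # note: in the multi-graph case every task kept its own worker/meta placeholders, so a
                # single large feed_dict drives the meta update directly; `G.reuse_meta_optimizer`
                # controls whether the same update op is re-applied for each of the
                # `G.meta_n_grad_steps` steps below.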
160 | feed_dict[maml.beta] = beta_lr 161 | for i in range(G.meta_n_grad_steps): 162 | update_op = maml.meta_update_ops[0 if G.reuse_meta_optimizer else i] 163 | *reports, _ = tf.get_default_session().run(maml.meta_reporting + [update_op], feed_dict) 164 | if i not in (0, G.meta_n_grad_steps - 1): 165 | continue 166 | for key, v in zip(maml.meta_reporting_keys, reports): 167 | logger.log_key_value(pref + f"grad_{G.n_grad_steps + i}_step_{key}", v, silent=True) 168 | 169 | feed_dict.clear() 170 | 171 | tf.get_default_session().run(maml.cache.save) 172 | 173 | # Now compute the meta gradients. 174 | # note: runner shares variables with the MAML graph. Reload from state_dict 175 | # note: if max_grad_step is the same as n_grad_steps then no need here. 176 | 177 | dt = logger.split() 178 | logger.log_line('Timer Starts...' if dt is None else f'{dt:0.2f} sec/epoch') 179 | logger.log(dt_epoch=dt or np.nan, epoch=epoch_ind) 180 | 181 | for key, arr in batch_data.items(): 182 | reduced = np.array(arr).mean() 183 | logger.log_key_value(key, reduced) 184 | 185 | 186 | def train_maml(*, n_tasks: int, tasks: MetaRLTasks, maml: E_MAML): 187 | if not G.inner_alg.startswith("BC"): 188 | path_gen = path_gen_fn(env=tasks.envs, policy=maml.runner.policy, start_reset=G.reset_on_start) 189 | next(path_gen) 190 | 191 | meta_path_gen = path_gen_fn(env=tasks.envs, policy=maml.meta_runner.policy, start_reset=G.reset_on_start) 192 | next(meta_path_gen) 193 | 194 | if G.load_from_checkpoint: 195 | # todo: add variable to checkpoint 196 | # todo: set the epoch_ind starting point here. 197 | logger.load_variables(G.load_from_checkpoint) 198 | 199 | if G.meta_sgd: 200 | assert maml.alpha is not None, "Coding Mistake if meta_sgd is trueful but maml.alpha is None." 201 | 202 | max_episode_length = tasks.spec.max_episode_steps 203 | 204 | sess = tf.get_default_session() 205 | epoch_ind, prefix = G.epoch_init - 1, "" 206 | while epoch_ind < G.epoch_init + G.n_epochs: 207 | logger.flush() 208 | logger.split() 209 | 210 | is_bc_test = (prefix != "test/" and G.eval_interval and epoch_ind % G.eval_interval == 0) 211 | prefix = "test/" if is_bc_test else "" 212 | epoch_ind += 0 if is_bc_test else 1 213 | 214 | if G.meta_sgd: 215 | alpha_lr = sess.run(maml.alpha) # only used in the runner. 216 | logger.log(metrics={f"alpha_{i}/{stem(t.name, 2)}": a 217 | for i, a_ in enumerate(alpha_lr) 218 | for t, a in zip(maml.runner.trainables, a_)}, silent=True) 219 | else: 220 | alpha_lr = G.alpha.send(epoch_ind) if isinstance(G.alpha, Schedule) else np.array(G.alpha) 221 | logger.log(alpha=metrify(alpha_lr), epoch=epoch_ind, silent=True) 222 | 223 | beta_lr = G.beta.send(epoch_ind) if isinstance(G.beta, Schedule) else np.array(G.beta) 224 | clip_range = G.clip_range.send(epoch_ind) if isinstance(G.clip_range, Schedule) else np.array(G.clip_range) 225 | logger.log(beta=metrify(beta_lr), clip_range=metrify(clip_range), epoch=epoch_ind, silent=True) 226 | 227 | batch_timesteps = G.batch_timesteps.send(epoch_ind) \ 228 | if isinstance(G.batch_timesteps, Schedule) else G.batch_timesteps 229 | 230 | # Compute updates for each task in the batch 231 | # 0. save value of variables 232 | # 1. sample 233 | # 2. gradient descent 234 | # 3. repeat step 1., 2. until all gradient steps are exhausted. 
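        # note: `save_weight_cache()` below snapshots the current meta-parameters (via the `Cache`
        # flip-flop in ge_utils), and `load_ops` restores them after each task, so every task in the
        # meta-batch adapts from the same starting weights.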
235 | batch_data = defaultdict(list) 236 | 237 | maml.save_weight_cache() 238 | load_ops = [] if DEBUG.no_weight_reset else [maml.cache.load] 239 | 240 | if G.checkpoint_interval and epoch_ind % G.checkpoint_interval == 0 \ 241 | and not is_bc_test and epoch_ind >= G.start_checkpoint_after_epoch: 242 | cp_path = f"checkpoints/variables_{epoch_ind:04d}.pkl" 243 | logger.log_line(f'saving checkpoint {cp_path}') 244 | # note: of course I don't know that are all of the trainables at the moment. 245 | logger.save_variables(tf.trainable_variables(), path=cp_path) 246 | 247 | feed_dict = {} 248 | for task_ind in range(n_tasks if is_bc_test else G.n_tasks): 249 | graph_branch = maml.graphs[0] if G.n_graphs == 1 else maml.graphs[task_ind] 250 | if G.n_graphs == 1: 251 | gradient_sum_op = maml.gradient_sum.set_op if task_ind == 0 else maml.gradient_sum.add_op 252 | 253 | print(f"task_ind {task_ind}...") 254 | if not DEBUG.no_task_resample: 255 | if not is_bc_test: 256 | print(f'L250: sampling task') 257 | tasks.sample() 258 | elif task_ind < n_tasks: 259 | task_spec = dict(index=task_ind % n_tasks) 260 | print(f'L254: sampling task {task_spec}') 261 | tasks.sample(**task_spec) 262 | else: 263 | raise RuntimeError('should never hit here.') 264 | 265 | for k in range(G.n_grad_steps + 1): # 0 - 10 <== last one being the maml policy. 266 | _is_new = False 267 | # for imitation inner loss, we still sample trajectory for evaluation purposes, but 268 | # replace it with the demonstration data for learning 269 | if k < G.n_grad_steps: 270 | if G.inner_alg.startswith("BC"): 271 | p = p if G.single_sampling and k > 0 else \ 272 | bc.sample_demonstration_data(tasks.task_spec, key=("eval" if is_bc_test else None)) 273 | else: 274 | p, _is_new = path_gen.send(batch_timesteps), True 275 | elif k == G.n_grad_steps: 276 | if G.meta_alg.startswith("BC"): 277 | # note: use meta bc samples. 278 | p = bc.sample_demonstration_data(tasks.task_spec, key="meta") 279 | else: 280 | p, _is_new = meta_path_gen.send(batch_timesteps), True 281 | else: 282 | raise Exception('Implementation error. Should never reach this line.') 283 | 284 | if k in G.eval_grad_steps: 285 | _ = path_gen if k < G.n_grad_steps else meta_path_gen 286 | p_eval = p if _is_new else _.send(G.eval_timesteps) 287 | # reporting on new trajectory samples 288 | avg_r = p_eval['ep_info']['reward'] if G.normalize_env else np.mean(p_eval['rewards']) 289 | episode_r = avg_r * max_episode_length # default horizon for HalfCheetah 290 | 291 | if episode_r < G.term_reward_threshold: # todo: make this batch-based instead of on single episode 292 | logger.log_line("episode reward is too low: ", episode_r, "terminating training.", flush=True) 293 | raise RuntimeError('AVERAGE REWARD TOO LOW. Terminating the experiment.') 294 | 295 | batch_data[prefix + f"grad_{k}_step_reward"].append(avg_r if Reporting.report_mean else episode_r) 296 | if k in G.eval_grad_steps: 297 | logger.log_key_value(prefix + f"task_{task_ind}_grad_{k}_reward", episode_r, silent=True) 298 | 299 | _p = {k: v for k, v in p.items() if k != "ep_info"} 300 | 301 | if k < G.n_grad_steps: 302 | # note: under meta-SGD mode, the runner needs the k^th learning rate. 303 | _lr = alpha_lr[k] if G.meta_sgd else alpha_lr 304 | 305 | # clip_range is not used in BC mode. but still passed in. 
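                    # note: `path_to_feed_dict` (bottom of this file) dispatches on the type of
                    # `inputs` (vpg / ppo2 / cpi / bc); for the RL losses it first runs
                    # `paths_process` to attach returns, baseline values and GAE advantages before
                    # building the placeholder feed.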
306 | runner_feed_dict = \ 307 | path_to_feed_dict(inputs=maml.runner.inputs, paths=_p, lr=_lr, 308 | baseline=G.baseline, gamma=G.gamma, use_gae=G.use_gae, lam=G.lam, 309 | horizon=max_episode_length, clip_range=clip_range) 310 | # todo: optimize `maml.meta_runner` if k >= G.n_grad_steps. 311 | loss, *_, __ = maml.runner.optim.run_optimize(feed_dict=runner_feed_dict) 312 | runner_feed_dict.clear() 313 | 314 | for key, value in zip(maml.runner.model.reports.keys(), [loss, *_]): 315 | batch_data[prefix + f"grad_{k}_step_{key}"].append(value) 316 | logger.log_key_value(prefix + f"task_{task_ind}_grad_{k}_{key}", value, silent=True) 317 | 318 | if loss > G.term_loss_threshold: # todo: make this batch-based instead of on single episode 319 | logger.log_line(prefix + "episode loss blew up:", loss, "terminating training.", flush=True) 320 | raise RuntimeError('loss is TOO HIGH. Terminating the experiment.') 321 | 322 | # done: has bug when using fixed learning rate. Needs the learning rate as input. 323 | feed_dict.update( # do NOT pass in the learning rate because the graph already includes those. 324 | path_to_feed_dict(inputs=graph_branch.workers[k].inputs, paths=_p, 325 | lr=None if G.meta_sgd else alpha_lr, # but do with fixed alpha 326 | horizon=max_episode_length, 327 | baseline=G.baseline, gamma=G.gamma, use_gae=G.use_gae, lam=G.lam, 328 | clip_range=clip_range)) 329 | 330 | elif k == G.n_grad_steps: 331 | yield_keys = dict( 332 | movie=epoch_ind >= G.start_movie_after_epoch and epoch_ind % G.record_movie_interval == 0, 333 | eval=is_bc_test 334 | ) 335 | if np.fromiter(yield_keys.values(), bool).any(): 336 | yield yield_keys, epoch_ind, tasks.task_spec 337 | if is_bc_test: 338 | if load_ops: # we need to reset the weights. Otherwise the world would be on fire. 339 | tf.get_default_session().run(load_ops) 340 | continue # do NOT meta learn from test samples. 341 | 342 | # we don't treat the meta_input the same way even though we could. This is more clear to read. 343 | # note: feed in the learning rate only later. 344 | feed_dict.update( # do NOT need learning rate 345 | path_to_feed_dict(inputs=graph_branch.meta.inputs, paths=_p, 346 | horizon=max_episode_length, 347 | baseline=G.baseline, gamma=G.gamma, use_gae=G.use_gae, lam=G.lam, 348 | clip_range=clip_range)) 349 | 350 | if G.n_graphs == 1: 351 | # load from checkpoint before computing the meta gradient\nrun gradient sum operation 352 | if load_ops: 353 | tf.get_default_session().run(load_ops) 354 | # note: meta reporting should be run here. Not supported for simplicity. (need to reduce across 355 | # note: tasks, and can not be done outside individual task graphs. 356 | if G.meta_sgd is None: # note: copied from train_supervised_maml, not tested 357 | feed_dict[maml.alpha] = alpha_lr 358 | tf.get_default_session().run(gradient_sum_op, feed_dict) 359 | feed_dict.clear() 360 | 361 | if load_ops: 362 | tf.get_default_session().run(load_ops) 363 | 364 | if is_bc_test: 365 | continue # do NOT meta learn from test samples. 366 | 367 | # note: copied from train_supervised_maml, not tested 368 | if G.meta_sgd is None: 369 | feed_dict[maml.alpha] = alpha_lr 370 | 371 | if G.n_graphs == 1: 372 | assert G.meta_n_grad_steps == 1, "ERROR: Can only run 1 meta gradient step with a single graph." 373 | # note: remove meta reporting b/c meta report should be in each task in this case. 
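            # note: in single-graph mode the per-task gradients were already accumulated into
            # `maml.gradient_sum` above (set_op on the first task, add_op afterwards), so this meta
            # step only needs the meta learning rate `beta`.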
374 | tf.get_default_session().run(maml.meta_update_ops[0], {maml.beta: beta_lr}) 375 | else: 376 | assert feed_dict, "ERROR: It is likely that you jumped here from L:178." 377 | feed_dict[maml.beta] = beta_lr 378 | for i in range(G.meta_n_grad_steps): 379 | update_op = maml.meta_update_ops[0 if G.reuse_meta_optimizer else i] 380 | *reports, _ = tf.get_default_session().run(maml.meta_reporting + [update_op], feed_dict) 381 | if i not in (0, G.meta_n_grad_steps - 1): 382 | continue 383 | for key, v in zip(maml.meta_reporting_keys, reports): 384 | logger.log_key_value(prefix + f"grad_{G.n_grad_steps + i}_step_{key}", v, silent=True) 385 | 386 | feed_dict.clear() 387 | 388 | tf.get_default_session().run(maml.cache.save) 389 | 390 | # Now compute the meta gradients. 391 | # note: runner shares variables with the MAML graph. Reload from state_dict 392 | # note: if max_grad_step is the same as n_grad_steps then no need here. 393 | 394 | dt = logger.split() 395 | logger.log_line('Timer Starts...' if dt is None else f'{dt:0.2f} sec/epoch') 396 | logger.log(dt_epoch=dt or np.nan, epoch=epoch_ind) 397 | 398 | for key, arr in batch_data.items(): 399 | reduced = np.array(arr).mean() 400 | logger.log_key_value(key, reduced) 401 | 402 | logger.flush() 403 | 404 | 405 | def path_to_feed_dict(*, inputs, paths, lr=None, **rest): 406 | from e_maml_tf.sampler import paths_process 407 | if isinstance(inputs, vpg.Inputs): 408 | paths = paths_process(paths, **rest) 409 | return vpg.path_to_feed_dict(inputs=inputs, paths=paths, lr=lr) # kl limit etc 410 | elif isinstance(inputs, ppo2.Inputs): 411 | paths = paths_process(paths, **rest) 412 | return ppo2.path_to_feed_dict(inputs=inputs, paths=paths, lr=lr, **rest) # kl limit etc 413 | elif isinstance(inputs, cpi.Inputs): 414 | paths = paths_process(paths, **rest) 415 | return cpi.path_to_feed_dict(inputs=inputs, paths=paths, lr=lr, **rest) # kl limit etc 416 | elif isinstance(inputs, bc.Inputs): 417 | return bc.path_to_feed_dict(inputs=inputs, paths=paths, lr=lr, **rest) # kl limit etc 418 | elif isinstance(inputs, bc_learned_loss.Inputs): 419 | return bc_learned_loss.path_to_feed_dict(inputs=inputs, paths=paths, lr=lr, **rest) # kl limit etc 420 | else: 421 | raise NotImplementedError("Input type is not recognised") 422 | 423 | 424 | # debug only 425 | def eval_tensors(*, variable, feed_dict): 426 | return tf.get_default_session().run(variable, feed_dict) 427 | -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/e-maml/336291d6819a82650d5bcc5f08dd431742897416/e_maml_tf/value_baselines/__init__.py -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/base.py: -------------------------------------------------------------------------------- 1 | class Baseline(object): 2 | @property 3 | def algorithm_parallelized(self): 4 | return False 5 | 6 | def get_param_values(self): 7 | raise NotImplementedError 8 | 9 | def set_param_values(self, val): 10 | raise NotImplementedError 11 | 12 | def fit(self, obs, rewards, returns): 13 | raise NotImplementedError 14 | 15 | def predict(self, obs, rewards): 16 | raise NotImplementedError 17 | 18 | @classmethod 19 | def add_args(cls, parser): 20 | pass 21 | 22 | @classmethod 23 | def new_from_args(cls, args, mdp): 24 | pass 25 | 26 | def log_diagnostics(self, paths): 27 | """ 28 | Log extra 
information per iteration based on the collected paths 29 | """ 30 | pass 31 | -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/gaussian_conv_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import Baseline 4 | from rllab.regressors.gaussian_conv_regressor import GaussianConvRegressor 5 | 6 | 7 | class GaussianConvBaseline(Baseline): 8 | def __init__(self, env_spec, regressor_args=None, ): 9 | super().__init__(env_spec) 10 | if regressor_args is None: 11 | regressor_args = dict() 12 | 13 | self._regressor = GaussianConvRegressor( 14 | input_shape=env_spec.observation_space.shape, 15 | output_dim=1, 16 | name="vf", 17 | **regressor_args 18 | ) 19 | 20 | def fit(self, paths): 21 | observations = np.concatenate([p["obs"] for p in paths]) 22 | returns = np.concatenate([p["returns"] for p in paths]) 23 | self._regressor.fit(observations, returns.reshape((-1, 1))) 24 | 25 | def fit_by_samples_data(self, samples_data): 26 | observations = samples_data["obs"] 27 | returns = samples_data["returns"] 28 | self._regressor.fit(observations, returns.reshape((-1, 1))) 29 | 30 | def predict(self, path): 31 | return self._regressor.predict(path["obs"]).flatten() 32 | 33 | def get_param_values(self, **tags): 34 | return self._regressor.get_param_values(**tags) 35 | 36 | def set_param_values(self, flattened_params, **tags): 37 | self._regressor.set_param_values(flattened_params, **tags) 38 | -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/gaussian_mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import Baseline 4 | from rllab.regressors.gaussian_mlp_regressor import GaussianMLPRegressor 5 | 6 | 7 | class GaussianMLPBaseline(Baseline): 8 | def __init__(self, env_spec, subsample_factor=1., num_seq_inputs=1, regressor_args=None, ): 9 | self._subsample_factor = subsample_factor 10 | if regressor_args is None: 11 | regressor_args = dict() 12 | 13 | self._regressor = GaussianMLPRegressor( 14 | input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,), 15 | output_dim=1, 16 | name="vf", 17 | **regressor_args 18 | ) 19 | 20 | def fit(self, paths): 21 | # -- 22 | # Subsample before fitting. 
23 | if self._subsample_factor < 1: 24 | lst_rnd_idx = [] 25 | for path in paths: 26 | # Subsample index 27 | path_len = len(path['returns']) 28 | rnd_idx = np.random.choice(path_len, int(np.ceil(path_len * self._subsample_factor)), 29 | replace=False) 30 | lst_rnd_idx.append(rnd_idx) 31 | observations = np.concatenate([p["obs"][idx] for p, idx in zip(paths, lst_rnd_idx)]) 32 | returns = np.concatenate([p["returns"][idx] for p, idx in zip(paths, lst_rnd_idx)]) 33 | else: 34 | observations = np.concatenate([p["obs"] for p in paths]) 35 | returns = np.concatenate([p["returns"] for p in paths]) 36 | self._regressor.fit(observations, returns.reshape((-1, 1))) 37 | 38 | def predict(self, path): 39 | return self._regressor.predict(path["obs"]).flatten() 40 | 41 | def get_param_values(self, **tags): 42 | return self._regressor.get_param_values(**tags) 43 | 44 | def set_param_values(self, flattened_params, **tags): 45 | self._regressor.set_param_values(flattened_params, **tags) 46 | -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/linear_feature_baseline.py: -------------------------------------------------------------------------------- 1 | from .base import Baseline 2 | import numpy as np 3 | 4 | 5 | class LinearFeatureBaseline(Baseline): 6 | def __init__(self, reg_coeff=1e-5): 7 | self._coeffs = None 8 | self._reg_coeff = reg_coeff 9 | 10 | def get_param_values(self, **tags): 11 | return self._coeffs 12 | 13 | def set_param_values(self, val, **tags): 14 | self._coeffs = val 15 | 16 | @staticmethod 17 | def features(obs, rewards): 18 | o = np.clip(obs, -10, 10) # hidden defaults are evil. -- Ge 19 | l = len(rewards) 20 | al = np.arange(l).reshape(-1, 1) / 100.0 21 | return np.concatenate([o, o ** 2, al, al ** 2, al ** 3, np.ones((l, 1))], axis=1) 22 | 23 | def fit(self, obs, rewards, returns): 24 | """ 25 | Fits each path separately, from the state, the rewards, to the returns. 26 | 27 | Note: The signature of this function is questionable. Why pass in the opaque paths object, 28 | when we use the return, the reward, and the observation as feature? 29 | 30 | n -> timesteps, k -> rollouts, c -> features. 31 | 32 | :param obs: the observation with size(n, k, c) 33 | :param rewards: the rewards with size(n, k) 34 | :param returns: the returns with size(n, k) 35 | :return: The fitted 36 | """ 37 | obs = obs.swapaxes(0, 1) 38 | rewards = rewards.swapaxes(0, 1) 39 | featmat = np.concatenate([self.features(ob, r) for ob, r in zip(obs, rewards)]) 40 | returns = returns.swapaxes(0, 1).reshape(-1) 41 | reg_coeff = self._reg_coeff 42 | for _ in range(10): 43 | self._coeffs, *_ = np.linalg.lstsq( 44 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 45 | featmat.T.dot(returns), rcond=None, ) 46 | if not np.any(np.isnan(self._coeffs)): 47 | break 48 | reg_coeff *= 2 49 | 50 | def predict(self, obs, rewards): 51 | assert self._coeffs is not None, "need to fit the observation and rewards first." 
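        # note: mirrors `fit` above -- the same per-rollout feature matrix
        # [obs, obs^2, t/100, (t/100)^2, (t/100)^3, 1] is built and multiplied by the ridge-regression
        # coefficients, then reshaped back to (timesteps, envs) to match the sampler's layout.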
52 | n_timesteps, n_envs, *_ = rewards.shape 53 | obs = obs.swapaxes(0, 1) 54 | rewards = rewards.swapaxes(0, 1) 55 | featmat = np.concatenate([self.features(ob, r) for ob, r in zip(obs, rewards)]) 56 | return featmat.dot(self._coeffs).reshape(n_envs, n_timesteps).swapaxes(0, 1) 57 | -------------------------------------------------------------------------------- /e_maml_tf/value_baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .base import Baseline 3 | 4 | 5 | class ZeroBaseline(Baseline): 6 | def get_param_values(self, **kwargs): 7 | return None 8 | 9 | def set_param_values(self, val, **kwargs): 10 | pass 11 | 12 | def fit(self, paths): 13 | pass 14 | 15 | def predict(self, path): 16 | return np.zeros_like(path["rewards"]) 17 | -------------------------------------------------------------------------------- /e_maml_tf/wrappers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geyang/e-maml/336291d6819a82650d5bcc5f08dd431742897416/e_maml_tf/wrappers/__init__.py -------------------------------------------------------------------------------- /e_maml_tf/wrappers/k_index.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def k_index(env): 5 | """ 6 | add a k timestep index to the observation. Making value prediction and other 7 | modeling significantly easier. Should work with both single and batched environments, but not tested 8 | on the latter. 9 | 10 | :param env: 11 | :return: 12 | """ 13 | import gym 14 | gym.logger.set_level(40) 15 | _step = env.step 16 | _reset = env.reset 17 | 18 | assert isinstance(env.observation_space, gym.spaces.box.Box), 'we only support the box observation atm.' 19 | ks = None # the step counter 20 | 21 | def obfilt(obs): 22 | nonlocal ks 23 | if ks is None: 24 | ks = np.zeros(*obs.shape[:-1], 1) 25 | return np.concatenate([obs, ks], axis=-1) 26 | 27 | def step(vac): 28 | """ 29 | Apply sequence of actions to sequence of environments 30 | actions -> (observations, rewards, news) 31 | 32 | where 'news' is a boolean vector indicating whether each element is new. 
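
        In addition to stepping the wrapped env, this wrapper appends the per-episode step counter
        `k` to each observation (see `obfilt`) and zeroes the counter whenever `news` flags a new
        episode.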
33 | """ 34 | nonlocal ks 35 | obs, rewards, news, infos = _step(vac) 36 | _obs = obfilt(obs) 37 | ks = (1 - news) + ks * (1 - news) 38 | return _obs, rewards, news, infos 39 | 40 | def reset(): 41 | nonlocal ks 42 | obs = _reset() 43 | ks = None # clear the step counter 44 | return obfilt(obs) 45 | 46 | env.step = step 47 | env.reset = reset 48 | 49 | obs_space = env.observation_space 50 | env.observation_space = gym.spaces.box.Box( 51 | np.concatenate([obs_space.low, [0]]), 52 | np.concatenate([obs_space.high, [env.spec.max_episode_steps or None]]), # note: magic numbers are evil --Ge 53 | ) 54 | 55 | return env 56 | 57 | 58 | if __name__ == "__main__": 59 | import gym 60 | 61 | env = gym.make('Reacher-v2') 62 | print(env.observation_space) 63 | assert env.observation_space.shape == (11,), 'reacher is 11 dimensional' 64 | k_env = k_index(env) 65 | assert env.observation_space.shape == (12,), 'k_index adds 1 to the observation space' 66 | obs = k_env.reset() 67 | assert obs.shape == (12,), 'observation should agree with the type' 68 | -------------------------------------------------------------------------------- /e_maml_tf/wrappers/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 4 | 5 | 6 | # make synchronous interface for get call 7 | 8 | def worker(remote, parent_remote, env): 9 | parent_remote.close() 10 | env = env.x() if hasattr(env, 'x') else env() 11 | while True: 12 | try: 13 | cmd, data = remote.recv() 14 | if cmd == 'step': 15 | ob, reward, done, info = env.step(data) 16 | if done: 17 | ob = env.reset() 18 | remote.send((ob, reward, done, info)) 19 | elif cmd == 'get': 20 | remote.send(getattr(env, data)) 21 | elif cmd == 'close': 22 | remote.close() 23 | break # this terminates the process. 
24 | else: 25 | data = data or dict() 26 | args = data.get('args', tuple()) 27 | kwargs = data.get('kwargs', dict()) 28 | _ = getattr(env, cmd)(*args, **kwargs) 29 | remote.send(_) 30 | 31 | except EOFError as e: # process has ended from inside 32 | break # this terminates the process 33 | except BaseException as e: 34 | print(e) 35 | break 36 | 37 | 38 | class SubprocVecEnv: 39 | reset_on_done = True 40 | 41 | def __init__(self, env_fns): 42 | """ 43 | envs: list of gym environments to run in subprocesses 44 | """ 45 | self.waiting = False 46 | self.closed = False 47 | self.num_envs = len(env_fns) 48 | 49 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(self.num_envs)]) 50 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 51 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 52 | for p in self.ps: 53 | p.daemon = True # if the main process crashes, we should not cause things to hang 54 | p.start() 55 | for remote in self.work_remotes: 56 | remote.close() 57 | 58 | self.first = self.remotes[0] 59 | self.first.send(('get', 'action_space')) 60 | self.action_space = self.first.recv() 61 | self.first.send(('get', 'observation_space')) 62 | self.observation_space = self.first.recv() 63 | self.first.send(('get', 'spec')) 64 | self.spec = self.first.recv() 65 | 66 | def fork(self, n): 67 | from copy import copy 68 | _self = copy(self) 69 | _self.remotes = _self.remotes[:n] 70 | return _self 71 | 72 | def call_sync(self, fn_name, *args, **kwargs): 73 | _ = fn_name, dict(args=args, kwargs=kwargs) 74 | for remote in self.remotes: 75 | remote.send(_) 76 | try: 77 | return np.stack([remote.recv() for remote in self.remotes]) 78 | except EOFError as e: 79 | raise RuntimeError('Unknown Error has occurred with the environment.') from e 80 | 81 | def get(self, key): 82 | raise NotImplementedError('need to decide for self.first or all.') 83 | 84 | def step(self, actions): 85 | for remote, action in zip(self.remotes, actions): 86 | remote.send(('step', action)) 87 | obs, rews, dones, infos = zip(*[remote.recv() for remote in self.remotes]) 88 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 89 | 90 | def render(self, *args, **kwargs): 91 | return self.call_sync('render', *args, **kwargs) 92 | 93 | def step_async(self, actions): 94 | for remote, action in zip(self.remotes, actions): 95 | remote.send(('step', action)) 96 | self.waiting = True 97 | 98 | def step_wait(self): 99 | results = [remote.recv() for remote in self.remotes] 100 | self.waiting = False 101 | obs, rews, dones, infos = zip(*results) 102 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 103 | 104 | def reset(self): 105 | return self.call_sync('reset') 106 | 107 | def sample_task(self, *args, **kwargs): 108 | return self.call_sync('sample_task', *args, **kwargs) 109 | 110 | def reset_task(self): 111 | self.call_sync('reset_task') 112 | 113 | def close(self): 114 | """looks bad: mix sync and async handling.""" 115 | if self.closed: 116 | return 117 | if self.waiting: 118 | for remote in self.remotes: 119 | remote.recv() 120 | for remote in self.remotes: 121 | remote.send(('close', None)) 122 | for p in self.ps: 123 | p.join() 124 | self.closed = True 125 | 126 | def first_call_sync(self, fn_name, *args, **kwargs): 127 | self.first.send((fn_name, dict(args=args, kwargs=kwargs))) 128 | return self.first.recv() 129 | 130 | 131 | if __name__ == "__main__": 132 | def make_env(): 133 | import gym 134 | return gym.make('Reacher-v2') 
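    # note: this smoke test needs `mujoco-py` (Reacher-v2). It spins up six worker processes, resets
    # them through `call_sync`, then uses `fork(4)` to take a sub-batch -- the pattern described below
    # for rendering a few envs without paying for the whole batch.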
135 |
136 |
137 |     parallel_envs = SubprocVecEnv([make_env for i in range(6)])
138 |     obs = parallel_envs.reset()
139 |     assert len(obs) == 6, "the original should have 6 envs"
140 |
141 |     render_envs = parallel_envs.fork(4)
142 |     # note: here we test the `fork` method, useful for selecting a sub-batch for rendering purposes.
143 |     obs = render_envs.reset()
144 |     assert len(obs) == 4, "the forked env should have only 4 envs."
145 |
146 |     print('test complete.')
147 |
--------------------------------------------------------------------------------
/e_maml_tf/wrappers/vec_env_normalize.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def vec_normalize(envs, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99):
5 |     ob_rms = RunningMeanStd(shape=envs.observation_space.shape) if ob else None
6 |     ret_rms = RunningMeanStd(shape=()) if ret else None
7 |     ret = np.zeros(envs.num_envs)
8 |     gamma = gamma
9 |
10 |     _step = envs.step
11 |     _reset = envs.reset
12 |
13 |     def step(vac):
14 |         """
15 |         Apply sequence of actions to sequence of environments
16 |         actions -> (observations, rewards, news, info)
17 |
18 |         where 'news' is a boolean vector indicating whether each element is new.
19 |         """
20 |         nonlocal ret, ret_rms, cliprew
21 |         obs, rewards, news, infos = _step(vac)
22 |         _info = dict(reward_mean=rewards.mean(), reward_std=rewards.std())  # per-env infos are replaced by batch reward statistics
23 |         ret = ret * gamma + rewards
24 |         obs = _obfilt(obs)
25 |         if ret_rms:
26 |             ret_rms.update(ret)
27 |             rewards = np.clip(rewards / np.sqrt(ret_rms.var + 1e-8), -cliprew, cliprew)  # small epsilon guards against zero variance on early updates
28 |         return obs, rewards, news, _info
29 |
30 |     def _obfilt(obs):
31 |         nonlocal clipob
32 |         if ob_rms:
33 |             ob_rms.update(obs)
34 |             obs = np.clip((obs - ob_rms.mean) / np.sqrt(ob_rms.var + 1e-8), -clipob, clipob)
35 |             return obs
36 |         else:
37 |             return obs
38 |
39 |     def reset():
40 |         """
41 |         Reset all environments
42 |         """
43 |         obs = _reset()
44 |         return _obfilt(obs)
45 |
46 |     envs.step = step
47 |     envs.reset = reset
48 |
49 |     return envs
50 |
51 |
52 | class RunningMeanStd(object):
53 |     # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
54 |     def __init__(self, epsilon=1e-4, shape=()):
55 |         self.mean = np.zeros(shape, 'float64')
56 |         self.var = np.zeros(shape, 'float64')
57 |         self.count = epsilon
58 |
59 |     def update(self, x):
60 |         batch_mean = np.mean(x, axis=0)
61 |         batch_var = np.var(x, axis=0)
62 |         batch_count = x.shape[0]
63 |
64 |         delta = batch_mean - self.mean
65 |         tot_count = self.count + batch_count
66 |
67 |         new_mean = self.mean + delta * batch_count / tot_count
68 |         m_a = self.var * (self.count)
69 |         m_b = batch_var * (batch_count)
70 |         M2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count)
71 |         new_var = M2 / (self.count + batch_count)
72 |
73 |         new_count = batch_count + self.count
74 |
75 |         self.mean = new_mean
76 |         self.var = new_var
77 |         self.count = new_count
78 |
79 |
80 | def test_runningmeanstd():
81 |     for (x1, x2, x3) in [
82 |         (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
83 |         (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
84 |     ]:
85 |         rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
86 |
87 |         x = np.concatenate([x1, x2, x3], axis=0)
88 |         ms1 = [x.mean(axis=0), x.var(axis=0)]
89 |         rms.update(x1)
90 |         rms.update(x2)
91 |         rms.update(x3)
92 |         ms2 = [rms.mean, rms.var]
93 |
94 |         assert np.allclose(ms1, ms2)
95 |
--------------------------------------------------------------------------------
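The two wrappers above are designed to compose: `vec_normalize` monkey-patches `step` and `reset` on any vectorized environment that exposes `num_envs`, `observation_space`, and the usual step API, which `SubprocVecEnv` does. Below is a minimal usage sketch, not part of the repository; it assumes `gym` and `mujoco-py` are installed and reuses the `Reacher-v2` environment from the test block above.

```python
# Usage sketch (illustrative, not from the repo): normalize a batch of subprocess envs.
import numpy as np

from e_maml_tf.wrappers.subproc_vec_env import SubprocVecEnv
from e_maml_tf.wrappers.vec_env_normalize import vec_normalize


def make_env():
    import gym
    return gym.make('Reacher-v2')


if __name__ == '__main__':
    envs = vec_normalize(SubprocVecEnv([make_env for _ in range(4)]), ob=True, ret=True)
    obs = envs.reset()  # observations filtered by the running mean/std, shape (4, obs_dim)
    actions = np.stack([envs.action_space.sample() for _ in range(4)])
    # rewards come back scaled by the running return std; `info` is the wrapper's dict of
    # batch reward statistics, not the per-env info dicts.
    obs, rewards, dones, info = envs.step(actions)
    envs.close()
```

Because `vec_normalize` patches the instance in place and returns it, any code that already holds a reference to the `SubprocVecEnv` sees the normalized behavior as well.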
/jaynes-template.yml:
--------------------------------------------------------------------------------
1 | version: 0
2 | hosts:
3 |   slurm: &slurm-host
4 |     ip:
5 |     username:
6 |     password:
7 |     port: 22
8 | mounts:
9 |   - !mounts.SSHCode &e_maml_code
10 |     <<: *slurm-host
11 |     local_path: .
12 |     host_path: /usr/home//jaynes-mounts/e_maml_tf/{now:%Y-%m-%d}/{now:%H%M%S.%f}/e_maml_tf
13 |     pypath: true
14 |     excludes: >-
15 |       --exclude='data'
16 |       --exclude='samples'
17 |       --exclude='figures'
18 |       --exclude='results'
19 |       --exclude='analysis'
20 |       --exclude='*__pycache__'
21 |       --exclude='*.git'
22 |       --exclude='*.idea'
23 |       --exclude='*.egg-info'
24 |       --exclude='*.pkl'
25 |     compress: true
26 | default: &default_run
27 |   mounts:
28 |     - *e_maml_code
29 |   runner: &default_runner
30 |     # example environment configuration for MuJoCo simulators
31 |     envs: >-
32 |       LC_CTYPE=en_US.UTF-8
33 |       PYTHONPATH=$PYTHONPATH:/user/home//.local/lib/python3.6/site-packages/
34 |       LD_LIBRARY_PATH=/user/home//.mujoco/mujoco200/bin:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH
35 |     # this is the setup script. To see when/where this is run, use `jaynes.config(verbose=True)`.
36 |     pypath: "{mounts[0].host_path}"
37 |     launch_directory: "{mounts[0].host_path}"
38 |     partition: "p100"  # p100 | max12hours | cpu
39 |     setup: |  # The tensorflow-gpu module fails without cuda (or GPU)
40 |       . /usr/share/modules/init/profile.sh
41 |       module load pytorch-36
42 |     entry_script: "/pkgs/anaconda3/bin/python -u -m jaynes.entry"
43 |     args:
44 |       - preserve-env
45 |     n_cpu: 8
46 |     n_gpu: 0
47 |     time_limit: "6:0:0"
48 |     output: all
49 |     mem: 8G
50 |     comment: ICLR-2019
51 |     name: LeaF
52 |   launch:
53 |     type: ssh
54 |     <<: *slurm-host
55 |     host: !host
56 |     log_dir: /user/home//jaynes-mounts/leaf/{now:%Y-%m-%d}/{now:%H%M%S.%f}
57 | modes:
58 |   default:
59 |     <<: *default_run
60 |     runner:
61 |       !runners.Slurm
62 |       <<: *default_runner
63 |       partition: cpu
64 |       n_cpu: 8
65 |       n_gpu: 0
66 |   default-gpu:
67 |     <<: *default_run
68 |     runner:
69 |       !runners.Slurm
70 |       <<: *default_runner
71 |       setup: |  # this is not really used.
72 |         . /usr/share/modules/init/profile.sh
73 |         module load tensorflow-gpu-36
74 |       partition: "p100"
75 |       n_cpu: 8
76 |       n_gpu: 1
77 |       exclude: gpu027
78 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | .
2 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open('requirements.txt', 'r') as f:
4 |     dependencies = f.read()
5 |
6 | setup(name='leaf',
7 |       packages=find_packages(),
8 |       install_requires=[
9 |           "numpy",
10 |           "mpi4py",
11 |           "scipy",
12 |           "pandas",
13 |           "gym",
14 |           "baselines",
15 |           "tqdm",
16 |           "params-proto",
17 |           "tensorflow-gpu",
18 |           "ml-logger",
19 |           "moleskin",
20 |           "jaynes",
21 |           "pyyaml",
22 |           "waterbear",
23 |           "dill",
24 |           "mock",
25 |           "mujoco-py",
26 |       ],
27 |       description='E-MAML, and RL-MAML baseline implemented in Tensorflow v1',
28 |       author='Ge Yang',
29 |       url='https://github.com/episodeyang/e-maml',
30 |       author_email='ge.ike.yang@gmail.com',
31 |       version='0.0.1')
32 |
--------------------------------------------------------------------------------
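`jaynes-template.yml` above is only a template: a concrete `jaynes.yml` is derived from it by filling in the host IP, username, and paths. The snippet below is a hypothetical launch script showing how such a config is typically consumed. The mode names `default` and `default-gpu` come from the template itself, and `jaynes.config(verbose=True)` is referenced by the template's own comment; the remaining calls, their signatures, and the stand-in `train` function are assumptions about the `jaynes` package and may differ between versions.

```python
# Hypothetical launch sketch -- the jaynes calls below are assumptions about its API,
# except jaynes.config(verbose=True), which the template's comment itself mentions.
import jaynes


def train(seed=100):
    """Stand-in for the real training entry point; replace with the e_maml_tf trainer."""
    print(f"training with seed {seed}")


if __name__ == "__main__":
    # assumption: the first argument selects a mode defined under `modes:` in jaynes.yml
    jaynes.config("default", verbose=True)
    jaynes.run(train, seed=100)  # assumption: ships `train` to the Slurm host defined by the mode
    jaynes.listen()              # assumption: streams stdout/stderr back from the remote job
```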