├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── autodoc_reqs.txt ├── data │ └── MAMLlogic.png ├── requirements.txt └── source │ ├── MAMLlogic.png │ ├── conf.py │ ├── index.rst │ └── modules │ ├── baselines │ └── baselines.rst │ ├── envs │ └── envs.rst │ ├── meta_algos │ └── meta_algos.rst │ ├── meta_policy_search.rst │ ├── optimizers │ └── optimizers.rst │ ├── policies │ ├── policies.distributions.rst │ └── policies.rst │ └── samplers │ └── samplers.rst ├── experiment_utils ├── config.py ├── experiment.py ├── run_sweep.py └── utils.py ├── meta_policy_search ├── __init__.py ├── baselines │ ├── __init__.py │ ├── base.py │ ├── linear_baseline.py │ └── zero_baseline.py ├── envs │ ├── __init__.py │ ├── base.py │ ├── mujoco_envs │ │ ├── ant_rand_direc.py │ │ ├── ant_rand_direc_2d.py │ │ ├── ant_rand_goal.py │ │ ├── half_cheetah_rand_direc.py │ │ ├── half_cheetah_rand_vel.py │ │ ├── humanoid_rand_direc.py │ │ ├── humanoid_rand_direc_2d.py │ │ ├── swimmer_rand_vel.py │ │ ├── walker2d_rand_direc.py │ │ └── walker2d_rand_vel.py │ ├── normalized_env.py │ ├── point_envs │ │ ├── corner_goals_point_env_2d.py │ │ ├── point_env_2d.py │ │ ├── point_env_2d_corner.py │ │ ├── point_env_2d_momentum.py │ │ ├── point_env_2d_v2.py │ │ └── point_env_2d_walls.py │ └── sawyer_envs │ │ ├── sawyer_door.py │ │ ├── sawyer_pick_and_place.py │ │ ├── sawyer_push.py │ │ └── sawyer_push_simple.py ├── meta_algos │ ├── __init__.py │ ├── base.py │ ├── dice_maml.py │ ├── pro_mp.py │ ├── trpo_maml.py │ ├── vpg_dice_maml.py │ └── vpg_maml.py ├── meta_trainer.py ├── optimizers │ ├── __init__.py │ ├── base.py │ ├── conjugate_gradient_optimizer.py │ └── maml_first_order_optimizer.py ├── policies │ ├── __init__.py │ ├── base.py │ ├── distributions │ │ ├── __init__.py │ │ ├── base.py │ │ └── diagonal_gaussian.py │ ├── gaussian_mlp_policy.py │ ├── meta_gaussian_mlp_policy.py │ └── networks │ │ ├── __init__.py │ │ └── mlp.py ├── samplers │ ├── __init__.py │ ├── base.py │ ├── dice_sample_processor.py │ ├── meta_sample_processor.py │ ├── meta_sampler.py │ ├── utils.py │ └── vectorized_env_executor.py └── utils │ ├── __init__.py │ ├── logger.py │ ├── serializable.py │ └── utils.py ├── requirements.txt ├── run_scripts ├── e-maml_run_mujoco.py ├── maml_run_mujoco.py ├── pro-mp_run_mujoco.py └── pro-mp_run_point_mass.py ├── setup.py └── tests ├── __init__.py ├── test_baselines.py ├── test_integration.py ├── test_optimizers.py ├── test_policies.py └── test_samplers.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea/ 108 | 109 | # sphinx doc 110 | /docs/builds 111 | 112 | # Data 113 | /data 114 | 115 | # mjkey 116 | /docker/mjkey.txt 117 | 118 | #env file 119 | .env 120 | 121 | # sandbox 122 | /sandbox 123 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.5" 5 | - "3.5-dev" # 3.5 development branch 6 | - "3.6" 7 | - "3.6-dev" # 3.6 development branch 8 | 9 | before_install: 10 | - sudo apt-get install -y libopenmpi-dev wget unzip 11 | #- sudo apt-get install -y curl git libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 12 | #- wget https://www.roboti.us/download/mjpro150_linux.zip 13 | #- unzip mjpro150_linux.zip -d /home/travis/.mujoco 14 | #- echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mjpro150/bin" >> /home/travis/.bashrc 15 | #- source /home/travis/.bashrc 16 | 17 | # command to install dependencies 18 | install: 19 | - pip install . 20 | - pip install mpi4py click 21 | 22 | # command to run tests 23 | script: 24 | - python -m tests.test_baselines 25 | - python -m tests.test_optimizers 26 | - python -m tests.test_policies 27 | - python -m tests.test_samplers 28 | - python -m tests.test_integration 29 | 30 | notifications: 31 | email: 32 | recipients: 33 | - jonas.rothfuss@gmail.com 34 | on_success: never 35 | on_failure: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jonas Rothfuss, Ignasi Clavera, Dennis Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://api.travis-ci.com/jonasrothfuss/ProMP.svg?branch=master)](https://travis-ci.com/jonasrothfuss/ProMP) 2 | [![Docs](https://readthedocs.org/projects/promp/badge/?version=latest)](https://promp.readthedocs.io) 3 | 4 | # ProMP: Proximal Meta-Policy Search 5 | Implementations corresponding to ProMP ([Rothfuss et al., 2018](https://arxiv.org/abs/1810.06784)). 6 | Overall this repository consists of two branches: 7 | 8 | 1) master: lightweight branch that provides the necessary code to run Meta-RL algorithms such as ProMP, E-MAML, and MAML. This branch is meant to provide an easy start with Meta-RL and can be integrated into other projects and setups. 9 | 2) full-code: branch that provides the comprehensive code that was used to produce the experimental results in [Rothfuss et al. (2018)](https://arxiv.org/abs/1810.06784). 10 | This includes experiment scripts and plotting scripts that can be used to reproduce the experimental results in the paper. 11 | 12 | 13 | The code is written in Python 3 and builds on [Tensorflow](https://www.tensorflow.org/). 14 | Many of the provided reinforcement learning environments require the [Mujoco](http://www.mujoco.org/) physics engine. 15 | Overall the code was developed with modularity and computational efficiency in mind. 16 | Many components of the Meta-RL algorithm are parallelized using either [MPI](https://mpi4py.readthedocs.io/en/stable/) 17 | or [Tensorflow](https://www.tensorflow.org/) in order to ensure efficient use of all CPU cores. 18 | 19 | ## Documentation 20 | 21 | An API specification and explanation of the code components can be found [here](https://promp.readthedocs.io/en/latest/). 22 | The documentation can also be built locally by running the following commands: 23 | 24 | ``` 25 | # ensure that you are in the root folder of the project 26 | cd docs 27 | # install the sphinx documentation tool dependencies 28 | pip install -r requirements.txt 29 | # build the documentation 30 | make clean && make html 31 | # now the html documentation can be found under docs/build/html/index.html 32 | ``` 33 | 34 | ## Installation / Dependencies 35 | The provided code can either be run A) in the docker container provided by us or B) using python on 36 | your local machine. The latter requires multiple installation steps in order to set up the dependencies. 37 | 38 | ### A. Docker 39 | If not installed yet, [set up](https://docs.docker.com/install/) docker on your machine. 40 | Pull our docker container ``jonasrothfuss/promp`` from docker-hub: 41 | 42 | ``` 43 | docker pull jonasrothfuss/promp 44 | ``` 45 | 46 | All the necessary dependencies are already installed inside the docker container. 47 | 48 | ### B. Anaconda or Virtualenv 49 | 50 | ##### B.1.
Installing MPI 51 | Ensure that you have a working MPI implementation ([see here](https://mpi4py.readthedocs.io/en/stable/install.html) for more instructions). 52 | 53 | For Ubuntu you can install MPI through the package manager: 54 | 55 | ``` 56 | sudo apt-get install libopenmpi-dev 57 | ``` 58 | 59 | ##### B.2. Create either a venv or a conda environment and activate it 60 | 61 | ###### Virtualenv 62 | ``` 63 | pip install --upgrade virtualenv 64 | virtualenv <venv-name> 65 | source <venv-name>/bin/activate 66 | ``` 67 | 68 | ###### Anaconda 69 | If not done yet, install [anaconda](https://www.anaconda.com/) by following the instructions [here](https://www.anaconda.com/download/#linux). 70 | Then create an anaconda environment, activate it and install the requirements in [`requirements.txt`](requirements.txt). 71 | ``` 72 | conda create -n <env-name> python=3.6 73 | source activate <env-name> 74 | ``` 75 | 76 | ##### B.3. Install the required python dependencies 77 | ``` 78 | pip install -r requirements.txt 79 | ``` 80 | 81 | ##### B.4. Set up the Mujoco physics engine and mujoco-py 82 | For running the majority of the provided Meta-RL environments, the Mujoco physics engine as well as a 83 | corresponding python wrapper are required. 84 | For setting up [Mujoco](http://www.mujoco.org/) and [mujoco-py](https://github.com/openai/mujoco-py), 85 | please follow the instructions [here](https://github.com/openai/mujoco-py). 86 | 87 | 88 | 89 | ## Running ProMP 90 | In order to run the ProMP algorithm in the point environment (no Mujoco needed) with default configurations execute: 91 | ``` 92 | python run_scripts/pro-mp_run_point_mass.py 93 | ``` 94 | 95 | To run the ProMP algorithm in a Mujoco environment with default configurations: 96 | ``` 97 | python run_scripts/pro-mp_run_mujoco.py 98 | ``` 99 | 100 | The run configuration can be changed either in the run script directly or by providing a JSON configuration file with all 101 | the necessary hyperparameters. A JSON configuration file can be provided through the `--config_file` flag. Additionally, the dump path 102 | can be specified through the `--dump_path` flag: 103 | 104 | ``` 105 | python run_scripts/pro-mp_run.py --config_file <config_file_path> --dump_path <dump_path> 106 | ``` 107 | 108 | Additionally, in order to run the gradient-based meta-learning methods MAML and E-MAML ([Finn et al., 2017](https://arxiv.org/abs/1703.03400) and 109 | [Stadie et al., 2018](https://arxiv.org/abs/1803.01118)) in a Mujoco environment with the default configuration, 110 | execute, respectively: 111 | ``` 112 | python run_scripts/maml_run_mujoco.py 113 | python run_scripts/e-maml_run_mujoco.py 114 | ``` 115 | ## Cite 116 | To cite ProMP please use: 117 | ``` 118 | @article{rothfuss2018promp, 119 | title={ProMP: Proximal Meta-Policy Search}, 120 | author={Rothfuss, Jonas and Lee, Dennis and Clavera, Ignasi and Asfour, Tamim and Abbeel, Pieter}, 121 | journal={arXiv preprint arXiv:1810.06784}, 122 | year={2018} 123 | } 124 | ``` 125 | 126 | ## Acknowledgements 127 | This repository includes environments introduced in ([Duan et al., 2016](https://arxiv.org/abs/1611.02779), 128 | [Finn et al., 2017](https://arxiv.org/abs/1703.03400)). 129 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line.
5 | SPHINXOPTS = 6 | SPHINXBUILD = python3 -m sphinx 7 | SPHINXPROJ = maml-zoo 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/autodoc_reqs.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tensorflow==1.8.0 -------------------------------------------------------------------------------- /docs/data/MAMLlogic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/docs/data/MAMLlogic.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-rtd-theme 3 | sphinxcontrib-napoleon 4 | sphinxcontrib-websupport 5 | 6 | joblib==0.12.2 7 | PyPrind 8 | numpy 9 | scipy 10 | gym==0.10.5 11 | python_dateutil 12 | tensorflow -------------------------------------------------------------------------------- /docs/source/MAMLlogic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/docs/source/MAMLlogic.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'meta_policy_search' 23 | copyright = '2018, Dennis Lee, Ignasi Clavera, Jonas Rothfuss' 24 | author = 'Dennis Lee, Ignasi Clavera, Jonas Rothfuss' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 
41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.doctest', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.viewcode', 47 | # 'sphinx.ext.githubpages', 48 | 'sphinx.ext.napoleon' 49 | ] 50 | 51 | # Add any paths that contain templates here, relative to this directory. 52 | templates_path = ['.templates'] 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # 57 | # source_suffix = ['.rst', '.md'] 58 | source_suffix = '.rst' 59 | 60 | # The master toctree document. 61 | master_doc = 'index' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path . 73 | exclude_patterns = [] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. See the documentation for 82 | # a list of builtin themes. 83 | # 84 | html_theme = 'sphinx_rtd_theme' 85 | 86 | # Theme options are theme-specific and customize the look and feel of a theme 87 | # further. For a list of options available for each theme, see the 88 | # documentation. 89 | # 90 | # html_theme_options = {} 91 | 92 | # Add any paths that contain custom static files (such as style sheets) here, 93 | # relative to this directory. They are copied after the builtin static files, 94 | # so a file named "default.css" will overwrite the builtin "default.css". 95 | html_static_path = ['.static'] 96 | 97 | # Custom sidebar templates, must be a dictionary that maps document names 98 | # to template names. 99 | # 100 | # The default sidebars (for documents that don't match any pattern) are 101 | # defined by theme itself. Builtin themes are using these templates by 102 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 103 | # 'searchbox.html']``. 104 | # 105 | # html_sidebars = {} 106 | 107 | 108 | # -- Options for HTMLHelp output --------------------------------------------- 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'maml-zoodoc' 112 | 113 | 114 | # -- Options for LaTeX output ------------------------------------------------ 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | 121 | # The font size ('10pt', '11pt' or '12pt'). 122 | # 123 | # 'pointsize': '10pt', 124 | 125 | # Additional stuff for the LaTeX preamble. 126 | # 127 | # 'preamble': '', 128 | 129 | # Latex figure (float) alignment 130 | # 131 | # 'figure_align': 'htbp', 132 | } 133 | 134 | # Grouping the document tree into LaTeX files. List of tuples 135 | # (source start file, target name, title, 136 | # author, documentclass [howto, manual, or own class]). 
137 | latex_documents = [ 138 | (master_doc, 'meta_policy_search.tex', 'meta_policy_search Documentation', 139 | 'Dennis Lee, Ignasi Clavera, Jonas Rothfuss', 'manual'), 140 | ] 141 | 142 | 143 | # -- Options for manual page output ------------------------------------------ 144 | 145 | # One entry per manual page. List of tuples 146 | # (source start file, name, description, authors, manual section). 147 | man_pages = [ 148 | (master_doc, 'meta_policy_search', 'meta_policy_search Documentation', 149 | [author], 1) 150 | ] 151 | 152 | 153 | # -- Options for Texinfo output ---------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'meta_policy_search', 'meta_policy_search Documentation', 160 | author, 'meta_policy_search', 'One line description of project.', 161 | 'Miscellaneous'), 162 | ] 163 | 164 | 165 | # -- Options for Epub output ------------------------------------------------- 166 | 167 | # Bibliographic Dublin Core info. 168 | epub_title = project 169 | epub_author = author 170 | epub_publisher = author 171 | epub_copyright = copyright 172 | 173 | # The unique identifier of the text. This can be a ISBN number 174 | # or the project homepage. 175 | # 176 | # epub_identifier = '' 177 | 178 | # A unique identification for the text. 179 | # 180 | # epub_uid = '' 181 | 182 | # A list of files that should not be packed into the epub file. 183 | epub_exclude_files = ['search.html'] 184 | 185 | 186 | # -- Extension configuration ------------------------------------------------- 187 | 188 | autodoc_mock_imports = ["gym"] -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. maml-zoo documentation master file, created by 2 | sphinx-quickstart on Mon Aug 13 09:57:59 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Meta-Policy Search's documentation! 7 | ============================================== 8 | 9 | Despite recent progress, deep reinforcement learning (RL) still relies heavily on hand-crafted features and reward functions 10 | as well as engineered, problem-specific inductive bias. Meta-RL aims to forego such reliance by acquiring inductive bias 11 | in a data-driven manner. A particular instance of meta-learning that has proven successful in RL is gradient-based meta-learning. 12 | 13 | The code repository provides implementations of various gradient-based Meta-RL methods, including: 14 | 15 | - ProMP: Proximal Meta-Policy Search (`Rothfuss et al., 2018`_) 16 | - MAML: Model Agnostic Meta-Learning (`Finn et al., 2017`_) 17 | - E-MAML: Exploration MAML (`Al-Shedivat et al., 2018`_, `Stadie et al., 2018`_) 18 | 19 | The code was written as part of ProMP_. Further information and experimental results can be found on our website_. 20 | This documentation specifies the API and interaction of the algorithm's components. Overall, one iteration of 21 | gradient-based Meta-RL consists of the following steps (see the sketch below): 22 | 23 | 1. Sample trajectories with the pre-update policy 24 | 2. Perform a gradient step for each task to obtain an updated/adapted policy 25 | 3. Sample trajectories with the updated/adapted policy 26 | 4. Perform a meta-policy optimization step, changing the pre-update policy parameters
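The following self-contained toy sketch mirrors these four steps with a 1-step REINFORCE inner update and
a first-order (FOMAML-style) outer update on a trivial 2D goal-reaching task. It only illustrates the loop
structure; the toy task, names and hyperparameters are made up, and it does not use the ``Trainer``, sampler
or policy classes of this repository.

.. code-block:: python

    import numpy as np

    def sample_tasks(n_tasks):
        # each task is a random 2D goal position
        return [np.random.uniform(-1.0, 1.0, size=2) for _ in range(n_tasks)]

    def sample_batch(theta, goal, n_samples=50, sigma=0.3):
        # toy "policy": action = theta + Gaussian noise, reward = -distance to the goal
        actions = theta + sigma * np.random.randn(n_samples, 2)
        rewards = -np.linalg.norm(actions - goal, axis=1)
        return actions, rewards

    def policy_gradient(theta, actions, rewards, sigma=0.3):
        # REINFORCE estimate of the gradient of the expected reward w.r.t. theta
        advantages = rewards - rewards.mean()
        grad_log_pi = (actions - theta) / sigma ** 2
        return (grad_log_pi * advantages[:, None]).mean(axis=0)

    theta = np.zeros(2)                                  # pre-update policy parameters
    inner_lr, outer_lr, meta_batch_size = 0.1, 0.05, 10

    for iteration in range(100):
        tasks = sample_tasks(meta_batch_size)
        outer_grads = []
        for goal in tasks:
            # 1. sample trajectories with the pre-update policy
            actions, rewards = sample_batch(theta, goal)
            # 2. perform a gradient step per task to obtain the adapted policy
            theta_adapted = theta + inner_lr * policy_gradient(theta, actions, rewards)
            # 3. sample trajectories with the adapted policy
            actions, rewards = sample_batch(theta_adapted, goal)
            # first-order approximation: reuse the post-update gradient for the meta-update
            outer_grads.append(policy_gradient(theta_adapted, actions, rewards))
        # 4. meta-policy optimization step on the pre-update parameters
        theta = theta + outer_lr * np.mean(outer_grads, axis=0)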
27 | 28 | This high-level structure of the algorithm is implemented in the Meta-Trainer class. The overall structure and interaction 29 | of the code components are depicted in the following figure: 30 | 31 | 32 | .. image:: MAMLlogic.png 33 | :width: 600 34 | 35 | .. _ProMP: https://arxiv.org/abs/1810.06784 36 | 37 | .. _Rothfuss et al., 2018: https://arxiv.org/abs/1810.06784 38 | 39 | .. _Finn et al., 2017: https://arxiv.org/abs/1703.03400 40 | 41 | .. _Stadie et al., 2018: https://arxiv.org/pdf/1803.01118.pdf 42 | 43 | .. _Al-Shedivat et al., 2018: https://arxiv.org/abs/1710.03641 44 | 45 | .. _website: https://sites.google.com/view/pro-mp/ 46 | 47 | .. toctree:: 48 | :maxdepth: 3 49 | :caption: Contents: 50 | 51 | modules/meta_policy_search.rst 52 | 53 | Indices and tables 54 | ================== 55 | 56 | * :ref:`genindex` 57 | * :ref:`modindex` 58 | -------------------------------------------------------------------------------- /docs/source/modules/baselines/baselines.rst: -------------------------------------------------------------------------------- 1 | Baselines 2 | =========================== 3 | 4 | .. automodule:: meta_policy_search.baselines 5 | 6 | Baseline (Interface) 7 | --------------------------- 8 | .. autoclass:: Baseline 9 | :members: 10 | 11 | Linear Feature Baseline 12 | --------------------------- 13 | .. autoclass:: LinearFeatureBaseline 14 | :members: 15 | :inherited-members: 16 | 17 | LinearTimeBaseline 18 | --------------------------- 19 | .. autoclass:: LinearTimeBaseline 20 | :members: 21 | :inherited-members: -------------------------------------------------------------------------------- /docs/source/modules/envs/envs.rst: -------------------------------------------------------------------------------- 1 | Environments 2 | ====================== 3 | 4 | .. automodule:: meta_policy_search.envs.base 5 | 6 | MetaEnv (Interface) 7 | -------------------------- 8 | 9 | .. autoclass:: MetaEnv 10 | :members: 11 | 12 | -------------------------------------------------------------------------------- /docs/source/modules/meta_algos/meta_algos.rst: -------------------------------------------------------------------------------- 1 | Meta-Algorithms 2 | ============================= 3 | 4 | .. automodule:: meta_policy_search.meta_algos 5 | 6 | MAML-Algorithm (Interface) 7 | ------------------------------ 8 | 9 | .. autoclass:: MAMLAlgo 10 | :members: 11 | :inherited-members: 12 | :show-inheritance: 13 | 14 | ProMP-Algorithm 15 | ------------------------------ 16 | 17 | .. autoclass:: ProMP 18 | :members: 19 | :show-inheritance: 20 | :inherited-members: 21 | 22 | TRPO-MAML-Algorithm 23 | ------------------------------ 24 | 25 | .. autoclass:: TRPOMAML 26 | :members: 27 | :show-inheritance: 28 | :inherited-members: 29 | 30 | VPG-MAML-Algorithm 31 | ------------------------------ 32 | 33 | .. autoclass:: VPGMAML 34 | :members: 35 | :show-inheritance: 36 | :inherited-members: -------------------------------------------------------------------------------- /docs/source/modules/meta_policy_search.rst: -------------------------------------------------------------------------------- 1 | Meta-Policy Search 2 | ================================= 3 | 4 | .. toctree:: 5 | 6 | baselines/baselines 7 | envs/envs 8 | meta_algos/meta_algos 9 | optimizers/optimizers 10 | policies/policies 11 | samplers/samplers 12 | 13 | 14 | Meta-Trainer 15 | ---------------------------------- 16 | 17 | ..
automodule:: meta_policy_search.meta_trainer 18 | 19 | .. autoclass:: Trainer 20 | :members: 21 | :inherited-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/modules/optimizers/optimizers.rst: -------------------------------------------------------------------------------- 1 | Optimizers 2 | ============================ 3 | 4 | .. automodule:: meta_policy_search.optimizers 5 | 6 | Conjugate Gradient Optimizer 7 | ----------------------------- 8 | .. autoclass:: ConjugateGradientOptimizer 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | MAML First Order Optimizer 14 | ----------------------------- 15 | .. autoclass:: MAMLFirstOrderOptimizer 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/modules/policies/policies.distributions.rst: -------------------------------------------------------------------------------- 1 | maml\_zoo.policies.distributions package 2 | ======================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | maml\_zoo.policies.distributions.base module 8 | -------------------------------------------- 9 | 10 | .. automodule:: meta_policy_search.policies.distributions.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | maml\_zoo.policies.distributions.diagonal\_gaussian module 16 | ---------------------------------------------------------- 17 | 18 | .. automodule:: meta_policy_search.policies.distributions.diagonal_gaussian 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: meta_policy_search.policies.distributions 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/modules/policies/policies.rst: -------------------------------------------------------------------------------- 1 | Policies 2 | ========================== 3 | 4 | Policy Interfaces 5 | -------------------------- 6 | .. automodule:: meta_policy_search.policies 7 | 8 | .. autoclass:: Policy 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | .. autoclass:: MetaPolicy 14 | :members: 15 | :inherited-members: 16 | :show-inheritance: 17 | 18 | 19 | Gaussian-Policies 20 | -------------------------- 21 | 22 | .. autoclass:: GaussianMLPPolicy 23 | :members: 24 | :inherited-members: 25 | :show-inheritance: 26 | 27 | .. autoclass:: MetaGaussianMLPPolicy 28 | :members: 29 | :inherited-members: 30 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/modules/samplers/samplers.rst: -------------------------------------------------------------------------------- 1 | Samplers 2 | ========================== 3 | 4 | .. automodule:: meta_policy_search.samplers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Sampler 10 | ------------------------- 11 | 12 | .. autoclass:: Sampler 13 | :members: 14 | :inherited-members: 15 | :show-inheritance: 16 | 17 | .. autoclass:: MetaSampler 18 | :members: 19 | :inherited-members: 20 | :show-inheritance: 21 | 22 | Sample Processor 23 | ------------------------- 24 | 25 | .. autoclass:: SampleProcessor 26 | :members: 27 | :inherited-members: 28 | :show-inheritance: 29 | 30 | .. 
autoclass:: DiceSampleProcessor 31 | :members: 32 | :inherited-members: 33 | :show-inheritance: 34 | 35 | .. autoclass:: MetaSampleProcessor 36 | :members: 37 | :inherited-members: 38 | :show-inheritance: 39 | 40 | Vectorized Environment Executor 41 | ------------------------------- 42 | 43 | .. automodule:: meta_policy_search.samplers.vectorized_env_executor 44 | 45 | .. autoclass:: MetaIterativeEnvExecutor 46 | :members: 47 | :inherited-members: 48 | :show-inheritance: 49 | 50 | .. autoclass:: MetaParallelEnvExecutor 51 | :members: 52 | :inherited-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /experiment_utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 4 | 5 | DOCKER_MOUNT_DIR = '/root/code/data' 6 | 7 | DATA_DIR = os.path.join(BASE_DIR, 'data') 8 | 9 | DOCKER_IMAGE = 'dennisl88/maml_zoo' 10 | 11 | S3_BUCKET_NAME = 'maml-zoo-experiments' -------------------------------------------------------------------------------- /experiment_utils/experiment.py: -------------------------------------------------------------------------------- 1 | # Copied from doodad/run_experiment_lite_doodad.py 2 | import os 3 | import pickle 4 | import base64 5 | import argparse 6 | 7 | ARGS_DATA = 'DOODAD_ARGS_DATA' 8 | USE_CLOUDPICKLE = 'DOODAD_USE_CLOUDPICKLE' 9 | CLOUDPICKLE_VERSION = 'DOODAD_CLOUDPICKLE_VERSION' 10 | 11 | __ARGS = None 12 | def __get_arg_config(): 13 | """ 14 | global __ARGS 15 | if __ARGS is not None: 16 | return __ARGS 17 | #TODO: use environment variables rather than command-line arguments 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--use_cloudpickle', type=bool, default=False) 20 | parser.add_argument('--'+ARGS_DATA, type=str, default='') 21 | parser.add_argument('--output_dir', type=str, default='/tmp/expt/') 22 | args = parser.parse_args() 23 | __ARGS = args 24 | """ 25 | args_data = os.environ.get(ARGS_DATA, {}) 26 | cloudpickle_version = os.environ.get(CLOUDPICKLE_VERSION, 'n/a') 27 | use_cloudpickle = bool(int(os.environ.get(USE_CLOUDPICKLE, '0'))) 28 | 29 | args = lambda : None # hack - use function as namespace 30 | args.args_data = args_data 31 | args.use_cloudpickle = use_cloudpickle 32 | args.cloudpickle_version = cloudpickle_version 33 | return args 34 | 35 | def get_args(key=None, default=None): 36 | args = __get_arg_config() 37 | 38 | if args.args_data: 39 | if args.use_cloudpickle: 40 | import cloudpickle 41 | assert args.cloudpickle_version == cloudpickle.__version__, "Cloudpickle versions do not match! 
(host) %s vs (remote) %s" % (args.cloudpickle_version, cloudpickle.__version__) 42 | data = cloudpickle.loads(base64.b64decode(args.args_data)) 43 | else: 44 | data = pickle.loads(base64.b64decode(args.args_data)) 45 | else: 46 | data = {} 47 | 48 | if key is not None: 49 | return data.get(key, default) 50 | return data 51 | 52 | def encode_args(call_args, cloudpickle=False): 53 | """ 54 | Encode call_args dictionary as a base64 string 55 | """ 56 | assert isinstance(call_args, dict) 57 | 58 | if cloudpickle: 59 | import cloudpickle 60 | cpickle_version = cloudpickle.__version__ 61 | data = base64.b64encode(cloudpickle.dumps(call_args)).decode("utf-8") 62 | else: 63 | data = base64.b64encode(pickle.dumps(call_args)).decode("utf-8") 64 | cpickle_version = 'n/a' 65 | return data, cpickle_version 66 | 67 | # These are arguments passed in from launch_python 68 | args_dict = get_args() 69 | print('My args are:', args_dict) 70 | 71 | -------------------------------------------------------------------------------- /experiment_utils/run_sweep.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import itertools 5 | 6 | from experiment_utils import config 7 | from experiment_utils.utils import query_yes_no 8 | 9 | import doodad as dd 10 | import doodad.mount as mount 11 | import doodad.easy_sweep.launcher as launcher 12 | from doodad.easy_sweep.hyper_sweep import run_sweep_doodad 13 | 14 | def run_sweep(run_experiment, sweep_params, exp_name, instance_type='c4.xlarge'): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--mode', type=str, default='local', 17 | help='Mode for running the experiments - local: runs on local machine, ' 18 | 'ec2: runs on AWS ec2 cluster (requires a proper configuration file)') 19 | 20 | args = parser.parse_args(sys.argv[1:]) 21 | 22 | local_mount = mount.MountLocal(local_dir=config.BASE_DIR, pythonpath=True) 23 | 24 | docker_mount_point = os.path.join(config.DOCKER_MOUNT_DIR, exp_name) 25 | 26 | sweeper = launcher.DoodadSweeper([local_mount], docker_img=config.DOCKER_IMAGE, docker_output_dir=docker_mount_point, 27 | local_output_dir=os.path.join(config.DATA_DIR, 'local', exp_name)) 28 | sweeper.mount_out_s3 = mount.MountS3(s3_path='', mount_point=docker_mount_point, output=True) 29 | 30 | if args.mode == 'ec2': 31 | print("\n" + "**********" * 10 + "\nexp_prefix: {}\nvariants: {}".format(exp_name, len(list(itertools.product(*[value for value in sweep_params.values()]))))) 32 | 33 | if query_yes_no("Continue?"): 34 | sweeper.run_sweep_ec2(run_experiment, sweep_params, bucket_name=config.S3_BUCKET_NAME, instance_type=instance_type, 35 | region='us-west-1', s3_log_name=exp_name, add_date_to_logname=False) 36 | 37 | elif args.mode == 'local_docker': 38 | mode_docker = dd.mode.LocalDocker( 39 | image=sweeper.image, 40 | ) 41 | run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_docker, 42 | mounts=sweeper.mounts) 43 | 44 | elif args.mode == 'local': 45 | sweeper.run_sweep_serial(run_experiment, sweep_params) 46 | 47 | elif args.mode == 'local_singularity': 48 | mode_singularity = dd.mode.LocalSingularity( 49 | image='~/meta_policy_search.simg') 50 | run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_singularity, 51 | mounts=sweeper.mounts) 52 | else: 53 | raise NotImplementedError -------------------------------------------------------------------------------- /experiment_utils/utils.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def query_yes_no(question, default="no", allow_skip=False): 4 | """Ask a yes/no question via raw_input() and return their answer. 5 | 6 | "question" is a string that is presented to the user. 7 | "default" is the presumed answer if the user just hits . 8 | It must be "yes" (the default), "no" or None (meaning 9 | an answer is required of the user). 10 | 11 | The "answer" return value is True for "yes" or False for "no". 12 | """ 13 | valid = {"yes": True, "y": True, "ye": True, 14 | "no": False, "n": False} 15 | if allow_skip: 16 | valid["skip"] = "skip" 17 | if default is None: 18 | prompt = " [y/n] " 19 | elif default == "yes": 20 | prompt = " [Y/n] " 21 | elif default == "no": 22 | prompt = " [y/N] " 23 | else: 24 | raise ValueError("invalid default answer: '%s'" % default) 25 | if allow_skip: 26 | prompt += " or skip" 27 | while True: 28 | sys.stdout.write(question + prompt) 29 | choice = input().lower() 30 | if default is not None and choice == '': 31 | return valid[default] 32 | elif choice in valid: 33 | return valid[choice] 34 | else: 35 | sys.stdout.write("Please respond with 'yes' or 'no' " 36 | "(or 'y' or 'n').\n") -------------------------------------------------------------------------------- /meta_policy_search/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class MyTestCase(unittest.TestCase): 5 | def test_something(self): 6 | self.assertEqual(True, False) 7 | 8 | 9 | if __name__ == '__main__': 10 | unittest.main() -------------------------------------------------------------------------------- /meta_policy_search/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 3 | from meta_policy_search.baselines.linear_baseline import LinearTimeBaseline -------------------------------------------------------------------------------- /meta_policy_search/baselines/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Baseline(): 5 | """ 6 | Reward baseline interface 7 | """ 8 | 9 | def get_param_values(self): 10 | """ 11 | Returns the parameter values of the baseline object 12 | 13 | """ 14 | raise NotImplementedError 15 | 16 | def set_params(self, value): 17 | """ 18 | Sets the parameter values of the baseline object 19 | 20 | Args: 21 | value: parameter value to be set 22 | 23 | """ 24 | raise NotImplementedError 25 | 26 | def fit(self, paths): 27 | """ 28 | Fits the baseline model with the provided paths 29 | 30 | Args: 31 | paths: list of paths 32 | 33 | """ 34 | raise NotImplementedError 35 | 36 | def predict(self, path): 37 | """ 38 | Predicts the reward baselines for a provided trajectory / path 39 | 40 | Args: 41 | path: dict of lists/numpy array containing trajectory / path information 42 | such as "observations", "rewards", ... 
43 | 44 | Returns: numpy array of the same length as paths["observations"] specifying the reward baseline 45 | 46 | """ 47 | raise NotImplementedError 48 | 49 | def log_diagnostics(self, paths, prefix): 50 | """ 51 | Log extra information per iteration based on the collected paths 52 | """ 53 | pass -------------------------------------------------------------------------------- /meta_policy_search/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | from meta_policy_search.utils.serializable import Serializable 3 | import numpy as np 4 | 5 | 6 | class LinearBaseline(Baseline): 7 | """ 8 | Abstract class providing the functionality for fitting a linear baseline 9 | Don't instantiate this class. Instead use LinearFeatureBaseline or LinearTimeBaseline 10 | """ 11 | 12 | def __init__(self, reg_coeff=1e-5): 13 | super(LinearBaseline, self).__init__() 14 | self._coeffs = None 15 | self._reg_coeff = reg_coeff 16 | 17 | def predict(self, path): 18 | """ 19 | Abstract Class for the LinearFeatureBaseline and the LinearTimeBaseline 20 | Predicts the linear reward baselines estimates for a provided trajectory / path. 21 | If the baseline is not fitted - returns zero baseline 22 | 23 | Args: 24 | path (dict): dict of lists/numpy array containing trajectory / path information 25 | such as "observations", "rewards", ... 26 | 27 | Returns: 28 | (np.ndarray): numpy array of the same length as paths["observations"] specifying the reward baseline 29 | 30 | """ 31 | if self._coeffs is None: 32 | return np.zeros(len(path["observations"])) 33 | return self._features(path).dot(self._coeffs) 34 | 35 | def get_param_values(self, **tags): 36 | """ 37 | Returns the parameter values of the baseline object 38 | 39 | Returns: 40 | numpy array of linear_regression coefficients 41 | 42 | """ 43 | return self._coeffs 44 | 45 | def set_params(self, value, **tags): 46 | """ 47 | Sets the parameter values of the baseline object 48 | 49 | Args: 50 | value: numpy array of linear_regression coefficients 51 | 52 | """ 53 | self._coeffs = value 54 | 55 | def fit(self, paths, target_key='returns'): 56 | """ 57 | Fits the linear baseline model with the provided paths via damped least squares 58 | 59 | Args: 60 | paths (list): list of paths 61 | target_key (str): path dictionary key of the target that shall be fitted (e.g. "returns") 62 | 63 | """ 64 | assert all([target_key in path.keys() for path in paths]) 65 | 66 | featmat = np.concatenate([self._features(path) for path in paths], axis=0) 67 | target = np.concatenate([path[target_key] for path in paths], axis=0) 68 | reg_coeff = self._reg_coeff 69 | for _ in range(5): 70 | self._coeffs = np.linalg.lstsq( 71 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 72 | featmat.T.dot(target), 73 | rcond=-1 74 | )[0] 75 | if not np.any(np.isnan(self._coeffs)): 76 | break 77 | reg_coeff *= 10 78 | 79 | def _features(self, path): 80 | raise NotImplementedError("this is an abstract class, use either LinearFeatureBaseline or LinearTimeBaseline") 81 | 82 | 83 | class LinearFeatureBaseline(LinearBaseline): 84 | """ 85 | Linear (polynomial) time-state dependent return baseline model 86 | (see. Duan et al. 
2016, "Benchmarking Deep Reinforcement Learning for Continuous Control", ICML) 87 | 88 | Fits the following linear model 89 | 90 | reward = b0 + b1*obs + b2*obs^2 + b3*t + b4*t^2+ b5*t^3 91 | 92 | Args: 93 | reg_coeff: list of paths 94 | 95 | """ 96 | def __init__(self, reg_coeff=1e-5): 97 | super(LinearFeatureBaseline, self).__init__() 98 | self._coeffs = None 99 | self._reg_coeff = reg_coeff 100 | 101 | def _features(self, path): 102 | obs = np.clip(path["observations"], -10, 10) 103 | path_length = len(path["observations"]) 104 | time_step = np.arange(path_length).reshape(-1, 1) / 100.0 105 | return np.concatenate([obs, obs ** 2, time_step, time_step ** 2, time_step ** 3, np.ones((path_length, 1))], 106 | axis=1) 107 | 108 | 109 | class LinearTimeBaseline(LinearBaseline): 110 | """ 111 | Linear (polynomial) time-dependent reward baseline model 112 | 113 | Fits the following linear model 114 | 115 | reward = b0 + b3*t + b4*t^2+ b5*t^3 116 | 117 | Args: 118 | reg_coeff: list of paths 119 | 120 | """ 121 | 122 | def _features(self, path): 123 | path_length = len(path["observations"]) 124 | time_step = np.arange(path_length).reshape(-1, 1) / 100.0 125 | return np.concatenate([time_step, time_step ** 2, time_step ** 3, np.ones((path_length, 1))], 126 | axis=1) 127 | 128 | -------------------------------------------------------------------------------- /meta_policy_search/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | import numpy as np 3 | 4 | 5 | class ZeroBaseline(Baseline): 6 | """ 7 | Dummy baseline 8 | """ 9 | 10 | def __init__(self): 11 | super(ZeroBaseline, self).__init__() 12 | 13 | def get_param_values(self, **kwargs): 14 | """ 15 | Returns the parameter values of the baseline object 16 | 17 | Returns: 18 | (None): coefficients of the baseline 19 | 20 | """ 21 | return None 22 | 23 | def set_param_values(self, value, **kwargs): 24 | """ 25 | Sets the parameter values of the baseline object 26 | 27 | Args: 28 | value (None): coefficients of the baseline 29 | 30 | """ 31 | pass 32 | 33 | def fit(self, paths, **kwargs): 34 | """ 35 | Improves the quality of zeroes output by baseline 36 | 37 | Args: 38 | paths: list of paths 39 | 40 | """ 41 | pass 42 | 43 | def predict(self, path): 44 | """ 45 | Produces some zeroes 46 | 47 | Args: 48 | path (dict): dict of lists/numpy array containing trajectory / path information 49 | such as "observations", "rewards", ... 
50 | 51 | Returns: 52 | (np.ndarray): numpy array of the same length as paths["observations"] specifying the reward baseline 53 | 54 | """ 55 | return np.zeros_like(path["rewards"]) -------------------------------------------------------------------------------- /meta_policy_search/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv -------------------------------------------------------------------------------- /meta_policy_search/envs/base.py: -------------------------------------------------------------------------------- 1 | from gym.core import Env 2 | from gym.envs.mujoco import MujocoEnv 3 | import numpy as np 4 | 5 | 6 | class MetaEnv(Env): 7 | """ 8 | Wrapper around OpenAI gym environments, interface for meta learning 9 | """ 10 | 11 | def sample_tasks(self, n_tasks): 12 | """ 13 | Samples task of the meta-environment 14 | 15 | Args: 16 | n_tasks (int) : number of different meta-tasks needed 17 | 18 | Returns: 19 | tasks (list) : an (n_tasks) length list of tasks 20 | """ 21 | raise NotImplementedError 22 | 23 | def set_task(self, task): 24 | """ 25 | Sets the specified task to the current environment 26 | 27 | Args: 28 | task: task of the meta-learning environment 29 | """ 30 | raise NotImplementedError 31 | 32 | def get_task(self): 33 | """ 34 | Gets the task that the agent is performing in the current environment 35 | 36 | Returns: 37 | task: task of the meta-learning environment 38 | """ 39 | raise NotImplementedError 40 | 41 | def log_diagnostics(self, paths, prefix): 42 | """ 43 | Logs env-specific diagnostic information 44 | 45 | Args: 46 | paths (list) : list of all paths collected with this env during this iteration 47 | prefix (str) : prefix for logger 48 | """ 49 | pass 50 | 51 | class RandomEnv(MetaEnv, MujocoEnv): 52 | """ 53 | This class provides functionality for randomizing the physical parameters of a mujoco model 54 | The following parameters are changed: 55 | - body_mass 56 | - body_inertia 57 | - damping coeff at the joints 58 | """ 59 | RAND_PARAMS = ['body_mass', 'dof_damping', 'body_inertia', 'geom_friction'] 60 | RAND_PARAMS_EXTENDED = RAND_PARAMS + ['geom_size'] 61 | 62 | def __init__(self, log_scale_limit, *args, rand_params=RAND_PARAMS, **kwargs): 63 | super(RandomEnv, self).__init__(*args, **kwargs) 64 | assert set(rand_params) <= set(self.RAND_PARAMS_EXTENDED), \ 65 | "rand_params must be a subset of " + str(self.RAND_PARAMS_EXTENDED) 66 | self.log_scale_limit = log_scale_limit 67 | self.rand_params = rand_params 68 | self.save_parameters() 69 | 70 | def sample_tasks(self, n_tasks): 71 | """ 72 | Generates randomized parameter sets for the mujoco env 73 | 74 | Args: 75 | n_tasks (int) : number of different meta-tasks needed 76 | 77 | Returns: 78 | tasks (list) : an (n_tasks) length list of tasks 79 | """ 80 | param_sets = [] 81 | 82 | for _ in range(n_tasks): 83 | # body mass -> one multiplier for all body parts 84 | 85 | new_params = {} 86 | 87 | if 'body_mass' in self.rand_params: 88 | body_mass_multiplyers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.body_mass.shape) 89 | new_params['body_mass'] = self.init_params['body_mass'] * body_mass_multiplyers 90 | 91 | # body_inertia 92 | if 'body_inertia' in self.rand_params: 93 | body_inertia_multiplyers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.body_inertia.shape) 94 | new_params['body_inertia'] = 
body_inertia_multiplyers * self.init_params['body_inertia'] 95 | 96 | # damping -> different multiplier for different dofs/joints 97 | if 'dof_damping' in self.rand_params: 98 | dof_damping_multipliers = np.array(1.3) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.dof_damping.shape) 99 | new_params['dof_damping'] = np.multiply(self.init_params['dof_damping'], dof_damping_multipliers) 100 | 101 | # friction at the body components 102 | if 'geom_friction' in self.rand_params: 103 | dof_damping_multipliers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.geom_friction.shape) 104 | new_params['geom_friction'] = np.multiply(self.init_params['geom_friction'], dof_damping_multipliers) 105 | 106 | param_sets.append(new_params) 107 | 108 | return param_sets 109 | 110 | def set_task(self, task): 111 | for param, param_val in task.items(): 112 | param_variable = getattr(self.model, param) 113 | assert param_variable.shape == param_val.shape, 'shapes of new parameter value and old one must match' 114 | setattr(self.model, param, param_val) 115 | self.cur_params = task 116 | 117 | def get_task(self): 118 | return self.cur_params 119 | 120 | def save_parameters(self): 121 | self.init_params = {} 122 | if 'body_mass' in self.rand_params: 123 | self.init_params['body_mass'] = self.model.body_mass 124 | 125 | # body_inertia 126 | if 'body_inertia' in self.rand_params: 127 | self.init_params['body_inertia'] = self.model.body_inertia 128 | 129 | # damping -> different multiplier for different dofs/joints 130 | if 'dof_damping' in self.rand_params: 131 | self.init_params['dof_damping'] = self.model.dof_damping 132 | 133 | # friction at the body components 134 | if 'geom_friction' in self.rand_params: 135 | self.init_params['geom_friction'] = self.model.geom_friction 136 | self.cur_params = self.init_params -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | from meta_policy_search.utils import logger 5 | import gym 6 | 7 | 8 | class AntRandDirecEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self, goal_direction=None): 10 | self.goal_direction = goal_direction if goal_direction else 1.0 11 | MujocoEnv.__init__(self, 'ant.xml', 5) 12 | gym.utils.EzPickle.__init__(self) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_direction = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_direction 31 | 32 | def step(self, a): 33 | xposbefore = self.get_body_com("torso")[0] 34 | self.do_simulation(a, self.frame_skip) 35 | xposafter = self.get_body_com("torso")[0] 36 | forward_reward = self.goal_direction * (xposafter - xposbefore)/self.dt 37 | ctrl_cost = .5 * np.square(a).sum() 38 | contact_cost = 0.5 * 1e-3 * np.sum( 39 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 40 | survive_reward = 1.0 41 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 42 | state = 
self.state_vector() 43 | notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 44 | done = not notdone 45 | ob = self._get_obs() 46 | return ob, reward, done, dict( 47 | reward_forward=forward_reward, 48 | reward_ctrl=-ctrl_cost, 49 | reward_contact=-contact_cost, 50 | reward_survive=survive_reward) 51 | 52 | def _get_obs(self): 53 | return np.concatenate([ 54 | self.sim.data.qpos.flat[2:], 55 | self.sim.data.qvel.flat, 56 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 57 | ]) 58 | 59 | def reset_model(self): 60 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 61 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 62 | self.set_state(qpos, qvel) 63 | return self._get_obs() 64 | 65 | def viewer_setup(self): 66 | self.viewer.cam.distance = self.model.stat.extent * 0.5 67 | 68 | def log_diagnostics(self, paths, prefix=''): 69 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 70 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 71 | 72 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 73 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 74 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 75 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 76 | 77 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_direc_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | from meta_policy_search.utils import logger 5 | import gym 6 | 7 | 8 | class AntRandDirec2DEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self): 10 | self.set_task(self.sample_tasks(1)[0]) 11 | MujocoEnv.__init__(self, 'ant.xml', 5) 12 | gym.utils.EzPickle.__init__(self) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | directions = np.random.normal(size=(n_tasks, 2)) 17 | directions /= np.linalg.norm(directions, axis=1)[..., np.newaxis] 18 | return directions 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | self.goal_direction = task 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.goal_direction 33 | 34 | def step(self, a): 35 | posbefore = np.copy(self.get_body_com("torso")[:2]) 36 | self.do_simulation(a, self.frame_skip) 37 | posafter = self.get_body_com("torso")[:2] 38 | forward_reward = np.sum(self.goal_direction * (posafter - posbefore))/self.dt 39 | ctrl_cost = .5 * np.square(a).sum() 40 | contact_cost = 0.5 * 1e-3 * np.sum( 41 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 42 | survive_reward = 1.0 43 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 44 | state = self.state_vector() 45 | notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
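        # descriptive note: the episode terminates once the torso height (state[2]) leaves the [0, 1] range or the state becomes non-finite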
46 | done = not notdone 47 | ob = self._get_obs() 48 | return ob, reward, done, dict( 49 | reward_forward=forward_reward, 50 | reward_ctrl=-ctrl_cost, 51 | reward_contact=-contact_cost, 52 | reward_survive=survive_reward) 53 | 54 | def _get_obs(self): 55 | return np.concatenate([ 56 | self.sim.data.qpos.flat[2:], 57 | self.sim.data.qvel.flat, 58 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 59 | ]) 60 | 61 | def reset_model(self): 62 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 63 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 64 | self.set_state(qpos, qvel) 65 | return self._get_obs() 66 | 67 | def viewer_setup(self): 68 | self.viewer.cam.distance = self.model.stat.extent * 0.5 69 | 70 | def log_diagnostics(self, paths, prefix=''): 71 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 72 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 73 | 74 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 75 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 76 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 77 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 78 | 79 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) 80 | 81 | 82 | if __name__ == "__main__": 83 | env = AntRandDirec2DEnv() 84 | while True: 85 | task = env.sample_tasks(1)[0] 86 | env.set_task(task) 87 | env.reset() 88 | for _ in range(100): 89 | env.render() 90 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_goal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class AntRandGoalEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'ant.xml', 5) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | a = np.random.random(n_tasks) * 2 * np.pi 15 | r = 3 * np.random.random(n_tasks) ** 0.5 16 | return np.stack((r * np.cos(a), r * np.sin(a)), axis=-1) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_pos = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_pos 31 | 32 | def step(self, a): 33 | self.do_simulation(a, self.frame_skip) 34 | xposafter = self.get_body_com("torso") 35 | goal_reward = -np.sum(np.abs(xposafter[:2] - self.goal_pos)) # make it happy, not suicidal 36 | ctrl_cost = .1 * np.square(a).sum() 37 | contact_cost = 0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 38 | # survive_reward = 1.0 39 | survive_reward = 0.0 40 | reward = goal_reward - ctrl_cost - contact_cost + survive_reward 41 | state = self.state_vector() 42 | # notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
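        # descriptive note: the usual Ant alive-check is commented out here; the goal-reaching task never terminates early and episodes run for the full horizon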
43 | # done = not notdone 44 | done = False 45 | ob = self._get_obs() 46 | return ob, reward, done, dict( 47 | reward_forward=goal_reward, 48 | reward_ctrl=-ctrl_cost, 49 | reward_contact=-contact_cost, 50 | reward_survive=survive_reward) 51 | 52 | def _get_obs(self): 53 | return np.concatenate([ 54 | self.sim.data.qpos.flat, 55 | self.sim.data.qvel.flat, 56 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 57 | ]) 58 | 59 | def reset_model(self): 60 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 61 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 62 | self.set_state(qpos, qvel) 63 | return self._get_obs() 64 | 65 | def viewer_setup(self): 66 | self.viewer.cam.distance = self.model.stat.extent * 0.5 67 | 68 | def log_diagnostics(self, paths, prefix=''): 69 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 70 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 71 | 72 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 73 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 74 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 75 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 76 | 77 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) 78 | 79 | 80 | if __name__ == "__main__": 81 | env = AntRandGoalEnv() 82 | while True: 83 | env.reset() 84 | for _ in range(100): 85 | env.render() 86 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/half_cheetah_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | 8 | class HalfCheetahRandDirecEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self, goal_direction=None): 10 | self.goal_direction = goal_direction if goal_direction else 1.0 11 | MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 12 | gym.utils.EzPickle.__init__(self, goal_direction) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_direction = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_direction 31 | 32 | def step(self, action): 33 | xposbefore = self.sim.data.qpos[0] 34 | self.do_simulation(action, self.frame_skip) 35 | xposafter = self.sim.data.qpos[0] 36 | ob = self._get_obs() 37 | reward_ctrl = - 0.5 * 0.1 * np.square(action).sum() 38 | reward_run = self.goal_direction * (xposafter - xposbefore) / self.dt 39 | reward = reward_ctrl + reward_run 40 | done = False 41 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 42 | 43 | def _get_obs(self): 44 | return np.concatenate([ 45 | self.sim.data.qpos.flat[1:], 46 | self.sim.data.qvel.flat, 47 | ]) 48 | 49 | def reset_model(self): 50 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 51 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 52 | self.set_state(qpos, qvel) 53 | return 
self._get_obs() 54 | 55 | def viewer_setup(self): 56 | self.viewer.cam.distance = self.model.stat.extent * 0.5 57 | 58 | def log_diagnostics(self, paths, prefix=''): 59 | fwrd_vel = [path["env_infos"]['reward_run'] for path in paths] 60 | final_fwrd_vel = [path["env_infos"]['reward_run'][-1] for path in paths] 61 | ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths] 62 | 63 | logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel)) 64 | logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel)) 65 | logger.logkv(prefix + 'AvgCtrlCost', np.std(ctrl_cost)) 66 | 67 | def __str__(self): 68 | return 'HalfCheetahRandDirecEnv' -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/half_cheetah_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class HalfCheetahRandVelEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | return np.random.uniform(0.0, 3.0, (n_tasks, )) 15 | 16 | def set_task(self, task): 17 | """ 18 | Args: 19 | task: task of the meta-learning environment 20 | """ 21 | self.goal_velocity = task 22 | 23 | def get_task(self): 24 | """ 25 | Returns: 26 | task: task of the meta-learning environment 27 | """ 28 | return self.goal_velocity 29 | 30 | def step(self, action): 31 | xposbefore = self.sim.data.qpos[0] 32 | self.do_simulation(action, self.frame_skip) 33 | xposafter = self.sim.data.qpos[0] 34 | ob = self._get_obs() 35 | reward_ctrl = - 0.5 * 0.1 * np.square(action).sum() 36 | forward_vel = (xposafter - xposbefore) / self.dt 37 | reward_run = - np.abs(forward_vel - self.goal_velocity) 38 | reward = reward_ctrl + reward_run 39 | done = False 40 | return ob, reward, done, dict(forward_vel=forward_vel, reward_run=reward_run, reward_ctrl=reward_ctrl) 41 | 42 | def _get_obs(self): 43 | return np.concatenate([ 44 | self.sim.data.qpos.flat[1:], 45 | self.sim.data.qvel.flat, 46 | ]) 47 | 48 | def reset_model(self): 49 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 50 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 51 | self.set_state(qpos, qvel) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.distance = self.model.stat.extent * 0.5 56 | 57 | def log_diagnostics(self, paths, prefix=''): 58 | fwrd_vel = [path["env_infos"]['forward_vel'] for path in paths] 59 | final_fwrd_vel = [path["env_infos"]['forward_vel'][-1] for path in paths] 60 | ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths] 61 | 62 | logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel)) 63 | logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel)) 64 | logger.logkv(prefix + 'AvgCtrlCost', np.std(ctrl_cost)) 65 | -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/humanoid_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | def 
mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 10 | 11 | class HumanoidRandDirecEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 12 | def __init__(self): 13 | self.set_task(self.sample_tasks(1)[0]) 14 | MujocoEnv.__init__(self, 'humanoid.xml', 5) 15 | gym.utils.EzPickle.__init__(self) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | self.goal_direction = task 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.goal_direction 33 | 34 | def _get_obs(self): 35 | data = self.sim.data 36 | return np.concatenate([data.qpos.flat[2:], 37 | data.qvel.flat, 38 | data.cinert.flat, 39 | data.cvel.flat, 40 | data.qfrc_actuator.flat, 41 | data.cfrc_ext.flat]) 42 | 43 | def step(self, a): 44 | pos_before = mass_center(self.model, self.sim)[0] 45 | self.do_simulation(a, self.frame_skip) 46 | pos_after = mass_center(self.model, self.sim)[0] 47 | alive_bonus = 5.0 48 | data = self.sim.data 49 | lin_vel_cost = 0.25 * self.goal_direction * (pos_after - pos_before) / self.model.opt.timestep 50 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 51 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 52 | quad_impact_cost = min(quad_impact_cost, 10) 53 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 54 | qpos = self.sim.data.qpos 55 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 56 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 57 | 58 | def reset_model(self): 59 | c = 0.01 60 | self.set_state( 61 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 62 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 63 | ) 64 | return self._get_obs() 65 | 66 | def viewer_setup(self): 67 | self.viewer.cam.trackbodyid = 1 68 | self.viewer.cam.distance = self.model.stat.extent * 1.0 69 | self.viewer.cam.elevation = -20 70 | 71 | if __name__ == "__main__": 72 | env = HumanoidRandDirecEnv() 73 | while True: 74 | env.reset() 75 | for _ in range(200): 76 | env.render() 77 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/humanoid_rand_direc_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | def mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 10 | 11 | class HumanoidRandDirec2DEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 12 | def __init__(self): 13 | self.set_task(self.sample_tasks(1)[0]) 14 | MujocoEnv.__init__(self, 'humanoid.xml', 5) 15 | gym.utils.EzPickle.__init__(self) 16 | 17 | def sample_tasks(self, n_tasks): 18 | directions = np.random.normal(size=(n_tasks, 2)) 19 | directions /= np.linalg.norm(directions, axis=1)[..., np.newaxis] 20 | return directions 21 | 22 | def set_task(self, task): 23 | """ 24 | Args: 25 | task: task of the meta-learning environment 
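(here a 2D unit vector specifying the desired heading in the xy-plane)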
26 | """ 27 | self.goal_direction = task 28 | 29 | def get_task(self): 30 | """ 31 | Returns: 32 | task: task of the meta-learning environment 33 | """ 34 | return self.goal_direction 35 | 36 | def _get_obs(self): 37 | data = self.sim.data 38 | return np.concatenate([data.qpos.flat[2:], 39 | data.qvel.flat, 40 | data.cinert.flat, 41 | data.cvel.flat, 42 | data.qfrc_actuator.flat, 43 | data.cfrc_ext.flat]) 44 | 45 | def step(self, a): 46 | pos_before = np.copy(mass_center(self.model, self.sim)[:2]) 47 | self.do_simulation(a, self.frame_skip) 48 | pos_after = mass_center(self.model, self.sim)[:2] 49 | alive_bonus = 5.0 50 | data = self.sim.data 51 | lin_vel_cost = 0.25 * np.sum(self.goal_direction * (pos_after - pos_before)) / self.model.opt.timestep 52 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 53 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 54 | quad_impact_cost = min(quad_impact_cost, 10) 55 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 56 | qpos = self.sim.data.qpos 57 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 58 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 59 | 60 | def reset_model(self): 61 | c = 0.01 62 | self.set_state( 63 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 64 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 65 | ) 66 | return self._get_obs() 67 | 68 | def viewer_setup(self): 69 | self.viewer.cam.trackbodyid = 1 70 | self.viewer.cam.distance = self.model.stat.extent * 1.0 71 | self.viewer.cam.elevation = -20 72 | 73 | if __name__ == "__main__": 74 | env = HumanoidRandDirec2DEnv() 75 | while True: 76 | env.reset() 77 | for _ in range(200): 78 | env.render() 79 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action 80 | print(reward) -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/swimmer_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from meta_policy_search.utils import logger 4 | from meta_policy_search.envs.base import MetaEnv 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class SwimmerRandVelEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'swimmer.xml', 4) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 15 | return np.random.uniform(0.1, 0.2, (n_tasks, )) 16 | 17 | def set_task(self, task): 18 | """ 19 | Args: 20 | task: task of the meta-learning environment 21 | """ 22 | self.goal_vel = task 23 | 24 | def get_task(self): 25 | """ 26 | Returns: 27 | task: task of the meta-learning environment 28 | """ 29 | return self.goal_vel 30 | 31 | def step(self, a): 32 | ctrl_cost_coeff = 0.0001 33 | xposbefore = self.sim.data.qpos[0] 34 | self.do_simulation(a, self.frame_skip) 35 | xposafter = self.sim.data.qpos[0] 36 | reward_fwd = np.abs((xposafter - xposbefore) / self.dt - self.goal_vel) 37 | reward_ctrl = - ctrl_cost_coeff * np.square(a).sum() 38 | reward = reward_fwd + reward_ctrl 39 | ob = self._get_obs() 40 | return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl) 41 | 42 | def _get_obs(self): 43 | qpos = 
self.sim.data.qpos 44 | qvel = self.sim.data.qvel 45 | return np.concatenate([qpos.flat[2:], qvel.flat]) 46 | 47 | def reset_model(self): 48 | self.set_state( 49 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), 50 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv) 51 | ) 52 | return self._get_obs() 53 | 54 | def log_diagnostics(self, paths, prefix=''): 55 | progs = [ 56 | path["observations"][-1][-3] - path["observations"][0][-3] 57 | for path in paths 58 | ] 59 | logger.record_tabular(prefix + 'AverageForwardProgress', np.mean(progs)) 60 | logger.record_tabular(prefix + 'MaxForwardProgress', np.max(progs)) 61 | logger.record_tabular(prefix + 'MinForwardProgress', np.min(progs)) 62 | logger.record_tabular(prefix + 'StdForwardProgress', np.std(progs)) 63 | 64 | -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/walker2d_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | class Walker2DRandDirecEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 7 | def __init__(self): 8 | self.set_task(self.sample_tasks(1)[0]) 9 | MujocoEnv.__init__(self, 'walker2d.xml', 8) 10 | gym.utils.EzPickle.__init__(self) 11 | 12 | def sample_tasks(self, n_tasks): 13 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 14 | 15 | def set_task(self, task): 16 | """ 17 | Args: 18 | task: task of the meta-learning environment 19 | """ 20 | self.goal_direction = task 21 | 22 | def get_task(self): 23 | """ 24 | Returns: 25 | task: task of the meta-learning environment 26 | """ 27 | return self.goal_direction 28 | 29 | def step(self, a): 30 | posbefore = self.sim.data.qpos[0] 31 | self.do_simulation(a, self.frame_skip) 32 | posafter, height, ang = self.sim.data.qpos[0:3] 33 | alive_bonus = 1.0 34 | reward = (self.goal_direction * (posafter - posbefore) / self.dt) 35 | reward += alive_bonus 36 | reward -= 1e-3 * np.square(a).sum() 37 | done = not (height > 0.8 and height < 2.0 and 38 | ang > -1.0 and ang < 1.0) 39 | ob = self._get_obs() 40 | return ob, reward, done, {} 41 | 42 | def _get_obs(self): 43 | qpos = self.sim.data.qpos 44 | qvel = self.sim.data.qvel 45 | return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel() 46 | 47 | def reset_model(self): 48 | self.set_state( 49 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 50 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 51 | ) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.trackbodyid = 2 56 | self.viewer.cam.distance = self.model.stat.extent * 0.5 57 | 58 | if __name__ == "__main__": 59 | env = Walker2DRandDirecEnv() 60 | while True: 61 | env.reset() 62 | for _ in range(200): 63 | env.render() 64 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/walker2d_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | class Walker2DRandVelEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 7 | def __init__(self): 8 | 
self.set_task(self.sample_tasks(1)[0]) 9 | MujocoEnv.__init__(self, 'walker2d.xml', 8) 10 | gym.utils.EzPickle.__init__(self) 11 | 12 | def sample_tasks(self, n_tasks): 13 | return np.random.uniform(0.0, 10.0, (n_tasks, )) 14 | 15 | def set_task(self, task): 16 | """ 17 | Args: 18 | task: task of the meta-learning environment 19 | """ 20 | self.goal_velocity = task 21 | 22 | def get_task(self): 23 | """ 24 | Returns: 25 | task: task of the meta-learning environment 26 | """ 27 | return self.goal_velocity 28 | 29 | def step(self, a): 30 | posbefore = self.sim.data.qpos[0] 31 | self.do_simulation(a, self.frame_skip) 32 | posafter, height, ang = self.sim.data.qpos[0:3] 33 | alive_bonus = 15.0 34 | forward_vel = (posafter - posbefore) / self.dt 35 | reward = - np.abs(forward_vel - self.goal_velocity) 36 | reward += alive_bonus 37 | reward -= 1e-3 * np.square(a).sum() 38 | done = not (height > 0.8 and height < 2.0 and 39 | ang > -1.0 and ang < 1.0) 40 | ob = self._get_obs() 41 | return ob, reward, done, {} 42 | 43 | def _get_obs(self): 44 | qpos = self.sim.data.qpos 45 | qvel = self.sim.data.qvel 46 | return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel() 47 | 48 | def reset_model(self): 49 | self.set_state( 50 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 51 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 52 | ) 53 | return self._get_obs() 54 | 55 | def viewer_setup(self): 56 | self.viewer.cam.trackbodyid = 2 57 | self.viewer.cam.distance = self.model.stat.extent * 0.5 58 | 59 | if __name__ == "__main__": 60 | env = Walker2DRandVelEnv() 61 | while True: 62 | env.reset() 63 | for _ in range(200): 64 | env.render() 65 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/normalized_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.utils.serializable import Serializable 3 | from gym.spaces import Box 4 | from rand_param_envs.gym.spaces import Box as OldBox 5 | 6 | class NormalizedEnv(Serializable): 7 | """ 8 | Normalizes the environment class. 9 | 10 | Args: 11 | Env (gym.Env): class of the unnormalized gym environment 12 | scale_reward (float): scale of the reward 13 | normalize_obs (bool): whether normalize the observations or not 14 | normalize_reward (bool): whether normalize the reward or not 15 | obs_alpha (float): step size of the running mean and variance for the observations 16 | reward_alpha (float): step size of the running mean and variance for the observations 17 | normalization_scale (float): rescaled action magnitude 18 | 19 | """ 20 | def __init__(self, 21 | env, 22 | scale_reward=1., 23 | normalize_obs=False, 24 | normalize_reward=False, 25 | obs_alpha=0.001, 26 | reward_alpha=0.001, 27 | normalization_scale=10., 28 | ): 29 | Serializable.quick_init(self, locals()) 30 | 31 | self._scale_reward = 1 32 | self._wrapped_env = env 33 | 34 | self._normalize_obs = normalize_obs 35 | self._normalize_reward = normalize_reward 36 | self._obs_alpha = obs_alpha 37 | self._obs_mean = np.zeros(self.observation_space.shape) 38 | self._obs_var = np.ones(self.observation_space.shape) 39 | self._reward_alpha = reward_alpha 40 | self._reward_mean = 0. 41 | self._reward_var = 1. 
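# the observation and reward statistics above are running estimates, updated via exponential moving averages in _update_obs_estimate / _update_reward_estimate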
42 | self._normalization_scale = normalization_scale 43 | 44 | 45 | @property 46 | def action_space(self): 47 | if isinstance(self._wrapped_env.action_space, Box): 48 | ub = np.ones(self._wrapped_env.action_space.shape) * self._normalization_scale 49 | return Box(-1 * ub, ub, dtype=np.float32) 50 | return self._wrapped_env.action_space 51 | 52 | def __getattr__(self, attr): 53 | """ 54 | If normalized env does not have the attribute then call the attribute in the wrapped_env 55 | Args: 56 | attr: attribute to get 57 | 58 | Returns: 59 | attribute of the wrapped_env 60 | 61 | """ 62 | orig_attr = self._wrapped_env.__getattribute__(attr) 63 | 64 | if callable(orig_attr): 65 | def hooked(*args, **kwargs): 66 | result = orig_attr(*args, **kwargs) 67 | return result 68 | 69 | return hooked 70 | else: 71 | return orig_attr 72 | 73 | def _update_obs_estimate(self, obs): 74 | o_a = self._obs_alpha 75 | self._obs_mean = (1 - o_a) * self._obs_mean + o_a * obs 76 | self._obs_var = (1 - o_a) * self._obs_var + o_a * np.square(obs - self._obs_mean) 77 | 78 | def _update_reward_estimate(self, reward): 79 | r_a = self._reward_alpha 80 | self._reward_mean = (1 - r_a) * self._reward_mean + r_a * reward 81 | self._reward_var = (1 - r_a) * self._reward_var + r_a * np.square(reward - self._reward_mean) 82 | 83 | def _apply_normalize_obs(self, obs): 84 | self._update_obs_estimate(obs) 85 | return (obs - self._obs_mean) / (np.sqrt(self._obs_var) + 1e-8) 86 | 87 | def _apply_normalize_reward(self, reward): 88 | self._update_reward_estimate(reward) 89 | return reward / (np.sqrt(self._reward_var) + 1e-8) 90 | 91 | def reset(self): 92 | obs = self._wrapped_env.reset() 93 | if self._normalize_obs: 94 | return self._apply_normalize_obs(obs) 95 | else: 96 | return obs 97 | 98 | def __getstate__(self): 99 | d = Serializable.__getstate__(self) 100 | d["_obs_mean"] = self._obs_mean 101 | d["_obs_var"] = self._obs_var 102 | return d 103 | 104 | def __setstate__(self, d): 105 | Serializable.__setstate__(self, d) 106 | self._obs_mean = d["_obs_mean"] 107 | self._obs_var = d["_obs_var"] 108 | 109 | def step(self, action): 110 | if isinstance(self._wrapped_env.action_space, Box) or isinstance(self._wrapped_env.action_space, OldBox): 111 | # rescale the action 112 | lb, ub = self._wrapped_env.action_space.low, self._wrapped_env.action_space.high 113 | scaled_action = lb + (action + self._normalization_scale) * (ub - lb) / (2 * self._normalization_scale) 114 | scaled_action = np.clip(scaled_action, lb, ub) 115 | else: 116 | scaled_action = action 117 | wrapped_step = self._wrapped_env.step(scaled_action) 118 | next_obs, reward, done, info = wrapped_step 119 | if getattr(self, "_normalize_obs", False): 120 | next_obs = self._apply_normalize_obs(next_obs) 121 | if getattr(self, "_normalize_reward", False): 122 | reward = self._apply_normalize_reward(reward) 123 | return next_obs, reward * self._scale_reward, done, info 124 | 125 | 126 | normalize = NormalizedEnv -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/corner_goals_point_env_2d.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | 9 | def step(self, action): 10 | """ 11 | Run one timestep of the environment's dynamics. When end of episode 12 | is reached, reset() should be called to reset the environment's internal state. 
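Actions are clipped to [-0.1, 0.1] per dimension before being added to the 2D state.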
13 | 14 | Args: 15 | action : an action provided by the environment 16 | Returns: 17 | (observation, reward, done, info) 18 | observation : agent's observation of the current environment 19 | reward [Float] : amount of reward due to the previous action 20 | done : a boolean, indicating whether the episode has ended 21 | info : a dictionary containing other diagnostic information from the previous action 22 | """ 23 | prev_state = self._state 24 | self._state = prev_state + np.clip(action, -0.1, 0.1) 25 | reward = self.reward(prev_state, action, self._state) 26 | done = self.done(self._state) 27 | next_observation = np.copy(self._state) 28 | return next_observation, reward, done, {} 29 | 30 | def reset(self): 31 | """ 32 | Resets the state of the environment, returning an initial observation. 33 | Outputs 34 | ------- 35 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 36 | """ 37 | self._state = np.random.uniform(-2, 2, size=(2,)) 38 | observation = np.copy(self._state) 39 | return observation 40 | 41 | @property 42 | def observation_space(self): 43 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 44 | 45 | @property 46 | def action_space(self): 47 | return Box(low=-0.1, high=0.1, shape=(2,)) 48 | 49 | def done(self, obs): 50 | if obs.ndim == 1: 51 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 52 | elif obs.ndim == 2: 53 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 54 | 55 | def reward(self, obs, act, obs_next): 56 | if obs_next.ndim == 1: 57 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 58 | elif obs_next.ndim == 2: 59 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 60 | 61 | def log_diagnostics(self, paths): 62 | pass 63 | 64 | def sample_tasks(self, n_tasks): 65 | return [{}] * n_tasks 66 | 67 | def set_task(self, task): 68 | pass 69 | 70 | def get_task(self): 71 | return {} -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | 9 | def step(self, action): 10 | """ 11 | Run one timestep of the environment's dynamics. When end of episode 12 | is reached, reset() should be called to reset the environment's internal state. 13 | 14 | Args: 15 | action : an action provided by the environment 16 | Returns: 17 | (observation, reward, done, info) 18 | observation : agent's observation of the current environment 19 | reward [Float] : amount of reward due to the previous action 20 | done : a boolean, indicating whether the episode has ended 21 | info : a dictionary containing other diagnostic information from the previous action 22 | """ 23 | prev_state = self._state 24 | self._state = prev_state + np.clip(action, -0.1, 0.1) 25 | reward = self.reward(prev_state, action, self._state) 26 | done = self.done(self._state) 27 | next_observation = np.copy(self._state) 28 | return next_observation, reward, done, {} 29 | 30 | def reset(self): 31 | """ 32 | Resets the state of the environment, returning an initial observation. 33 | Outputs 34 | ------- 35 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 
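The initial state is drawn uniformly at random from [-2, 2]^2.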
36 | """ 37 | self._state = np.random.uniform(-2, 2, size=(2,)) 38 | observation = np.copy(self._state) 39 | return observation 40 | 41 | @property 42 | def observation_space(self): 43 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 44 | 45 | @property 46 | def action_space(self): 47 | return Box(low=-0.1, high=0.1, shape=(2,)) 48 | 49 | def done(self, obs): 50 | if obs.ndim == 1: 51 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 52 | elif obs.ndim == 2: 53 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 54 | 55 | def reward(self, obs, act, obs_next): 56 | if obs_next.ndim == 1: 57 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 58 | elif obs_next.ndim == 2: 59 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 60 | 61 | def log_diagnostics(self, paths): 62 | pass 63 | 64 | def sample_tasks(self, n_tasks): 65 | return [{}] * n_tasks 66 | 67 | def set_task(self, task): 68 | pass 69 | 70 | def get_task(self): 71 | return {} -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_corner.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvCorner(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='sparse', sparse_reward_radius=0.5): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(2,)) 20 | self.action_space = Box(low=-0.2, high=0.2, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state. 26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._state = prev_state + np.clip(action, -0.2, 0.2) 38 | reward = self.reward(prev_state, action, self._state) 39 | done = False # self.done(self._state) 40 | next_observation = np.copy(self._state) 41 | return next_observation, reward, done, {} 42 | 43 | def reset(self): 44 | """ 45 | Resets the state of the environment, returning an initial observation. 46 | Outputs 47 | ------- 48 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 
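The initial state is drawn uniformly at random from [-0.2, 0.2]^2, i.e. close to the origin and roughly equidistant from the four corner goals.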
49 | """ 50 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 51 | observation = np.copy(self._state) 52 | return observation 53 | 54 | def done(self, obs): 55 | if obs.ndim == 1: 56 | return self.done(np.array([obs])) 57 | elif obs.ndim == 2: 58 | goal_distance = np.linalg.norm(obs - self.goal[None,:], axis=1) 59 | return np.max(self._state) > 3 60 | 61 | def reward(self, obs, act, obs_next): 62 | if obs_next.ndim == 2: 63 | goal_distance = np.linalg.norm(obs_next - self.goal[None,:], axis=1)[0] 64 | if self.reward_type == 'dense': 65 | return - goal_distance 66 | elif self.reward_type == 'dense_squared': 67 | return - goal_distance**2 68 | elif self.reward_type == 'sparse': 69 | dist_from_start = np.linalg.norm(obs_next, ord=1, axis=1)[0] 70 | if dist_from_start < self.sparse_reward_radius: 71 | return 0 72 | dists = [np.linalg.norm(obs_next - corner[None, :], axis=1) for corner in self.corners] 73 | if np.min(goal_distance) == min(dists): 74 | return np.linalg.norm(obs - self.goal[None,:], axis=1)[0] - goal_distance 75 | return 0 76 | # return np.maximum(self.sparse_reward_radius - goal_distance, 0) 77 | 78 | elif obs_next.ndim == 1: 79 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 80 | else: 81 | raise NotImplementedError 82 | 83 | def log_diagnostics(self, *args): 84 | pass 85 | 86 | def sample_tasks(self, n_tasks): 87 | return [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 88 | 89 | def set_task(self, task): 90 | self.goal = task 91 | 92 | def get_task(self): 93 | return self.goal 94 | 95 | if __name__ == "__main__": 96 | env = MetaPointEnvCorner() 97 | task = env.sample_tasks(10) 98 | print(task[0]) 99 | while True: 100 | env.set_task(task[0]) 101 | env.reset() 102 | done = False 103 | i = 0 104 | t_r = 0 105 | while not done: 106 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 107 | t_r += reward 108 | i += 1 109 | if reward > 0: 110 | print(obs) 111 | break 112 | if i > 200: 113 | print(obs) 114 | break 115 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_momentum.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvMomentum(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='sparse', sparse_reward_radius=2): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(4,)) 20 | self.action_space = Box(low=-0.1, high=0.1, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state. 
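Here the action acts as an acceleration: it is clipped to [-0.1, 0.1], added to an internal velocity, and the velocity is then integrated into the position.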
26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._velocity += np.clip(action, -0.1, 0.1) 38 | self._state = prev_state + self._velocity 39 | reward = self.reward(prev_state, action, self._state) 40 | done = False # self.done(self._state) 41 | next_observation = np.hstack((self._state, self._velocity)) 42 | return next_observation, reward, done, {} 43 | 44 | def reset(self): 45 | """ 46 | Resets the state of the environment, returning an initial observation. 47 | Outputs 48 | ------- 49 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 50 | """ 51 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 52 | self._velocity = np.random.uniform(-0.1, 0.1, size=(2,)) 53 | observation = np.hstack((self._state, self._velocity)) 54 | return observation 55 | 56 | def done(self, obs): 57 | if obs.ndim == 1: 58 | return self.done(np.array([obs])) 59 | elif obs.ndim == 2: 60 | goal_distance = np.linalg.norm(obs[:2] - self.goal[None,:], axis=1) 61 | return np.max(self._state) > 3 62 | 63 | def reward(self, obs, act, obs_next): 64 | if obs_next.ndim == 2: 65 | goal_distance = np.linalg.norm(obs_next[:2] - self.goal[None,:], axis=1)[0] 66 | if self.reward_type == 'dense': 67 | return - goal_distance 68 | elif self.reward_type == 'dense_squared': 69 | return - goal_distance**2 70 | elif self.reward_type == 'sparse': 71 | return np.maximum(self.sparse_reward_radius - goal_distance, 0) 72 | 73 | elif obs_next.ndim == 1: 74 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 75 | else: 76 | raise NotImplementedError 77 | 78 | def log_diagnostics(self, *args): 79 | pass 80 | 81 | def sample_tasks(self, n_tasks): 82 | return [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 83 | 84 | def set_task(self, task): 85 | self.goal = task 86 | 87 | def get_task(self): 88 | return self.goal 89 | 90 | if __name__ == "__main__": 91 | env = MetaPointEnvMomentum() 92 | while True: 93 | task = env.sample_tasks(10) 94 | env.set_task(task[0]) 95 | env.reset() 96 | done = False 97 | i = 0 98 | t_r = 0 99 | while not done: 100 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 101 | t_r += reward 102 | i += 1 103 | if reward > 0: 104 | break 105 | if np.max(obs) > 300: 106 | break 107 | if i > 200: 108 | break 109 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_v2.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | def __init__(self): 9 | self.goal = np.random.uniform(-2, 2, size=(2,)) 10 | 11 | def step(self, action): 12 | """ 13 | Run one timestep of the environment's dynamics. When end of episode 14 | is reached, reset() should be called to reset the environment's internal state. 
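The reward is the negative Euclidean distance between the next state and the sampled goal.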
15 | 16 | Args: 17 | action : an action provided by the environment 18 | Returns: 19 | (observation, reward, done, info) 20 | observation : agent's observation of the current environment 21 | reward [Float] : amount of reward due to the previous action 22 | done : a boolean, indicating whether the episode has ended 23 | info : a dictionary containing other diagnostic information from the previous action 24 | """ 25 | prev_state = self._state 26 | self._state = prev_state + np.clip(action, -0.1, 0.1) 27 | reward = self.reward(prev_state, action, self._state) 28 | done = self.done(self._state) 29 | next_observation = np.copy(self._state) 30 | return next_observation, reward, done, {} 31 | 32 | 33 | def reset(self): 34 | """ 35 | Resets the state of the environment, returning an initial observation. 36 | Outputs 37 | ------- 38 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 39 | """ 40 | self._state = np.zeros(2) 41 | observation = np.copy(self._state) 42 | return observation 43 | 44 | @property 45 | def observation_space(self): 46 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 47 | 48 | @property 49 | def action_space(self): 50 | return Box(low=-0.1, high=0.1, shape=(2,)) 51 | 52 | def done(self, obs): 53 | if obs.ndim == 1: 54 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 55 | elif obs.ndim == 2: 56 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 57 | 58 | def reward(self, obs, act, obs_next): 59 | return - np.sqrt(np.sum((self.goal - obs_next) ** 2)) 60 | # if obs_next.ndim == 1: 61 | # return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 62 | # elif obs_next.ndim == 2: 63 | # return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 64 | 65 | def log_diagnostics(self, paths): 66 | pass 67 | 68 | def sample_tasks(self, n_tasks): 69 | return np.random.uniform(-2, 2, size=(n_tasks, 2)) 70 | 71 | def set_task(self, task): 72 | self.goal = task 73 | 74 | def get_task(self): 75 | return self.goal -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_walls.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvWalls(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='dense', sparse_reward_radius=2): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(2,)) 20 | self.action_space = Box(low=-0.2, high=0.2, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state.
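After the position update, the agent is pushed back onto the circular walls of radius 1 and 2 unless it crosses them within distance 1 of the task-specific gap points gap_1 / gap_2.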
26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._state = prev_state + np.clip(action, -0.2, 0.2) 38 | reward = self.reward(prev_state, action, self._state) 39 | done = False # self.done(self._state) 40 | if np.linalg.norm(prev_state) < 1 and np.linalg.norm(self._state) > 1: 41 | gap_1_dist = np.linalg.norm(self._state - self.gap_1[None,:], axis=1)[0] 42 | if gap_1_dist > 1: 43 | self._state = self._state / (np.linalg.norm(self._state) + 1e-6) 44 | assert gap_1_dist < 1 or np.linalg.norm(self._state) < 1 45 | elif np.linalg.norm(prev_state) < 2 and np.linalg.norm(self._state) > 2: 46 | gap_2_dist = np.linalg.norm(self._state - self.gap_2[None,:], axis=1)[0] 47 | if gap_2_dist > 1: 48 | self._state = self._state / (np.linalg.norm(self._state) * 0.5 + 1e-6) 49 | assert gap_2_dist < 1 or np.linalg.norm(self._state) < 2 50 | next_observation = np.copy(self._state) 51 | return next_observation, reward, done, {} 52 | 53 | def reset(self): 54 | """ 55 | Resets the state of the environment, returning an initial observation. 56 | Outputs 57 | ------- 58 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 59 | """ 60 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 61 | observation = np.copy(self._state) 62 | return observation 63 | 64 | def done(self, obs): 65 | if obs.ndim == 1: 66 | return self.done(np.array([obs])) 67 | elif obs.ndim == 2: 68 | goal_distance = np.linalg.norm(obs - self.goal[None,:], axis=1) 69 | return np.max(self._state) > 3 70 | 71 | def reward(self, obs, act, obs_next): 72 | if obs_next.ndim == 2: 73 | goal_distance = np.linalg.norm(obs_next - self.goal[None,:], axis=1)[0] 74 | if self.reward_type == 'dense': 75 | return - goal_distance 76 | elif self.reward_type == 'dense_squared': 77 | return - goal_distance**2 78 | elif self.reward_type == 'sparse': 79 | if goal_distance < self.sparse_reward_radius: 80 | return np.linalg.norm(obs - self.goal[None,:], axis=1)[0] - goal_distance 81 | else: 82 | return 83 | # return np.maximum(self.sparse_reward_radius - goal_distance, 0) 84 | 85 | elif obs_next.ndim == 1: 86 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 87 | else: 88 | raise NotImplementedError 89 | 90 | def log_diagnostics(self, *args): 91 | pass 92 | 93 | def sample_tasks(self, n_tasks): 94 | goals = [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 95 | gaps_1 = np.random.normal(size=(n_tasks, 2)) 96 | gaps_1 /= np.linalg.norm(gaps_1, axis=1)[..., np.newaxis] 97 | gaps_2 = np.random.normal(size=(n_tasks, 2)) 98 | gaps_2 /= (np.linalg.norm(gaps_2, axis=1) / 2)[..., np.newaxis] 99 | return [dict(goal=goal, gap_1=gap_1, gap_2=gap_2) for goal, gap_1, gap_2 in zip(goals, gaps_1, gaps_2)] 100 | 101 | def set_task(self, task): 102 | self.goal = task['goal'] 103 | self.gap_1 = task['gap_1'] 104 | self.gap_2 = task['gap_2'] 105 | 106 | def get_task(self): 107 | return dict(goal=self.goal, gap_1=self.gap_1, gap_2=self.gap_2) 108 | 109 | if __name__ == "__main__": 110 | env = MetaPointEnvWalls() 111 | while True: 112 | task = env.sample_tasks(10) 113 | 
env.set_task(task[0]) 114 | env.reset() 115 | done = False 116 | i = 0 117 | t_r = 0 118 | while not done: 119 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 120 | t_r += reward 121 | i += 1 122 | if reward > 0: 123 | break 124 | if np.max(obs) > 300: 125 | break 126 | if i > 200: 127 | break 128 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_door.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 46 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 47 | 48 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 49 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 50 | 51 | if __name__ == "__main__": 52 | env = SawyerPushEnv() 53 | while True: 54 | task = env.sample_tasks(1)[0] 55 | env.set_task(task) 56 | env.reset() 57 | for _ in range(500): 58 | env.render() 59 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_pick_and_place.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPickAndPlaceEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPickAndPlaceEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 
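(forwarded to the wrapped multiworld environment via set_goal)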
24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_rew = [path["env_infos"]['reachRew'] for path in paths] 46 | pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths] 47 | place_rew = [path["env_infos"]['placeRew'] for path in paths] 48 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 49 | placing_dist = [path["env_infos"]['placingDist'] for path in paths] 50 | 51 | logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew)) 52 | logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew)) 53 | logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew)) 54 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 55 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 56 | 57 | if __name__ == "__main__": 58 | env = SawyerPickAndPlaceEnv() 59 | while True: 60 | task = env.sample_tasks(1)[0] 61 | env.set_task(task) 62 | env.reset() 63 | for _ in range(500): 64 | SawyerEnv.render(env) 65 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_push.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 46 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 47 | 48 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 49 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 50 | 51 | if __name__ == "__main__": 52 | env = SawyerPushEnv() 53 | while True: 54 | task = env.sample_tasks(1)[0] 55 | env.set_task(task) 56 | env.reset() 57 | for _ in range(500): 58 | env.render() 59 | _, reward, _, _ = env.step(env.action_space.sample()) # 
take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_push_simple.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push_simple import SawyerPushSimpleEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushSimpleEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushSimpleEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv( 15 | obj_low=(-0.0, 0.5, 0.02), 16 | obj_high=(0.0, 0.5, 0.02), 17 | goal_low=(-0.2, 0.6, 0.02), 18 | goal_high=(0.2, 0.8, 0.02), 19 | rew_mode='posPlace', 20 | *args, **kwargs) 21 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 22 | 23 | def sample_tasks(self, n_tasks): 24 | return self.sample_goals(n_tasks) 25 | 26 | def set_task(self, task): 27 | """ 28 | Args: 29 | task: task of the meta-learning environment 30 | """ 31 | return self.set_goal(task) 32 | 33 | def get_task(self): 34 | """ 35 | Returns: 36 | task: task of the meta-learning environment 37 | """ 38 | return self.get_goal() 39 | 40 | def log_diagnostics(self, paths, prefix=''): 41 | self.get_diagnostics(paths) 42 | 43 | @property 44 | def action_space(self): 45 | return FlatGoalEnv.action_space(self) 46 | 47 | def render(self): 48 | SawyerEnv.render(self) 49 | 50 | def log_diagnostics(self, paths, prefix=''): 51 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 52 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 53 | cos_dist = [path["env_infos"]['cosDist'] for path in paths] 54 | 55 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 56 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 57 | logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist)) -------------------------------------------------------------------------------- /meta_policy_search/meta_algos/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.meta_algos.base import MAMLAlgo 2 | from meta_policy_search.meta_algos.dice_maml import DICEMAML 3 | from meta_policy_search.meta_algos.pro_mp import ProMP 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_algos.vpg_maml import VPGMAML 6 | from meta_policy_search.meta_algos.vpg_dice_maml import VPG_DICEMAML -------------------------------------------------------------------------------- /meta_policy_search/meta_algos/vpg_dice_maml.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.meta_algos.dice_maml import DICEMAML 2 | 3 | import tensorflow as tf 4 | from collections import OrderedDict 5 | 6 | 7 | class VPG_DICEMAML(DICEMAML): 8 | """ 9 | Algorithm for DICE VPG MAML 10 | 11 | Args: 12 | max_path_length (int): maximum path length 13 | policy (Policy) : policy object 14 | name (str): tf variable scope 15 | learning_rate (float): learning rate for the meta-objective 16 | inner_lr (float) : gradient step size used for inner step 17 | meta_batch_size (int): number of meta-learning tasks 18 | num_inner_grad_steps (int) : number of 
gradient updates taken per maml iteration 19 | trainable_inner_step_size (boolean): whether make the inner step size a trainable variable 20 | """ 21 | def __init__( 22 | self, 23 | max_path_length, 24 | *args, 25 | name="vpg_dice_maml", 26 | **kwargs 27 | ): 28 | super(VPG_DICEMAML, self).__init__(max_path_length, *args, **kwargs) 29 | 30 | self._optimization_keys = ['observations', 'actions', 'advantages', 'adjusted_rewards', 'mask', 'agent_infos'] 31 | self.name = name 32 | 33 | self.build_graph() 34 | 35 | def build_graph(self): 36 | """ 37 | Creates the computation graph for DICE MAML 38 | """ 39 | 40 | """ Build graph for sampling """ 41 | with tf.variable_scope(self.name + '_sampling'): 42 | self.step_sizes = self._create_step_size_vars() 43 | 44 | """ --- Build inner update graph for adapting the policy and sampling trajectories --- """ 45 | # this graph is only used for adapting the policy and not computing the meta-updates 46 | self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption() 47 | 48 | 49 | """ Build graph for meta-update """ 50 | meta_update_scope = tf.variable_scope(self.name + '_meta_update') 51 | 52 | with meta_update_scope: 53 | obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders('step0') 54 | self.meta_op_phs_dict = OrderedDict(all_phs_dict) 55 | 56 | distribution_info_vars, current_policy_params, all_surr_objs = [], [], [] 57 | 58 | for i in range(self.meta_batch_size): 59 | obs_stacked = self._reshape_obs_phs(obs_phs[i]) 60 | dist_info_sym = self.policy.distribution_info_sym(obs_stacked, params=None) 61 | distribution_info_vars.append(dist_info_sym) # step 0 62 | current_policy_params.append(self.policy.policy_params) # set to real policy_params (tf.Variable) 63 | 64 | with meta_update_scope: 65 | """ Inner updates""" 66 | for step_id in range(1, self.num_inner_grad_steps+1): 67 | with tf.variable_scope("inner_update_%i"%step_id): 68 | surr_objs, adapted_policy_params = [], [] 69 | 70 | # inner adaptation step for each task 71 | for i in range(self.meta_batch_size): 72 | action_stacked = self._reshape_action_phs(action_phs[i]) 73 | surr_loss = self._adapt_objective_sym(action_stacked, adj_reward_phs[i], mask_phs[i], distribution_info_vars[i]) 74 | 75 | adapted_params_var = self._adapt_sym(surr_loss, current_policy_params[i]) 76 | 77 | adapted_policy_params.append(adapted_params_var) 78 | surr_objs.append(surr_loss) 79 | 80 | all_surr_objs.append(surr_objs) 81 | # Create new placeholders for the next step 82 | obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders('step%i' % step_id) 83 | self.meta_op_phs_dict.update(all_phs_dict) 84 | 85 | # dist_info_vars_for_next_step 86 | distribution_info_vars = [] 87 | for i in range(self.meta_batch_size): 88 | obs_stacked = self._reshape_obs_phs(obs_phs[i]) 89 | distribution_info_vars.append(self.policy.distribution_info_sym(obs_stacked, params=adapted_policy_params[i])) 90 | 91 | current_policy_params = adapted_policy_params 92 | 93 | """ Outer (meta-)objective """ 94 | with tf.variable_scope("outer_update"): 95 | adv_phs, phs_dict = self._make_advantage_phs('step%i' % self.num_inner_grad_steps) 96 | self.meta_op_phs_dict.update(phs_dict) 97 | 98 | surr_objs = [] 99 | 100 | # meta-objective 101 | for i in range(self.meta_batch_size): 102 | action_stacked = self._reshape_action_phs(action_phs[i]) 103 | log_likelihood = 
self.policy.distribution.log_likelihood_sym(action_stacked, distribution_info_vars[i]) 104 | log_likelihood = tf.reshape(log_likelihood, tf.shape(mask_phs[i])) 105 | surr_obj = - tf.reduce_mean(log_likelihood * adv_phs[i] * mask_phs[i]) 106 | surr_objs.append(surr_obj) 107 | 108 | """ Mean over meta tasks """ 109 | meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0)) 110 | 111 | self.optimizer.build_graph( 112 | loss=meta_objective, 113 | target=self.policy, 114 | input_ph_dict=self.meta_op_phs_dict, 115 | ) 116 | 117 | def _make_advantage_phs(self, prefix=''): 118 | adv_phs = [] 119 | all_phs_dict = OrderedDict() 120 | 121 | for task_id in range(self.meta_batch_size): 122 | # advantage ph 123 | ph = tf.placeholder(dtype=tf.float32, shape=[None, self.max_path_length], name='advantage' + '_' + prefix + '_' + str(task_id)) 124 | all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'advantages')] = ph 125 | adv_phs.append(ph) 126 | 127 | return adv_phs, all_phs_dict 128 | -------------------------------------------------------------------------------- /meta_policy_search/meta_trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import time 4 | from meta_policy_search.utils import logger 5 | 6 | 7 | class Trainer(object): 8 | """ 9 | Performs steps of meta-policy search. 10 | 11 | Pseudocode:: 12 | 13 | for iter in n_iter: 14 | sample tasks 15 | for task in tasks: 16 | for adapt_step in num_inner_grad_steps 17 | sample trajectories with policy 18 | perform update/adaptation step 19 | sample trajectories with post-update policy 20 | perform meta-policy gradient step(s) 21 | 22 | Args: 23 | algo (Algo) : 24 | env (Env) : 25 | sampler (Sampler) : 26 | sample_processor (SampleProcessor) : 27 | baseline (Baseline) : 28 | policy (Policy) : 29 | n_itr (int) : Number of iterations to train for 30 | start_itr (int) : Number of iterations policy has already trained for, if reloading 31 | num_inner_grad_steps (int) : Number of inner steps per maml iteration 32 | sess (tf.Session) : current tf session (if we loaded policy, for example) 33 | """ 34 | def __init__( 35 | self, 36 | algo, 37 | env, 38 | sampler, 39 | sample_processor, 40 | policy, 41 | n_itr, 42 | start_itr=0, 43 | num_inner_grad_steps=1, 44 | sess=None, 45 | ): 46 | self.algo = algo 47 | self.env = env 48 | self.sampler = sampler 49 | self.sample_processor = sample_processor 50 | self.baseline = sample_processor.baseline 51 | self.policy = policy 52 | self.n_itr = n_itr 53 | self.start_itr = start_itr 54 | self.num_inner_grad_steps = num_inner_grad_steps 55 | if sess is None: 56 | sess = tf.Session() 57 | self.sess = sess 58 | 59 | def train(self): 60 | """ 61 | Trains policy on env using algo 62 | 63 | Pseudocode:: 64 | 65 | for itr in n_itr: 66 | for step in num_inner_grad_steps: 67 | sampler.sample() 68 | algo.compute_updated_dists() 69 | algo.optimize_policy() 70 | sampler.update_goals() 71 | """ 72 | with self.sess.as_default() as sess: 73 | 74 | # initialize uninitialized vars (only initialize vars that were not loaded) 75 | uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] 76 | sess.run(tf.variables_initializer(uninit_vars)) 77 | 78 | start_time = time.time() 79 | for itr in range(self.start_itr, self.n_itr): 80 | itr_start_time = time.time() 81 | logger.log("\n ---------------- Iteration %d ----------------" % itr) 82 | logger.log("Sampling set of tasks/goals for this meta-batch...") 83 | 84 
| self.sampler.update_tasks() 85 | self.policy.switch_to_pre_update() # Switch to pre-update policy 86 | 87 | all_samples_data, all_paths = [], [] 88 | list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] 89 | start_total_inner_time = time.time() 90 | for step in range(self.num_inner_grad_steps+1): 91 | logger.log('** Step ' + str(step) + ' **') 92 | 93 | """ -------------------- Sampling --------------------------""" 94 | 95 | logger.log("Obtaining samples...") 96 | time_env_sampling_start = time.time() 97 | paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) 98 | list_sampling_time.append(time.time() - time_env_sampling_start) 99 | all_paths.append(paths) 100 | 101 | """ ----------------- Processing Samples ---------------------""" 102 | 103 | logger.log("Processing samples...") 104 | time_proc_samples_start = time.time() 105 | samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step) 106 | all_samples_data.append(samples_data) 107 | list_proc_samples_time.append(time.time() - time_proc_samples_start) 108 | 109 | self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) 110 | 111 | """ ------------------- Inner Policy Update --------------------""" 112 | 113 | time_inner_step_start = time.time() 114 | if step < self.num_inner_grad_steps: 115 | logger.log("Computing inner policy updates...") 116 | self.algo._adapt(samples_data) 117 | # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', 118 | # sess.graph) 119 | list_inner_step_time.append(time.time() - time_inner_step_start) 120 | total_inner_time = time.time() - start_total_inner_time 121 | 122 | time_maml_opt_start = time.time() 123 | """ ------------------ Outer Policy Update ---------------------""" 124 | 125 | logger.log("Optimizing policy...") 126 | # This needs to take all samples_data so that it can construct graph for meta-optimization. 
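                # all_samples_data is a list of length num_inner_grad_steps + 1: entry k holds the
                # processed per-task sample dicts collected with the policy after k inner adaptation
                # steps, which is exactly the data the meta-objective graph is fed with.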
127 | time_outer_step_start = time.time() 128 | self.algo.optimize_policy(all_samples_data) 129 | 130 | """ ------------------- Logging Stuff --------------------------""" 131 | logger.logkv('Itr', itr) 132 | logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) 133 | 134 | logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) 135 | logger.logkv('Time-TotalInner', total_inner_time) 136 | logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) 137 | logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) 138 | logger.logkv('Time-Sampling', np.sum(list_sampling_time)) 139 | 140 | logger.logkv('Time', time.time() - start_time) 141 | logger.logkv('ItrTime', time.time() - itr_start_time) 142 | logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) 143 | 144 | logger.log("Saving snapshot...") 145 | params = self.get_itr_snapshot(itr) 146 | logger.save_itr_params(itr, params) 147 | logger.log("Saved") 148 | 149 | logger.dumpkvs() 150 | 151 | logger.log("Training finished") 152 | self.sess.close() 153 | 154 | def get_itr_snapshot(self, itr): 155 | """ 156 | Gets the current policy and env for storage 157 | """ 158 | return dict(itr=itr, policy=self.policy, env=self.env, baseline=self.baseline) 159 | 160 | def log_diagnostics(self, paths, prefix): 161 | # TODO: we aren't using it so far 162 | self.env.log_diagnostics(paths, prefix) 163 | self.policy.log_diagnostics(paths, prefix) 164 | self.baseline.log_diagnostics(paths, prefix) 165 | -------------------------------------------------------------------------------- /meta_policy_search/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.optimizers.base import Optimizer 2 | from meta_policy_search.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer -------------------------------------------------------------------------------- /meta_policy_search/optimizers/base.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search import utils 2 | 3 | class Optimizer(object): 4 | def __init__(self): 5 | self._input_ph_dict = None 6 | 7 | def build_graph(self, loss, target, input_ph_dict): 8 | """ 9 | Sets the objective function and target weights for the optimize function 10 | 11 | Args: 12 | loss (tf_op) : minimization objective 13 | target (Policy) : Policy whose values we are optimizing over 14 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 15 | """ 16 | raise NotImplementedError 17 | 18 | def optimize(self, input_val_dict): 19 | """ 20 | Carries out the optimization step 21 | 22 | Args: 23 | input_val_dict (dict): dict containing the values to be fed into the computation graph 24 | 25 | """ 26 | raise NotImplementedError 27 | 28 | def loss(self, input_val_dict): 29 | """ 30 | Computes the value of the loss for given inputs 31 | 32 | Args: 33 | input_val_dict (dict): dict containing the values to be fed into the computation graph 34 | 35 | Returns: 36 | (float): value of the loss 37 | 38 | """ 39 | raise NotImplementedError 40 | 41 | def create_feed_dict(self, input_val_dict): 42 | return utils.create_feed_dict(placeholder_dict=self._input_ph_dict, value_dict=input_val_dict) 43 | -------------------------------------------------------------------------------- 
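# Illustrative usage sketch of the Optimizer interface defined above (not part of the original
# code base). `ToyTarget` and the quadratic loss are made up purely to show the
# build_graph -> optimize -> loss call sequence; in the repository the target is a Policy and
# the loss is the MAML/ProMP meta-objective. Assumes the package and TensorFlow 1.x are importable.

import numpy as np
import tensorflow as tf
from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer


class ToyTarget(object):
    """Minimal stand-in exposing the get_params() method that build_graph() expects."""
    def __init__(self):
        self.w = tf.Variable([1.0, -2.0], dtype=tf.float32, name='toy_w')

    def get_params(self):
        return [self.w]


if __name__ == '__main__':
    x_ph = tf.placeholder(tf.float32, shape=[None, 2], name='x')               # single input placeholder
    target = ToyTarget()
    loss = tf.reduce_mean(tf.square(tf.reduce_sum(x_ph * target.w, axis=-1)))  # toy quadratic loss

    optimizer = MAMLFirstOrderOptimizer(learning_rate=1e-2)
    optimizer.build_graph(loss=loss, target=target, input_ph_dict={'x': x_ph})

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch = {'x': np.random.randn(32, 2)}                                  # keys must match input_ph_dict
        print('loss before step:', optimizer.optimize(input_val_dict=batch))
        print('loss after step: ', optimizer.loss(input_val_dict=batch))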
/meta_policy_search/optimizers/maml_first_order_optimizer.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.utils import logger 2 | from meta_policy_search.optimizers.base import Optimizer 3 | import tensorflow as tf 4 | 5 | class MAMLFirstOrderOptimizer(Optimizer): 6 | """ 7 | Optimizer for first order methods (SGD, Adam) 8 | 9 | Args: 10 | tf_optimizer_cls (tf.train.optimizer): desired tensorflow optimzier for training 11 | tf_optimizer_args (dict or None): arguments for the optimizer 12 | learning_rate (float): learning rate 13 | max_epochs: number of maximum epochs for training 14 | tolerance (float): tolerance for early stopping. If the loss fucntion decreases less than the specified tolerance 15 | after an epoch, then the training stops. 16 | num_minibatches (int): number of mini-batches for performing the gradient step. The mini-batch size is 17 | batch size//num_minibatches. 18 | verbose (bool): Whether to log or not the optimization process 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | tf_optimizer_cls=tf.train.AdamOptimizer, 25 | tf_optimizer_args=None, 26 | learning_rate=1e-3, 27 | max_epochs=1, 28 | tolerance=1e-6, 29 | num_minibatches=1, 30 | verbose=False 31 | ): 32 | 33 | self._target = None 34 | if tf_optimizer_args is None: 35 | tf_optimizer_args = dict() 36 | tf_optimizer_args['learning_rate'] = learning_rate 37 | 38 | self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args) 39 | self._max_epochs = max_epochs 40 | self._tolerance = tolerance 41 | self._num_minibatches = num_minibatches # Unused 42 | self._verbose = verbose 43 | self._all_inputs = None 44 | self._train_op = None 45 | self._loss = None 46 | self._input_ph_dict = None 47 | 48 | def build_graph(self, loss, target, input_ph_dict): 49 | """ 50 | Sets the objective function and target weights for the optimize function 51 | 52 | Args: 53 | loss (tf_op) : minimization objective 54 | target (Policy) : Policy whose values we are optimizing over 55 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 56 | """ 57 | assert isinstance(loss, tf.Tensor) 58 | assert hasattr(target, 'get_params') 59 | assert isinstance(input_ph_dict, dict) 60 | 61 | self._target = target 62 | self._input_ph_dict = input_ph_dict 63 | self._loss = loss 64 | self._train_op = self._tf_optimizer.minimize(loss, var_list=target.get_params()) 65 | 66 | def loss(self, input_val_dict): 67 | """ 68 | Computes the value of the loss for given inputs 69 | 70 | Args: 71 | input_val_dict (dict): dict containing the values to be fed into the computation graph 72 | 73 | Returns: 74 | (float): value of the loss 75 | 76 | """ 77 | sess = tf.get_default_session() 78 | feed_dict = self.create_feed_dict(input_val_dict) 79 | loss = sess.run(self._loss, feed_dict=feed_dict) 80 | return loss 81 | 82 | def optimize(self, input_val_dict): 83 | """ 84 | Carries out the optimization step 85 | 86 | Args: 87 | input_val_dict (dict): dict containing the values to be fed into the computation graph 88 | 89 | Returns: 90 | (float) loss before optimization 91 | 92 | """ 93 | 94 | sess = tf.get_default_session() 95 | feed_dict = self.create_feed_dict(input_val_dict) 96 | 97 | # Overload self._batch size 98 | # dataset = MAMLBatchDataset(inputs, num_batches=self._batch_size, extra_inputs=extra_inputs, meta_batch_size=self.meta_batch_size, num_grad_updates=self.num_grad_updates) 99 | # Todo: reimplement minibatches 100 | 101 | loss_before_opt = None 102 | 
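        # Note: only the loss measured before the first gradient step is recorded and returned;
        # the tolerance-based early stopping and the minibatching mentioned in the constructor
        # docstring are currently disabled (the block below is commented out and the
        # _num_minibatches attribute is unused).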
for epoch in range(self._max_epochs): 103 | if self._verbose: 104 | logger.log("Epoch %d" % epoch) 105 | 106 | loss, _ = sess.run([self._loss, self._train_op], feed_dict) 107 | if not loss_before_opt: loss_before_opt = loss 108 | 109 | # if self._verbose: 110 | # logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss)) 111 | # 112 | # if abs(last_loss - new_loss) < self._tolerance: 113 | # break 114 | # last_loss = new_loss 115 | return loss_before_opt 116 | 117 | 118 | class MAMLPPOOptimizer(MAMLFirstOrderOptimizer): 119 | """ 120 | Adds inner and outer kl terms to first order optimizer #TODO: (Do we really need this?) 121 | 122 | """ 123 | def __init__(self, *args, **kwargs): 124 | # Todo: reimplement minibatches 125 | super(MAMLPPOOptimizer, self).__init__(*args, **kwargs) 126 | self._inner_kl = None 127 | self._outer_kl = None 128 | 129 | def build_graph(self, loss, target, input_ph_dict, inner_kl=None, outer_kl=None): 130 | """ 131 | Sets the objective function and target weights for the optimize function 132 | 133 | Args: 134 | loss (tf.Tensor) : minimization objective 135 | target (Policy) : Policy whose values we are optimizing over 136 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 137 | inner_kl (list): list with the inner kl loss for each task 138 | outer_kl (list): list with the outer kl loss for each task 139 | """ 140 | super(MAMLPPOOptimizer, self).build_graph(loss, target, input_ph_dict) 141 | assert inner_kl is not None 142 | 143 | self._inner_kl = inner_kl 144 | self._outer_kl = outer_kl 145 | 146 | def compute_stats(self, input_val_dict): 147 | """ 148 | Computes the value the loss, the outer KL and the inner KL-divergence between the current policy and the 149 | provided dist_info_data 150 | 151 | Args: 152 | inputs (list): inputs needed to compute the inner KL 153 | extra_inputs (list): additional inputs needed to compute the inner KL 154 | 155 | Returns: 156 | (float): value of the loss 157 | (ndarray): inner kls - numpy array of shape (num_inner_grad_steps,) 158 | (float): outer_kl 159 | """ 160 | sess = tf.get_default_session() 161 | feed_dict = self.create_feed_dict(input_val_dict) 162 | loss, inner_kl, outer_kl = sess.run([self._loss, self._inner_kl, self._outer_kl], feed_dict=feed_dict) 163 | return loss, inner_kl, outer_kl 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /meta_policy_search/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.policies.base import MetaPolicy 2 | from meta_policy_search.policies.base import Policy 3 | from meta_policy_search.policies.gaussian_mlp_policy import GaussianMLPPolicy 4 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/meta_policy_search/policies/distributions/__init__.py -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/base.py: -------------------------------------------------------------------------------- 1 | class Distribution(object): 2 | """ 3 | General methods for a generic distribution 4 | """ 5 | @property 
6 | def dim(self): 7 | raise NotImplementedError 8 | 9 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 10 | """ 11 | Symbolic KL divergence of two distributions 12 | 13 | Args: 14 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 15 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 16 | 17 | Returns: 18 | (tf.Tensor) : Symbolic representation of kl divergence (tensorflow op) 19 | """ 20 | raise NotImplementedError 21 | 22 | def kl(self, old_dist_info, new_dist_info): 23 | """ 24 | Compute the KL divergence of two distributions 25 | 26 | Args: 27 | old_dist_info (dict): dict of old distribution parameters as numpy array 28 | new_dist_info (dict): dict of new distribution parameters as numpy array 29 | 30 | Returns: 31 | (numpy array): kl divergence of distributions 32 | """ 33 | raise NotImplementedError 34 | 35 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 36 | """ 37 | Symbolic likelihood ratio p_new(x)/p_old(x) of two distributions 38 | 39 | Args: 40 | x_var (tf.Tensor): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 41 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 42 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 43 | 44 | Returns: 45 | (tf.Tensor): likelihood ratio 46 | """ 47 | raise NotImplementedError 48 | 49 | def likelihood_ratio(self, x_var, old_dist_info, new_dist_info): 50 | """ 51 | Compute the likelihood ratio p_new(x)/p_old(x) of two distributions 52 | 53 | Args: 54 | x_var (numpy array): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 55 | old_dist_info_vars (dict) : dict of old distribution parameters as numpy array 56 | new_dist_info_vars (dict) : dict of new distribution parameters as numpy array 57 | 58 | Returns: 59 | (numpy array): likelihood ratio 60 | """ 61 | raise NotImplementedError 62 | 63 | def entropy_sym(self, dist_info_vars): 64 | """ 65 | Symbolic entropy of the distribution 66 | 67 | Args: 68 | dist_info (dict) : dict of distribution parameters as tf.Tensor 69 | 70 | Returns: 71 | (tf.Tensor): entropy 72 | """ 73 | raise NotImplementedError 74 | 75 | def entropy(self, dist_info): 76 | """ 77 | Compute the entropy of the distribution 78 | 79 | Args: 80 | dist_info (dict) : dict of distribution parameters as numpy array 81 | 82 | Returns: 83 | (numpy array): entropy 84 | """ 85 | raise NotImplementedError 86 | 87 | def log_likelihood_sym(self, x_var, dist_info_vars): 88 | """ 89 | Symbolic log likelihood log p(x) of the distribution 90 | 91 | Args: 92 | x_var (tf.Tensor): variable where to evaluate the log likelihood 93 | dist_info_vars (dict) : dict of distribution parameters as tf.Tensor 94 | 95 | Returns: 96 | (numpy array): log likelihood 97 | """ 98 | raise NotImplementedError 99 | 100 | def log_likelihood(self, xs, dist_info): 101 | """ 102 | Compute the log likelihood log p(x) of the distribution 103 | 104 | Args: 105 | x_var (numpy array): variable where to evaluate the log likelihood 106 | dist_info_vars (dict) : dict of distribution parameters as numpy array 107 | 108 | Returns: 109 | (numpy array): log likelihood 110 | """ 111 | raise NotImplementedError 112 | 113 | def sample(self, dist_info): 114 | """ 115 | Draws a sample from the distribution 116 | 117 | Args: 118 | dist_info (dict) : dict of distribution parameter instantiations as numpy array 119 | 120 | Returns: 121 | (obj): sample drawn from the corresponding instantiation 122 | """ 123 
| raise NotImplementedError 124 | 125 | @property 126 | def dist_info_specs(self): 127 | raise NotImplementedError 128 | 129 | @property 130 | def dist_info_keys(self): 131 | return [k for k, _ in self.dist_info_specs] 132 | -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from meta_policy_search.policies.distributions.base import Distribution 4 | 5 | class DiagonalGaussian(Distribution): 6 | """ 7 | General methods for a diagonal gaussian distribution of this size 8 | """ 9 | def __init__(self, dim): 10 | self._dim = dim 11 | 12 | @property 13 | def dim(self): 14 | return self._dim 15 | 16 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 17 | """ 18 | Computes the symbolic representation of the KL divergence of two multivariate 19 | Gaussian distribution with diagonal covariance matrices 20 | 21 | Args: 22 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 23 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 24 | 25 | Returns: 26 | (tf.Tensor) : Symbolic representation of kl divergence (tensorflow op) 27 | """ 28 | old_means = old_dist_info_vars["mean"] 29 | old_log_stds = old_dist_info_vars["log_std"] 30 | new_means = new_dist_info_vars["mean"] 31 | new_log_stds = new_dist_info_vars["log_std"] 32 | 33 | # assert ranks 34 | tf.assert_rank(old_means, 2), tf.assert_rank(old_log_stds, 2) 35 | tf.assert_rank(new_means, 2), tf.assert_rank(new_log_stds, 2) 36 | 37 | old_std = tf.exp(old_log_stds) 38 | new_std = tf.exp(new_log_stds) 39 | 40 | numerator = tf.square(old_means - new_means) + \ 41 | tf.square(old_std) - tf.square(new_std) 42 | denominator = 2 * tf.square(new_std) + 1e-8 43 | return tf.reduce_sum( 44 | numerator / denominator + new_log_stds - old_log_stds, reduction_indices=-1) 45 | 46 | def kl(self, old_dist_info, new_dist_info): 47 | """ 48 | Compute the KL divergence of two multivariate Gaussian distribution with 49 | diagonal covariance matrices 50 | 51 | Args: 52 | old_dist_info (dict): dict of old distribution parameters as numpy array 53 | new_dist_info (dict): dict of new distribution parameters as numpy array 54 | 55 | Returns: 56 | (numpy array): kl divergence of distributions 57 | """ 58 | old_means = old_dist_info["mean"] 59 | old_log_stds = old_dist_info["log_std"] 60 | new_means = new_dist_info["mean"] 61 | new_log_stds = new_dist_info["log_std"] 62 | 63 | old_std = np.exp(old_log_stds) 64 | new_std = np.exp(new_log_stds) 65 | numerator = np.square(old_means - new_means) + \ 66 | np.square(old_std) - np.square(new_std) 67 | denominator = 2 * np.square(new_std) + 1e-8 68 | return np.sum( 69 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 70 | 71 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 72 | """ 73 | Symbolic likelihood ratio p_new(x)/p_old(x) of two distributions 74 | 75 | Args: 76 | x_var (tf.Tensor): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 77 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 78 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 79 | 80 | Returns: 81 | (tf.Tensor): likelihood ratio 82 | """ 83 | with tf.variable_scope("log_li_new"): 84 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 85 | with 
tf.variable_scope("log_li_old"): 86 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 87 | return tf.exp(logli_new - logli_old) 88 | 89 | def log_likelihood_sym(self, x_var, dist_info_vars): 90 | """ 91 | Symbolic log likelihood log p(x) of the distribution 92 | 93 | Args: 94 | x_var (tf.Tensor): variable where to evaluate the log likelihood 95 | dist_info_vars (dict) : dict of distribution parameters as tf.Tensor 96 | 97 | Returns: 98 | (numpy array): log likelihood 99 | """ 100 | means = dist_info_vars["mean"] 101 | log_stds = dist_info_vars["log_std"] 102 | 103 | # assert ranks 104 | tf.assert_rank(x_var, 2), tf.assert_rank(means, 2), tf.assert_rank(log_stds, 2) 105 | 106 | zs = (x_var - means) / tf.exp(log_stds) 107 | return - tf.reduce_sum(log_stds, reduction_indices=-1) - \ 108 | 0.5 * tf.reduce_sum(tf.square(zs), reduction_indices=-1) - \ 109 | 0.5 * self.dim * np.log(2 * np.pi) 110 | 111 | def log_likelihood(self, xs, dist_info): 112 | """ 113 | Compute the log likelihood log p(x) of the distribution 114 | 115 | Args: 116 | x_var (numpy array): variable where to evaluate the log likelihood 117 | dist_info_vars (dict) : dict of distribution parameters as numpy array 118 | 119 | Returns: 120 | (numpy array): log likelihood 121 | """ 122 | means = dist_info["mean"] 123 | log_stds = dist_info["log_std"] 124 | zs = (xs - means) / np.exp(log_stds) 125 | return - np.sum(log_stds, axis=-1) - \ 126 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 127 | 0.5 * self.dim * np.log(2 * np.pi) 128 | 129 | def entropy_sym(self, dist_info_vars): 130 | """ 131 | Symbolic entropy of the distribution 132 | 133 | Args: 134 | dist_info (dict) : dict of distribution parameters as tf.Tensor 135 | 136 | Returns: 137 | (tf.Tensor): entropy 138 | """ 139 | log_stds = dist_info_vars["log_std"] 140 | return tf.reduce_sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), reduction_indices=-1) 141 | 142 | def entropy(self, dist_info): 143 | """ 144 | Compute the entropy of the distribution 145 | 146 | Args: 147 | dist_info (dict) : dict of distribution parameters as numpy array 148 | 149 | Returns: 150 | (numpy array): entropy 151 | """ 152 | log_stds = dist_info["log_std"] 153 | return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) 154 | 155 | def sample(self, dist_info): 156 | """ 157 | Draws a sample from the distribution 158 | 159 | Args: 160 | dist_info (dict) : dict of distribution parameter instantiations as numpy array 161 | 162 | Returns: 163 | (obj): sample drawn from the corresponding instantiation 164 | """ 165 | means = dist_info["mean"] 166 | log_stds = dist_info["log_std"] 167 | rnd = np.random.normal(size=means.shape) 168 | return rnd * np.exp(log_stds) + means 169 | 170 | @property 171 | def dist_info_specs(self): 172 | return [("mean", (self.dim,)), ("log_std", (self.dim,))] 173 | -------------------------------------------------------------------------------- /meta_policy_search/policies/networks/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class MyTestCase(unittest.TestCase): 5 | def test_something(self): 6 | self.assertEqual(True, False) 7 | 8 | 9 | if __name__ == '__main__': 10 | unittest.main() -------------------------------------------------------------------------------- /meta_policy_search/policies/networks/mlp.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from meta_policy_search.utils.utils import 
get_original_tf_name, get_last_scope 3 | 4 | 5 | def create_mlp(name, 6 | output_dim, 7 | hidden_sizes, 8 | hidden_nonlinearity, 9 | output_nonlinearity, 10 | input_dim=None, 11 | input_var=None, 12 | w_init=tf.contrib.layers.xavier_initializer(), 13 | b_init=tf.zeros_initializer(), 14 | reuse=False 15 | ): 16 | """ 17 | Creates a MLP network 18 | Args: 19 | name (str): scope of the neural network 20 | output_dim (int): dimension of the output 21 | hidden_sizes (tuple): tuple with the hidden sizes of the fully connected network 22 | hidden_nonlinearity (tf): non-linearity for the activations in the hidden layers 23 | output_nonlinearity (tf or None): output non-linearity. None results in no non-linearity being applied 24 | input_dim (tuple): dimensions of the input variable e.g. (None, action_dim) 25 | input_var (tf.placeholder or tf.Variable or None): Input of the network as a symbolic variable 26 | w_init (tf.initializer): initializer for the weights 27 | b_init (tf.initializer): initializer for the biases 28 | reuse (bool): reuse or not the network 29 | 30 | Returns: 31 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 32 | output_var (tf.Tensor): Output of the network as a symbolic variable 33 | 34 | """ 35 | 36 | assert input_var is not None or input_dim is not None 37 | 38 | if input_var is None: 39 | input_var = tf.placeholder(dtype=tf.float32, shape=input_dim, name='input') 40 | with tf.variable_scope(name): 41 | x = input_var 42 | 43 | for idx, hidden_size in enumerate(hidden_sizes): 44 | x = tf.layers.dense(x, 45 | hidden_size, 46 | name='hidden_%d' % idx, 47 | activation=hidden_nonlinearity, 48 | kernel_initializer=w_init, 49 | bias_initializer=b_init, 50 | reuse=reuse, 51 | ) 52 | 53 | output_var = tf.layers.dense(x, 54 | output_dim, 55 | name='output', 56 | activation=output_nonlinearity, 57 | kernel_initializer=w_init, 58 | bias_initializer=b_init, 59 | reuse=reuse, 60 | ) 61 | 62 | return input_var, output_var 63 | 64 | 65 | def forward_mlp(output_dim, 66 | hidden_sizes, 67 | hidden_nonlinearity, 68 | output_nonlinearity, 69 | input_var, 70 | mlp_params, 71 | ): 72 | """ 73 | Creates the forward pass of an mlp given the input vars and the mlp params. Assumes that the params are passed in 74 | order i.e. [hidden_0/kernel, hidden_0/bias, hidden_1/kernel, hidden_1/bias, ..., output/kernel, output/bias] 75 | Args: 76 | output_dim (int): dimension of the output 77 | hidden_sizes (tuple): tuple with the hidden sizes of the fully connected network 78 | hidden_nonlinearity (tf): non-linearity for the activations in the hidden layers 79 | output_nonlinearity (tf or None): output non-linearity. None results in no non-linearity being applied 80 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 81 | mlp_params (OrderedDict): OrderedDict of the params of the neural network. 
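        Example (illustrative sketch; ``obs_var`` and ``adapted_params`` are hypothetical names,
        e.g. the adapted parameters produced by a MAML inner step)::

            _, adapted_mean = forward_mlp(
                output_dim=action_dim,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.tanh,
                output_nonlinearity=None,
                input_var=obs_var,
                mlp_params=adapted_params,  # keys ordered: hidden_0/kernel, hidden_0/bias, ..., output/kernel, output/bias
            )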
82 | 83 | Returns: 84 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 85 | output_var (tf.Tensor): Output of the network as a symbolic variable 86 | 87 | """ 88 | x = input_var 89 | idx = 0 90 | bias_added = False 91 | sizes = tuple(hidden_sizes) + (output_dim,) 92 | 93 | if output_nonlinearity is None: 94 | output_nonlinearity = tf.identity 95 | 96 | for name, param in mlp_params.items(): 97 | assert str(idx) in name or (idx == len(hidden_sizes) and "output" in name) 98 | 99 | if "kernel" in name: 100 | assert param.shape == (x.shape[-1], sizes[idx]) 101 | x = tf.matmul(x, param) 102 | elif "bias" in name: 103 | assert param.shape == (sizes[idx],) 104 | x = tf.add(x, param) 105 | bias_added = True 106 | else: 107 | raise NameError 108 | 109 | if bias_added: 110 | if "hidden" in name: 111 | x = hidden_nonlinearity(x) 112 | elif "output" in name: 113 | x = output_nonlinearity(x) 114 | else: 115 | raise NameError 116 | idx += 1 117 | bias_added = False 118 | output_var = x 119 | return input_var, output_var # Todo why return input_var? 120 | 121 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import Sampler 2 | from meta_policy_search.samplers.base import SampleProcessor 3 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 4 | from meta_policy_search.samplers.meta_sampler import MetaSampler 5 | from meta_policy_search.samplers.dice_sample_processor import DiceSampleProcessor 6 | from meta_policy_search.samplers.meta_sample_processor import DiceMetaSampleProcessor 7 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/meta_sample_processor.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import SampleProcessor 2 | from meta_policy_search.samplers.dice_sample_processor import DiceSampleProcessor 3 | from meta_policy_search.utils import utils 4 | import numpy as np 5 | 6 | class MetaSampleProcessor(SampleProcessor): 7 | 8 | def process_samples(self, paths_meta_batch, log=False, log_prefix=''): 9 | """ 10 | Processes sampled paths. 
This involves: 11 | - computing discounted rewards (returns) 12 | - fitting baseline estimator using the path returns and predicting the return baselines 13 | - estimating the advantages using GAE (+ advantage normalization id desired) 14 | - stacking the path data 15 | - logging statistics of the paths 16 | 17 | Args: 18 | paths_meta_batch (dict): A list of dict of lists, size: [meta_batch_size] x (batch_size) x [5] x (max_path_length) 19 | log (boolean): indicates whether to log 20 | log_prefix (str): prefix for the logging keys 21 | 22 | Returns: 23 | (list of dicts) : Processed sample data among the meta-batch; size: [meta_batch_size] x [7] x (batch_size x max_path_length) 24 | """ 25 | assert isinstance(paths_meta_batch, dict), 'paths must be a dict' 26 | assert self.baseline, 'baseline must be specified' 27 | 28 | samples_data_meta_batch = [] 29 | all_paths = [] 30 | 31 | for meta_task, paths in paths_meta_batch.items(): 32 | 33 | # fits baseline, compute advantages and stack path data 34 | samples_data, paths = self._compute_samples_data(paths) 35 | 36 | samples_data_meta_batch.append(samples_data) 37 | all_paths.extend(paths) 38 | 39 | # 7) compute normalized trajectory-batch rewards (for E-MAML) 40 | overall_avg_reward = np.mean(np.concatenate([samples_data['rewards'] for samples_data in samples_data_meta_batch])) 41 | overall_avg_reward_std = np.std(np.concatenate([samples_data['rewards'] for samples_data in samples_data_meta_batch])) 42 | 43 | for samples_data in samples_data_meta_batch: 44 | samples_data['adj_avg_rewards'] = (samples_data['rewards'] - overall_avg_reward) / (overall_avg_reward_std + 1e-8) 45 | 46 | # 8) log statistics if desired 47 | self._log_path_stats(all_paths, log=log, log_prefix=log_prefix) 48 | 49 | return samples_data_meta_batch 50 | 51 | class DiceMetaSampleProcessor(DiceSampleProcessor): 52 | process_samples = MetaSampleProcessor.process_samples -------------------------------------------------------------------------------- /meta_policy_search/samplers/meta_sampler.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import Sampler 2 | from meta_policy_search.samplers.vectorized_env_executor import MetaParallelEnvExecutor, MetaIterativeEnvExecutor 3 | from meta_policy_search.utils import utils, logger 4 | from collections import OrderedDict 5 | 6 | from pyprind import ProgBar 7 | import numpy as np 8 | import time 9 | import itertools 10 | 11 | 12 | class MetaSampler(Sampler): 13 | """ 14 | Sampler for Meta-RL 15 | 16 | Args: 17 | env (meta_policy_search.envs.base.MetaEnv) : environment object 18 | policy (meta_policy_search.policies.base.Policy) : policy object 19 | batch_size (int) : number of trajectories per task 20 | meta_batch_size (int) : number of meta tasks 21 | max_path_length (int) : max number of steps per trajectory 22 | envs_per_task (int) : number of envs to run vectorized for each task (influences the memory usage) 23 | """ 24 | 25 | def __init__( 26 | self, 27 | env, 28 | policy, 29 | rollouts_per_meta_task, 30 | meta_batch_size, 31 | max_path_length, 32 | envs_per_task=None, 33 | parallel=False 34 | ): 35 | super(MetaSampler, self).__init__(env, policy, rollouts_per_meta_task, max_path_length) 36 | assert hasattr(env, 'set_task') 37 | 38 | self.envs_per_task = rollouts_per_meta_task if envs_per_task is None else envs_per_task 39 | self.meta_batch_size = meta_batch_size 40 | self.total_samples = meta_batch_size * rollouts_per_meta_task * max_path_length 41 | 
self.parallel = parallel 42 | self.total_timesteps_sampled = 0 43 | 44 | # setup vectorized environment 45 | 46 | if self.parallel: 47 | self.vec_env = MetaParallelEnvExecutor(env, self.meta_batch_size, self.envs_per_task, self.max_path_length) 48 | else: 49 | self.vec_env = MetaIterativeEnvExecutor(env, self.meta_batch_size, self.envs_per_task, self.max_path_length) 50 | 51 | def update_tasks(self): 52 | """ 53 | Samples a new goal for each meta task 54 | """ 55 | tasks = self.env.sample_tasks(self.meta_batch_size) 56 | assert len(tasks) == self.meta_batch_size 57 | self.vec_env.set_tasks(tasks) 58 | 59 | def obtain_samples(self, log=False, log_prefix=''): 60 | """ 61 | Collect batch_size trajectories from each task 62 | 63 | Args: 64 | log (boolean): whether to log sampling times 65 | log_prefix (str) : prefix for logger 66 | 67 | Returns: 68 | (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length) 69 | """ 70 | 71 | # initial setup / preparation 72 | paths = OrderedDict() 73 | for i in range(self.meta_batch_size): 74 | paths[i] = [] 75 | 76 | n_samples = 0 77 | running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)] 78 | 79 | pbar = ProgBar(self.total_samples) 80 | policy_time, env_time = 0, 0 81 | 82 | policy = self.policy 83 | 84 | # initial reset of envs 85 | obses = self.vec_env.reset() 86 | 87 | while n_samples < self.total_samples: 88 | 89 | # execute policy 90 | t = time.time() 91 | obs_per_task = np.split(np.asarray(obses), self.meta_batch_size) 92 | actions, agent_infos = policy.get_actions(obs_per_task) 93 | policy_time += time.time() - t 94 | 95 | # step environments 96 | t = time.time() 97 | actions = np.concatenate(actions) # stack meta batch 98 | next_obses, rewards, dones, env_infos = self.vec_env.step(actions) 99 | env_time += time.time() - t 100 | 101 | # stack agent_infos and if no infos were provided (--> None) create empty dicts 102 | agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos) 103 | 104 | new_samples = 0 105 | for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, 106 | rewards, env_infos, agent_infos, 107 | dones): 108 | # append new samples to running paths 109 | running_paths[idx]["observations"].append(observation) 110 | running_paths[idx]["actions"].append(action) 111 | running_paths[idx]["rewards"].append(reward) 112 | running_paths[idx]["env_infos"].append(env_info) 113 | running_paths[idx]["agent_infos"].append(agent_info) 114 | 115 | # if running path is done, add it to paths and empty the running path 116 | if done: 117 | paths[idx // self.envs_per_task].append(dict( 118 | observations=np.asarray(running_paths[idx]["observations"]), 119 | actions=np.asarray(running_paths[idx]["actions"]), 120 | rewards=np.asarray(running_paths[idx]["rewards"]), 121 | env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), 122 | agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), 123 | )) 124 | new_samples += len(running_paths[idx]["rewards"]) 125 | running_paths[idx] = _get_empty_running_paths_dict() 126 | 127 | pbar.update(new_samples) 128 | n_samples += new_samples 129 | obses = next_obses 130 | pbar.stop() 131 | 132 | self.total_timesteps_sampled += self.total_samples 133 | if log: 134 | logger.logkv(log_prefix + "PolicyExecTime", policy_time) 135 | logger.logkv(log_prefix + "EnvExecTime", env_time) 136 | 137 | return paths 138 | 139 | def _handle_info_dicts(self, agent_infos, 
env_infos): 140 | if not env_infos: 141 | env_infos = [dict() for _ in range(self.vec_env.num_envs)] 142 | if not agent_infos: 143 | agent_infos = [dict() for _ in range(self.vec_env.num_envs)] 144 | else: 145 | assert len(agent_infos) == self.meta_batch_size 146 | assert len(agent_infos[0]) == self.envs_per_task 147 | agent_infos = sum(agent_infos, []) # stack agent_infos 148 | 149 | assert len(agent_infos) == self.meta_batch_size * self.envs_per_task == len(env_infos) 150 | return agent_infos, env_infos 151 | 152 | 153 | def _get_empty_running_paths_dict(): 154 | return dict(observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[]) 155 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | 5 | def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', ignore_done=False): 6 | observations = [] 7 | actions = [] 8 | rewards = [] 9 | agent_infos = [] 10 | env_infos = [] 11 | images = [] 12 | 13 | ''' get wrapped env ''' 14 | wrapped_env = env 15 | while hasattr(wrapped_env, '_wrapped_env'): 16 | wrapped_env = wrapped_env._wrapped_env 17 | 18 | frame_skip = wrapped_env.frame_skip if hasattr(wrapped_env, 'frame_skip') else 1 19 | assert hasattr(wrapped_env, 'dt'), 'environment must have dt attribute that specifies the timestep' 20 | timestep = wrapped_env.dt 21 | 22 | o = env.reset() 23 | agent.reset() 24 | path_length = 0 25 | if animated: 26 | env.render() 27 | 28 | while path_length < max_path_length: 29 | a, agent_info = agent.get_action([o]) 30 | next_o, r, d, env_info = env.step(a) 31 | observations.append(env.observation_space.flatten(o)) 32 | rewards.append(r) 33 | actions.append(env.action_space.flatten(a)) 34 | agent_infos.append(agent_info) 35 | env_infos.append(env_info) 36 | path_length += 1 37 | if d and not ignore_done: # and not animated: 38 | break 39 | o = next_o 40 | if animated: 41 | env.render() 42 | time.sleep(timestep*frame_skip / speedup) 43 | if save_video: 44 | from PIL import Image 45 | image = env.wrapped_env.wrapped_env.get_viewer().get_image() 46 | pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0]) 47 | images.append(np.flipud(np.array(pil_image))) 48 | 49 | if animated: 50 | if save_video: 51 | import moviepy.editor as mpy 52 | fps = int(speedup/timestep * frame_skip) 53 | clip = mpy.ImageSequenceClip(images, fps=fps) 54 | if video_filename[-3:] == 'gif': 55 | clip.write_gif(video_filename, fps=fps) 56 | else: 57 | clip.write_videofile(video_filename, fps=fps) 58 | #return 59 | 60 | return dict( 61 | observations=observations, 62 | actons=actions, 63 | rewards=rewards, 64 | agent_infos=agent_infos, 65 | env_infos=env_infos 66 | ) -------------------------------------------------------------------------------- /meta_policy_search/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.utils.serializable import Serializable 2 | from meta_policy_search.utils.utils import * -------------------------------------------------------------------------------- /meta_policy_search/utils/serializable.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2016 rllab contributors 3 | https://github.com/rll/rllab 4 | """ 5 | 6 | import inspect 7 | import sys 8 | 9 | 10 | 
class Serializable(object): 11 | 12 | def __init__(self, *args, **kwargs): 13 | self.__args = args 14 | self.__kwargs = kwargs 15 | 16 | def quick_init(self, locals_): 17 | try: 18 | if object.__getattribute__(self, "_serializable_initialized"): 19 | return 20 | except AttributeError: 21 | pass 22 | if sys.version_info >= (3, 0): 23 | spec = inspect.getfullargspec(self.__init__) 24 | # Exclude the first "self" parameter 25 | if spec.varkw: 26 | kwargs = locals_[spec.varkw] 27 | else: 28 | kwargs = dict() 29 | else: 30 | spec = inspect.getargspec(self.__init__) 31 | if spec.keywords: 32 | kwargs = locals_[spec.keywords] 33 | else: 34 | kwargs = dict() 35 | if spec.varargs: 36 | varargs = locals_[spec.varargs] 37 | else: 38 | varargs = tuple() 39 | in_order_args = [locals_[arg] for arg in spec.args][1:] 40 | self.__args = tuple(in_order_args) + varargs 41 | self.__kwargs = kwargs 42 | setattr(self, "_serializable_initialized", True) 43 | 44 | def __getstate__(self): 45 | return {"__args": self.__args, "__kwargs": self.__kwargs} 46 | 47 | def __setstate__(self, d): 48 | out = type(self)(*d["__args"], **d["__kwargs"]) 49 | self.__dict__.update(out.__dict__) 50 | 51 | @classmethod 52 | def clone(cls, obj, **kwargs): 53 | assert isinstance(obj, Serializable) 54 | d = obj.__getstate__() 55 | 56 | # Split the entries in kwargs between positional and keyword arguments 57 | # and update d['__args'] and d['__kwargs'], respectively. 58 | if sys.version_info >= (3, 0): 59 | spec = inspect.getfullargspec(obj.__init__) 60 | else: 61 | spec = inspect.getargspec(obj.__init__) 62 | in_order_args = spec.args[1:] 63 | 64 | d["__args"] = list(d["__args"]) 65 | for kw, val in kwargs.items(): 66 | if kw in in_order_args: 67 | d["__args"][in_order_args.index(kw)] = val 68 | else: 69 | d["__kwargs"][kw] = val 70 | 71 | out = type(obj).__new__(type(obj)) 72 | out.__setstate__(d) 73 | return out 74 | -------------------------------------------------------------------------------- /meta_policy_search/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.signal 4 | import json 5 | 6 | def get_original_tf_name(name): 7 | """ 8 | Args: 9 | name (str): full name of the tf variable with all the scopes 10 | 11 | Returns: 12 | (str): name given to the variable when creating it (i.e. 
name of the variable w/o the scope and the colons) 13 | """ 14 | return name.split("/")[-1].split(":")[0] 15 | 16 | 17 | def remove_scope_from_name(name, scope): 18 | """ 19 | Args: 20 | name (str): full name of the tf variable with all the scopes 21 | 22 | Returns: 23 | (str): full name of the variable with the scope removed 24 | """ 25 | result = name.split(scope)[1] 26 | result = result[1:] if result[0] == '/' else result 27 | return result.split(":")[0] 28 | 29 | def remove_first_scope_from_name(name): 30 | return name.replace(name + '/', "").split(":")[0] 31 | 32 | def get_last_scope(name): 33 | """ 34 | Args: 35 | name (str): full name of the tf variable with all the scopes 36 | 37 | Returns: 38 | (str): name of the last scope 39 | """ 40 | return name.split("/")[-2] 41 | 42 | 43 | def extract(x, *keys): 44 | """ 45 | Args: 46 | x (dict or list): dict or list of dicts 47 | 48 | Returns: 49 | (tuple): tuple with the elements of the dict or the dicts of the list 50 | """ 51 | if isinstance(x, dict): 52 | return tuple(x[k] for k in keys) 53 | elif isinstance(x, list): 54 | return tuple([xi[k] for xi in x] for k in keys) 55 | else: 56 | raise NotImplementedError 57 | 58 | 59 | def normalize_advantages(advantages): 60 | """ 61 | Args: 62 | advantages (np.ndarray): np array with the advantages 63 | 64 | Returns: 65 | (np.ndarray): np array with the advantages normalized 66 | """ 67 | return (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 68 | 69 | 70 | def shift_advantages_to_positive(advantages): 71 | return (advantages - np.min(advantages)) + 1e-8 72 | 73 | 74 | def discount_cumsum(x, discount): 75 | """ 76 | See https://docs.scipy.org/doc/scipy/reference/tutorial/signal.html#difference-equation-filtering 77 | 78 | Returns: 79 | (float) : y[t] - discount*y[t+1] = x[t] or rev(y)[t] - discount*rev(y)[t-1] = rev(x)[t] 80 | """ 81 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 82 | 83 | 84 | def explained_variance_1d(ypred, y): 85 | """ 86 | Args: 87 | ypred (np.ndarray): predicted values of the variable of interest 88 | y (np.ndarray): real values of the variable 89 | 90 | Returns: 91 | (float): variance explained by your estimator 92 | 93 | """ 94 | assert y.ndim == 1 and ypred.ndim == 1 95 | vary = np.var(y) 96 | if np.isclose(vary, 0): 97 | if np.var(ypred) > 0: 98 | return 0 99 | else: 100 | return 1 101 | return 1 - np.var(y - ypred) / (vary + 1e-8) 102 | 103 | 104 | def concat_tensor_dict_list(tensor_dict_list): 105 | """ 106 | Args: 107 | tensor_dict_list (list) : list of dicts of lists of tensors 108 | 109 | Returns: 110 | (dict) : dict of lists of tensors 111 | """ 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = np.concatenate([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def stack_tensor_dict_list(tensor_dict_list): 125 | """ 126 | Args: 127 | tensor_dict_list (list) : list of dicts of tensors 128 | 129 | Returns: 130 | (dict) : dict of lists of tensors 131 | """ 132 | keys = list(tensor_dict_list[0].keys()) 133 | ret = dict() 134 | for k in keys: 135 | example = tensor_dict_list[0][k] 136 | if isinstance(example, dict): 137 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 138 | else: 139 | v = np.asarray([x[k] for x in tensor_dict_list]) 140 | ret[k] = v 141 | return ret 142 | 
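# Illustrative worked examples for the helpers above (not part of the original module;
# the values are made up):
#
#   discount_cumsum([1., 1., 1.], 0.9)                    -> [2.71, 1.9, 1.]   since y[t] = x[t] + 0.9 * y[t+1]
#   normalize_advantages(np.array([1., 2., 3.]))          -> zero-mean, unit-std version of the input
#   stack_tensor_dict_list([{'a': 1}, {'a': 2}])          -> {'a': array([1, 2])}
#   concat_tensor_dict_list([{'a': [1]}, {'a': [2, 3]}])  -> {'a': array([1, 2, 3])}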
143 | 144 | def create_feed_dict(placeholder_dict, value_dict): 145 | """ 146 | matches the placeholders with their values given a placeholder and value_dict. 147 | The keys in both dicts must match 148 | 149 | Args: 150 | placeholder_dict (dict): dict of placeholders 151 | value_dict (dict): dict of values to be fed to the placeholders 152 | 153 | Returns: feed dict 154 | 155 | """ 156 | assert set(placeholder_dict.keys()) <= set(value_dict.keys()), \ 157 | "value dict must provide the necessary data to serve all placeholders in placeholder_dict" 158 | # match the placeholders with their values 159 | return dict([(placeholder_dict[key], value_dict[key]) for key in placeholder_dict.keys()]) 160 | 161 | def set_seed(seed): 162 | """ 163 | Set the random seed for all random number generators 164 | 165 | Args: 166 | seed (int) : seed to use 167 | 168 | Returns: 169 | None 170 | """ 171 | import random 172 | import tensorflow as tf 173 | seed %= 4294967294 174 | random.seed(seed) 175 | np.random.seed(seed) 176 | tf.set_random_seed(seed) 177 | print('using seed %s' % (str(seed))) 178 | 179 | class ClassEncoder(json.JSONEncoder): 180 | def default(self, o): 181 | if isinstance(o, type): 182 | return {'$class': o.__module__ + "." + o.__name__} 183 | if callable(o): 184 | return {'function': o.__name__} 185 | return json.JSONEncoder.default(self, o) 186 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | joblib 3 | gym==0.10.5 4 | scipy 5 | PyPrind 6 | Pillow 7 | moviepy 8 | mpi4py 9 | click 10 | tensorflow>=1.4.0 11 | cloudpickle 12 | matplotlib 13 | git+https://github.com/dennisl88/rand_param_envs.git 14 | git+https://github.com/dennisl88/multiworld.git@russell 15 | -------------------------------------------------------------------------------- /run_scripts/e-maml_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import os 14 | import json 15 | import argparse 16 | import time 17 | 18 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 19 | 20 | def main(config): 21 | set_seed(config['seed']) 22 | 23 | 24 | baseline = globals()[config['baseline']]() #instantiate baseline 25 | 26 | env = globals()[config['env']]() # instantiate env 27 | env = normalize(env) # apply normalize wrapper to env 28 | 29 | policy = MetaGaussianMLPPolicy( 30 | name="meta-policy", 31 | obs_dim=np.prod(env.observation_space.shape), 32 | action_dim=np.prod(env.action_space.shape), 33 | meta_batch_size=config['meta_batch_size'], 34 | hidden_sizes=config['hidden_sizes'], 35 | ) 36 | 37 | sampler = MetaSampler( 38 | env=env, 39 | 
policy=policy, 40 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 41 | meta_batch_size=config['meta_batch_size'], 42 | max_path_length=config['max_path_length'], 43 | parallel=config['parallel'], 44 | ) 45 | 46 | sample_processor = MetaSampleProcessor( 47 | baseline=baseline, 48 | discount=config['discount'], 49 | gae_lambda=config['gae_lambda'], 50 | normalize_adv=config['normalize_adv'], 51 | ) 52 | 53 | algo = TRPOMAML( 54 | policy=policy, 55 | step_size=config['step_size'], 56 | inner_type=config['inner_type'], 57 | inner_lr=config['inner_lr'], 58 | meta_batch_size=config['meta_batch_size'], 59 | num_inner_grad_steps=config['num_inner_grad_steps'], 60 | exploration=True, 61 | ) 62 | 63 | trainer = Trainer( 64 | algo=algo, 65 | policy=policy, 66 | env=env, 67 | sampler=sampler, 68 | sample_processor=sample_processor, 69 | n_itr=config['n_itr'], 70 | num_inner_grad_steps=config['num_inner_grad_steps'], 71 | ) 72 | 73 | trainer.train() 74 | 75 | if __name__=="__main__": 76 | idx = int(time.time()) 77 | 78 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 79 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 80 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 81 | 82 | args = parser.parse_args() 83 | 84 | 85 | if args.config_file: # load configuration from json file 86 | with open(args.config_file, 'r') as f: 87 | config = json.load(f) 88 | 89 | else: # use default config 90 | 91 | config = { 92 | 'seed': 1, 93 | 94 | 'baseline': 'LinearFeatureBaseline', 95 | 96 | 'env': 'HalfCheetahRandDirecEnv', 97 | 98 | # sampler config 99 | 'rollouts_per_meta_task': 20, 100 | 'max_path_length': 100, 101 | 'parallel': True, 102 | 103 | # sample processor config 104 | 'discount': 0.99, 105 | 'gae_lambda': 1, 106 | 'normalize_adv': True, 107 | 108 | # policy config 109 | 'hidden_sizes': (64, 64), 110 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 111 | 112 | # E-MAML config 113 | 'inner_lr': 0.1, # adaptation step size 114 | 'learning_rate': 1e-3, # meta-policy gradient step size 115 | 'step_size': 0.01, # size of the TRPO trust-region 116 | 'n_itr': 1001, # number of overall training iterations 117 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 118 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 119 | 'inner_type' : 'log_likelihood', # type of inner loss function used 120 | 121 | } 122 | 123 | # configure logger 124 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 125 | snapshot_mode='last_gap') 126 | 127 | # dump run configuration before starting training 128 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 129 | 130 | # start the actual algorithm 131 | main(config) -------------------------------------------------------------------------------- /run_scripts/maml_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler 
import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import os 14 | import json 15 | import argparse 16 | import time 17 | 18 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 19 | 20 | def main(config): 21 | set_seed(config['seed']) 22 | 23 | 24 | baseline = globals()[config['baseline']]() #instantiate baseline 25 | 26 | env = globals()[config['env']]() # instantiate env 27 | env = normalize(env) # apply normalize wrapper to env 28 | 29 | policy = MetaGaussianMLPPolicy( 30 | name="meta-policy", 31 | obs_dim=np.prod(env.observation_space.shape), 32 | action_dim=np.prod(env.action_space.shape), 33 | meta_batch_size=config['meta_batch_size'], 34 | hidden_sizes=config['hidden_sizes'], 35 | ) 36 | 37 | sampler = MetaSampler( 38 | env=env, 39 | policy=policy, 40 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 41 | meta_batch_size=config['meta_batch_size'], 42 | max_path_length=config['max_path_length'], 43 | parallel=config['parallel'], 44 | ) 45 | 46 | sample_processor = MetaSampleProcessor( 47 | baseline=baseline, 48 | discount=config['discount'], 49 | gae_lambda=config['gae_lambda'], 50 | normalize_adv=config['normalize_adv'], 51 | ) 52 | 53 | algo = TRPOMAML( 54 | policy=policy, 55 | step_size=config['step_size'], 56 | inner_type=config['inner_type'], 57 | inner_lr=config['inner_lr'], 58 | meta_batch_size=config['meta_batch_size'], 59 | num_inner_grad_steps=config['num_inner_grad_steps'], 60 | exploration=False, 61 | ) 62 | 63 | trainer = Trainer( 64 | algo=algo, 65 | policy=policy, 66 | env=env, 67 | sampler=sampler, 68 | sample_processor=sample_processor, 69 | n_itr=config['n_itr'], 70 | num_inner_grad_steps=config['num_inner_grad_steps'], 71 | ) 72 | 73 | trainer.train() 74 | 75 | if __name__=="__main__": 76 | idx = int(time.time()) 77 | 78 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 79 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 80 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 81 | 82 | args = parser.parse_args() 83 | 84 | 85 | if args.config_file: # load configuration from json file 86 | with open(args.config_file, 'r') as f: 87 | config = json.load(f) 88 | 89 | else: # use default config 90 | 91 | config = { 92 | 'seed': 1, 93 | 94 | 'baseline': 'LinearFeatureBaseline', 95 | 96 | 'env': 'HalfCheetahRandDirecEnv', 97 | 98 | # sampler config 99 | 'rollouts_per_meta_task': 20, 100 | 'max_path_length': 100, 101 | 'parallel': True, 102 | 103 | # sample processor config 104 | 'discount': 0.99, 105 | 'gae_lambda': 1, 106 | 'normalize_adv': True, 107 | 108 | # policy config 109 | 'hidden_sizes': (64, 64), 110 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 111 | 112 | # E-MAML config 113 | 'inner_lr': 0.1, # adaptation step size 114 | 'learning_rate': 1e-3, # meta-policy gradient step size 115 | 'step_size': 0.01, # size of the TRPO trust-region 116 | 'n_itr': 1001, # number of overall training iterations 117 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 118 | 'num_inner_grad_steps': 1, # 
number of inner / adaptation gradient steps 119 | 'inner_type' : 'log_likelihood', # type of inner loss function used 120 | 121 | } 122 | 123 | # configure logger 124 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 125 | snapshot_mode='last_gap') 126 | 127 | # dump run configuration before starting training 128 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 129 | 130 | # start the actual algorithm 131 | main(config) -------------------------------------------------------------------------------- /run_scripts/pro-mp_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.pro_mp import ProMP 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | import os 15 | import json 16 | import argparse 17 | import time 18 | 19 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 20 | 21 | def main(config): 22 | set_seed(config['seed']) 23 | 24 | 25 | baseline = globals()[config['baseline']]() #instantiate baseline 26 | 27 | env = globals()[config['env']]() # instantiate env 28 | env = normalize(env) # apply normalize wrapper to env 29 | 30 | policy = MetaGaussianMLPPolicy( 31 | name="meta-policy", 32 | obs_dim=np.prod(env.observation_space.shape), 33 | action_dim=np.prod(env.action_space.shape), 34 | meta_batch_size=config['meta_batch_size'], 35 | hidden_sizes=config['hidden_sizes'], 36 | ) 37 | 38 | sampler = MetaSampler( 39 | env=env, 40 | policy=policy, 41 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 42 | meta_batch_size=config['meta_batch_size'], 43 | max_path_length=config['max_path_length'], 44 | parallel=config['parallel'], 45 | ) 46 | 47 | sample_processor = MetaSampleProcessor( 48 | baseline=baseline, 49 | discount=config['discount'], 50 | gae_lambda=config['gae_lambda'], 51 | normalize_adv=config['normalize_adv'], 52 | ) 53 | 54 | algo = ProMP( 55 | policy=policy, 56 | inner_lr=config['inner_lr'], 57 | meta_batch_size=config['meta_batch_size'], 58 | num_inner_grad_steps=config['num_inner_grad_steps'], 59 | learning_rate=config['learning_rate'], 60 | num_ppo_steps=config['num_promp_steps'], 61 | clip_eps=config['clip_eps'], 62 | target_inner_step=config['target_inner_step'], 63 | init_inner_kl_penalty=config['init_inner_kl_penalty'], 64 | adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'], 65 | ) 66 | 67 | trainer = Trainer( 68 | algo=algo, 69 | policy=policy, 70 | env=env, 71 | sampler=sampler, 72 | sample_processor=sample_processor, 73 | n_itr=config['n_itr'], 74 | num_inner_grad_steps=config['num_inner_grad_steps'], 75 | ) 76 | 77 | trainer.train() 78 | 79 | if __name__=="__main__": 80 | idx = int(time.time()) 81 | 82 | parser = argparse.ArgumentParser(description='ProMP: Proximal 
Meta-Policy Search') 83 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 84 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 85 | 86 | args = parser.parse_args() 87 | 88 | 89 | if args.config_file: # load configuration from json file 90 | with open(args.config_file, 'r') as f: 91 | config = json.load(f) 92 | 93 | else: # use default config 94 | 95 | config = { 96 | 'seed': 1, 97 | 98 | 'baseline': 'LinearFeatureBaseline', 99 | 100 | 'env': 'HalfCheetahRandDirecEnv', 101 | 102 | # sampler config 103 | 'rollouts_per_meta_task': 20, 104 | 'max_path_length': 100, 105 | 'parallel': True, 106 | 107 | # sample processor config 108 | 'discount': 0.99, 109 | 'gae_lambda': 1, 110 | 'normalize_adv': True, 111 | 112 | # policy config 113 | 'hidden_sizes': (64, 64), 114 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 115 | 116 | # ProMP config 117 | 'inner_lr': 0.1, # adaptation step size 118 | 'learning_rate': 1e-3, # meta-policy gradient step size 119 | 'num_promp_steps': 5, # number of ProMp steps without re-sampling 120 | 'clip_eps': 0.3, # clipping range 121 | 'target_inner_step': 0.01, 122 | 'init_inner_kl_penalty': 5e-4, 123 | 'adaptive_inner_kl_penalty': False, # whether to use an adaptive or fixed KL-penalty coefficient 124 | 'n_itr': 1001, # number of overall training iterations 125 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 126 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 127 | 128 | } 129 | 130 | # configure logger 131 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 132 | snapshot_mode='last_gap') 133 | 134 | # dump run configuration before starting training 135 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 136 | 137 | # start the actual algorithm 138 | main(config) -------------------------------------------------------------------------------- /run_scripts/pro-mp_run_point_mass.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.point_envs.point_env_2d_corner import MetaPointEnvCorner 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.pro_mp import ProMP 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | import os 15 | import json 16 | import argparse 17 | import time 18 | 19 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 20 | 21 | def main(config): 22 | set_seed(config['seed']) 23 | 24 | 25 | baseline = globals()[config['baseline']]() #instantiate baseline 26 | 27 | env = globals()[config['env']]() # instantiate env 28 | env = normalize(env) # apply normalize wrapper to env 29 | 30 | policy = MetaGaussianMLPPolicy( 31 | name="meta-policy", 32 | obs_dim=np.prod(env.observation_space.shape), 33 | action_dim=np.prod(env.action_space.shape), 34 | 
meta_batch_size=config['meta_batch_size'], 35 | hidden_sizes=config['hidden_sizes'], 36 | ) 37 | 38 | sampler = MetaSampler( 39 | env=env, 40 | policy=policy, 41 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 42 | meta_batch_size=config['meta_batch_size'], 43 | max_path_length=config['max_path_length'], 44 | parallel=config['parallel'], 45 | ) 46 | 47 | sample_processor = MetaSampleProcessor( 48 | baseline=baseline, 49 | discount=config['discount'], 50 | gae_lambda=config['gae_lambda'], 51 | normalize_adv=config['normalize_adv'], 52 | ) 53 | 54 | algo = ProMP( 55 | policy=policy, 56 | inner_lr=config['inner_lr'], 57 | meta_batch_size=config['meta_batch_size'], 58 | num_inner_grad_steps=config['num_inner_grad_steps'], 59 | learning_rate=config['learning_rate'], 60 | num_ppo_steps=config['num_promp_steps'], 61 | clip_eps=config['clip_eps'], 62 | target_inner_step=config['target_inner_step'], 63 | init_inner_kl_penalty=config['init_inner_kl_penalty'], 64 | adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'], 65 | ) 66 | 67 | trainer = Trainer( 68 | algo=algo, 69 | policy=policy, 70 | env=env, 71 | sampler=sampler, 72 | sample_processor=sample_processor, 73 | n_itr=config['n_itr'], 74 | num_inner_grad_steps=config['num_inner_grad_steps'], 75 | ) 76 | 77 | trainer.train() 78 | 79 | if __name__=="__main__": 80 | idx = int(time.time()) 81 | 82 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 83 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 84 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 85 | 86 | args = parser.parse_args() 87 | 88 | 89 | if args.config_file: # load configuration from json file 90 | with open(args.config_file, 'r') as f: 91 | config = json.load(f) 92 | 93 | else: # use default config 94 | 95 | config = { 96 | 'seed': 1, 97 | 98 | 'baseline': 'LinearFeatureBaseline', 99 | 100 | 'env': 'MetaPointEnvCorner', 101 | 102 | # sampler config 103 | 'rollouts_per_meta_task': 20, 104 | 'max_path_length': 100, 105 | 'parallel': True, 106 | 107 | # sample processor config 108 | 'discount': 0.99, 109 | 'gae_lambda': 1, 110 | 'normalize_adv': True, 111 | 112 | # policy config 113 | 'hidden_sizes': (64, 64), 114 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 115 | 116 | # ProMP config 117 | 'inner_lr': 0.1, # adaptation step size 118 | 'learning_rate': 1e-3, # meta-policy gradient step size 119 | 'num_promp_steps': 5, # number of ProMp steps without re-sampling 120 | 'clip_eps': 0.3, # clipping range 121 | 'target_inner_step': 0.01, 122 | 'init_inner_kl_penalty': 5e-4, 123 | 'adaptive_inner_kl_penalty': False, # whether to use an adaptive or fixed KL-penalty coefficient 124 | 'n_itr': 1001, # number of overall training iterations 125 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 126 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 127 | 128 | } 129 | 130 | # configure logger 131 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 132 | snapshot_mode='last_gap') 133 | 134 | # dump run configuration before starting training 135 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 136 | 137 | # start the actual algorithm 138 | main(config) -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name="meta_policy_search", 4 | version='0.1', 5 | description='Framework that provides multiple gradient-based Meta-RL algorithms', 6 | url='https://github.com/jonasrothfuss/maml-zoo', 7 | author='Dennis Lee, Ignasi Clavera, Jonas Rothfuss', 8 | author_email='jonas.rothfuss@berkeley.edu', 9 | license='MIT', 10 | packages=['meta_policy_search'], 11 | test_suite='nose.collector', 12 | tests_require=['nose'], 13 | install_requires=[ 14 | 'joblib==0.12.2', 15 | 'PyPrind', 16 | 'numpy', 17 | 'scipy', 18 | 'gym==0.10.5', 19 | 'python_dateutil', 20 | 'tensorflow' 21 | ], 22 | zip_safe=False) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_baselines.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pickle 4 | from meta_policy_search.utils import utils 5 | from meta_policy_search.policies.base import Policy 6 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline, LinearTimeBaseline 7 | from meta_policy_search.samplers.meta_sampler import MetaSampler 8 | from gym import Env 9 | 10 | 11 | class RandomEnv(Env): 12 | def __init__(self): 13 | self.state = np.zeros(1) 14 | self.goal = 0 15 | 16 | def sample_tasks(self, n_tasks): 17 | """ 18 | Args: 19 | n_tasks (int) : number of different meta-tasks needed 20 | Returns: 21 | tasks (list) : an (n_tasks) length list of reset args 22 | """ 23 | return np.random.choice(100, n_tasks, replace=False) # Ensure every env has a different goal 24 | 25 | def set_task(self, task): 26 | """ 27 | Args: 28 | task: task of the meta-learning environment 29 | """ 30 | self.goal = task 31 | 32 | def get_task(self): 33 | """ 34 | Returns: 35 | task: task of the meta-learning environment 36 | """ 37 | return self.goal 38 | 39 | def step(self, action): 40 | self.state += (self.goal - action) * np.random.random() 41 | return self.state * 100 + self.goal, (self.goal - action)[0], 0, {} 42 | 43 | def reset(self): 44 | self.state = np.zeros(1) 45 | return self.state 46 | 47 | def env_spec(self): 48 | return None 49 | 50 | 51 | class RandomPolicy(Policy): 52 | def get_actions(self, observations): 53 | return [[np.random.random() + obs / 100 for obs in task] for task in observations], None 54 | 55 | 56 | class TestLinearFeatureBaseline(unittest.TestCase): 57 | def setUp(self): 58 | self.random_env = RandomEnv() 59 | self.random_policy = RandomPolicy(1, 1) 60 | self.meta_batch_size = 2 61 | self.batch_size = 10 62 | self.path_length = 100 63 | self.linear = LinearFeatureBaseline() 64 | self.sampler = MetaSampler(self.random_env, self.random_policy, self.batch_size, 65 | self.meta_batch_size, self.path_length, parallel=True) 66 | 67 | def testFit(self): 68 | paths = self.sampler.obtain_samples() 69 | for task in paths.values(): 70 | unfit_error = 0 71 | for path in task: 72 | path["returns"] = utils.discount_cumsum(path["rewards"], 0.99) 73 | unfit_pred = self.linear.predict(path) 74 | unfit_error += sum([np.square(pred - actual) for pred, actual in zip(unfit_pred, path['returns'])]) 75 | self.linear.fit(task) 
76 | fit_error = 0 77 | for path in task: 78 | fit_pred = self.linear.predict(path) 79 | fit_error += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 80 | self.assertTrue(fit_error < unfit_error) 81 | 82 | def testSerialize(self): 83 | paths = self.sampler.obtain_samples() 84 | for task in paths.values(): 85 | for path in task: 86 | path["returns"] = utils.discount_cumsum(path["rewards"], 0.99) 87 | self.linear.fit(task) 88 | fit_error_pre = 0 89 | for path in task: 90 | fit_pred = self.linear.predict(path) 91 | fit_error_pre += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 92 | pkl = pickle.dumps(self.linear) 93 | self.linear = pickle.loads(pkl) 94 | fit_error_post = 0 95 | for path in task: 96 | fit_pred = self.linear.predict(path) 97 | fit_error_post += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 98 | self.assertEqual(fit_error_pre, fit_error_post) 99 | 100 | 101 | class TestLinearTimeBaseline(unittest.TestCase): 102 | def setUp(self): 103 | self.random_env = RandomEnv() 104 | self.random_policy = RandomPolicy(1, 1) 105 | self.meta_batch_size = 2 106 | self.batch_size = 10 107 | self.path_length = 100 108 | self.linear = LinearTimeBaseline() 109 | self.sampler = MetaSampler(self.random_env, self.random_policy, self.batch_size, 110 | self.meta_batch_size, self.path_length, parallel=True) 111 | 112 | def testFit(self): 113 | base_path = np.arange(-4.0, 22.0, step=.6) 114 | task1 = [{'discounted_rewards': base_path + np.random.normal(scale=2, size=base_path.shape), 115 | 'observations': base_path} for i in range(10)] 116 | task2 = [{'discounted_rewards': base_path**3 + np.random.normal(scale=2, size=base_path.shape), 117 | 'observations': base_path} for i in range(10)] 118 | 119 | 120 | for task in [task1, task2]: 121 | unfit_error = np.sum([np.sum(path['discounted_rewards']**2) for path in task]) 122 | print('unfit_error', unfit_error) 123 | self.linear.fit(task, target_key='discounted_rewards') 124 | fit_error = 0 125 | for path in task: 126 | fit_pred = self.linear.predict(path) 127 | fit_error += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 128 | print('fit_error', fit_error) 129 | self.assertTrue(2*fit_error < unfit_error) 130 | 131 | def testSerialize(self): 132 | base_path = np.arange(-4.0, 22.0, step=.6) 133 | task1 = [{'discounted_rewards': base_path + np.random.normal(scale=2, size=base_path.shape), 134 | 'observations': base_path} for i in range(10)] 135 | task2 = [{'discounted_rewards': base_path**3 + np.random.normal(scale=2, size=base_path.shape), 136 | 'observations': base_path} for i in range(10)] 137 | 138 | for task in [task1, task2]: 139 | self.linear.fit(task, target_key='discounted_rewards') 140 | fit_error_pre = 0 141 | for path in task: 142 | fit_pred = self.linear.predict(path) 143 | fit_error_pre += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 144 | pkl = pickle.dumps(self.linear) 145 | self.linear = pickle.loads(pkl) 146 | fit_error_post = 0 147 | for path in task: 148 | fit_pred = self.linear.predict(path) 149 | fit_error_post += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 150 | self.assertEqual(fit_error_pre, fit_error_post) 151 | 152 | if __name__ == '__main__': 153 | unittest.main() 154 | -------------------------------------------------------------------------------- /tests/test_integration.py:
-------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.meta_algos.pro_mp import ProMP 3 | from meta_policy_search.samplers.meta_sampler import MetaSampler 4 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 5 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import unittest 10 | 11 | from gym.spaces import Box 12 | 13 | 14 | class MetaPointEnv(): 15 | 16 | def step(self, action): 17 | """ 18 | Run one timestep of the environment's dynamics. When end of episode 19 | is reached, reset() should be called to reset the environment's internal state. 20 | 21 | Args: 22 | action : an action provided by the agent 23 | Returns: 24 | (observation, reward, done, info) 25 | observation : agent's observation of the current environment 26 | reward [Float] : amount of reward due to the previous action 27 | done : a boolean, indicating whether the episode has ended 28 | info : a dictionary containing other diagnostic information from the previous action 29 | """ 30 | prev_state = self._state 31 | self._state = prev_state + np.clip(action, -0.1, 0.1) 32 | reward = self.reward(prev_state, action, self._state) 33 | done = self.done(self._state) 34 | next_observation = np.copy(self._state) 35 | return next_observation, reward, done, {} 36 | 37 | def reset(self): 38 | """ 39 | Resets the state of the environment, returning an initial observation. 40 | Outputs 41 | ------- 42 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 43 | """ 44 | self._state = np.random.uniform(-2, 2, size=(2,)) 45 | observation = np.copy(self._state) 46 | return observation 47 | 48 | @property 49 | def observation_space(self): 50 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 51 | 52 | @property 53 | def action_space(self): 54 | return Box(low=-0.1, high=0.1, shape=(2,)) 55 | 56 | def done(self, obs): 57 | if obs.ndim == 1: 58 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 59 | elif obs.ndim == 2: 60 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 61 | 62 | def reward(self, obs, act, obs_next): 63 | if obs_next.ndim == 1: 64 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 65 | elif obs_next.ndim == 2: 66 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 67 | 68 | def log_diagnostics(self, paths): 69 | pass 70 | 71 | def sample_tasks(self, n_tasks): 72 | return [{}] * n_tasks 73 | 74 | def set_task(self, task): 75 | pass 76 | 77 | class TestLikelihoodRatio(unittest.TestCase): 78 | """ 79 | Assert that the likelihood ratio at first gradient step is approx.
one since pi_old = pi_new 80 | """ 81 | 82 | def setUp(self): 83 | self.env = env = MetaPointEnv() 84 | 85 | self.baseline = baseline = LinearFeatureBaseline() 86 | 87 | self.policy = policy = MetaGaussianMLPPolicy( 88 | name="meta-policy", 89 | obs_dim=np.prod(env.observation_space.shape), 90 | action_dim=np.prod(env.action_space.shape), 91 | meta_batch_size=10, 92 | hidden_sizes=(16, 16), 93 | learn_std=True, 94 | hidden_nonlinearity=tf.tanh, 95 | output_nonlinearity=None, 96 | ) 97 | 98 | self.sampler = MetaSampler( 99 | env=env, 100 | policy=policy, 101 | rollouts_per_meta_task=2, 102 | meta_batch_size=10, 103 | max_path_length=50, 104 | parallel=False, 105 | ) 106 | 107 | self.sample_processor = MetaSampleProcessor( 108 | baseline=baseline, 109 | discount=0.99, 110 | gae_lambda=1.0, 111 | normalize_adv=True, 112 | positive_adv=False, 113 | ) 114 | 115 | self.algo = ProMP( 116 | policy=policy, 117 | inner_lr=0.1, 118 | meta_batch_size=10, 119 | num_inner_grad_steps=2, 120 | learning_rate=1e-3, 121 | num_ppo_steps=5, 122 | num_minibatches=1, 123 | clip_eps=0.5, 124 | target_inner_step=2e-2, 125 | init_inner_kl_penalty=1e-3, 126 | ) 127 | 128 | def test_likelihood_ratio(self): 129 | with tf.Session() as sess: 130 | 131 | # initialize uninitialized vars (only initialize vars that were not loaded) 132 | uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] 133 | sess.run(tf.variables_initializer(uninit_vars)) 134 | 135 | self.sampler.update_tasks() 136 | self.policy.switch_to_pre_update() # Switch to pre-update policy 137 | 138 | all_samples_data, all_paths = [], [] 139 | for step in range(1): 140 | 141 | """ -------------------- Sampling --------------------------""" 142 | paths = self.sampler.obtain_samples(log_prefix=str(step)) 143 | all_paths.append(paths) 144 | 145 | """ ----------------- Processing Samples ---------------------""" 146 | samples_data = self.sample_processor.process_samples(paths, log=False) 147 | all_samples_data.append(samples_data) 148 | 149 | """ ------------------- Inner Policy Update --------------------""" 150 | obs_phs, action_phs, adv_phs, dist_info_phs, all_phs = self.algo._make_input_placeholders('') 151 | 152 | for i in range(self.algo.meta_batch_size): 153 | obs = samples_data[i]['observations'] 154 | actions = samples_data[i]['actions'] 155 | agent_infos = samples_data[i]['agent_infos'] 156 | param_vals = self.policy.get_param_values() 157 | 158 | likelihood_ratio_sym = self.policy.likelihood_ratio_sym(obs_phs[i], action_phs[i], 159 | dist_info_phs[i], 160 | self.policy.policies_params_phs[i]) 161 | 162 | feed_dict_params = dict(zip(self.policy.policies_params_phs[i].values(), param_vals.values())) 163 | 164 | feed_dict_dist_infos = dict(zip(dist_info_phs[i].values(), agent_infos.values())) 165 | 166 | feed_dict = {obs_phs[i]: obs, 167 | action_phs[i]: actions 168 | } 169 | 170 | feed_dict.update(feed_dict_params) 171 | feed_dict.update(feed_dict_dist_infos) 172 | 173 | lr = sess.run(likelihood_ratio_sym, feed_dict=feed_dict) 174 | 175 | self.assertTrue(np.allclose(lr, 1)) 176 | -------------------------------------------------------------------------------- /tests/test_optimizers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer 4 | from collections import OrderedDict 5 | import tensorflow as tf 6 | 7 | 8 | def fc(x, scope, nh, *, 
init_scale=1.0, init_bias=0.0): 9 | with tf.variable_scope(scope): 10 | nin = x.get_shape()[1].value 11 | w = tf.get_variable("w", [nin, nh], initializer=tf.orthogonal_initializer(init_scale)) 12 | b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias)) 13 | return tf.matmul(x, w)+b 14 | 15 | 16 | class Mlp(object): 17 | def __init__(self, inputs, output_size, hidden_size=(32, 32), name='mlp'): 18 | activ = tf.tanh 19 | curr_output = inputs 20 | self.name = name 21 | with tf.variable_scope(self.name): 22 | for i, size in enumerate(hidden_size): 23 | curr_output = activ(fc(curr_output, str(i), nh=size, init_scale=np.sqrt(2))) 24 | self.output = fc(curr_output, 'y_pred', nh=output_size, init_scale=np.sqrt(2)) 25 | self.params = tf.trainable_variables(scope=self.name) 26 | 27 | def get_params(self): 28 | return self.params 29 | 30 | 31 | class CombinedMlp(object): 32 | def __init__(self, mlps): 33 | self.params = sum([mlp.params for mlp in mlps], []) 34 | self.output = [mlp.output for mlp in mlps] 35 | 36 | def get_params(self): 37 | return self.params 38 | 39 | 40 | class TestOptimizer(unittest.TestCase): #TODO add test for ConjugateGradientOptimizer 41 | 42 | def testSine(self): 43 | np.random.seed(65) 44 | for optimizer in [MAMLFirstOrderOptimizer()]: 45 | tf.reset_default_graph() 46 | with tf.Session(): 47 | input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 48 | target_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 49 | network = Mlp(input_phs, 1, hidden_size=(32,32), name='sin') 50 | loss = tf.reduce_mean(tf.square(network.output - target_phs)) 51 | input_ph_dict = OrderedDict({'x': input_phs, 'y': target_phs}) 52 | optimizer.build_graph(loss, network, input_ph_dict) 53 | sess = tf.get_default_session() 54 | sess.run(tf.global_variables_initializer()) 55 | 56 | for i in range(5000): 57 | xs = np.random.normal(0, 3, (1000, 1)) 58 | ys = np.sin(xs) 59 | inputs = {'x': xs, 'y': ys} 60 | optimizer.optimize(inputs) 61 | if i % 100 == 0: 62 | print(optimizer.loss(inputs)) 63 | 64 | xs = np.random.normal(0, 3, (100, 1)) 65 | ys = np.sin(xs) 66 | y_pred = sess.run(network.output, feed_dict=dict(list(zip(input_ph_dict.values(), (xs, ys))))) 67 | self.assertLessEqual(np.mean((ys-y_pred)**2), 0.02) 68 | 69 | def testGauss(self): 70 | np.random.seed(65) 71 | for optimizer in [MAMLFirstOrderOptimizer()]: 72 | tf.reset_default_graph() 73 | with tf.Session(): 74 | input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 100]) 75 | target_mean_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 76 | target_std_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 77 | 78 | mean_network = Mlp(input_phs, 1, hidden_size=(8,8), name='mean') 79 | std_network = Mlp(input_phs, 1, hidden_size=(8,8), name='std') 80 | 81 | target_std = tf.exp(target_std_ph) 82 | pred_std = tf.exp(std_network.output) 83 | 84 | numerator = tf.square(target_mean_ph - mean_network.output) + tf.square(target_std) - tf.square(pred_std) 85 | denominator = 2 * tf.square(pred_std) + 1e-8 86 | loss = tf.reduce_mean(tf.reduce_sum(numerator / denominator + std_network.output - target_std_ph, axis=-1)) 87 | 88 | joined_network = CombinedMlp([mean_network, std_network]) 89 | input_ph_dict = OrderedDict({'x': input_phs, 'y_mean': target_mean_ph, 'y_std': target_std_ph}) 90 | 91 | optimizer.build_graph(loss, joined_network, input_ph_dict) 92 | 93 | sess = tf.get_default_session() 94 | sess.run(tf.global_variables_initializer()) 95 | 96 | for i in range(2000): 97 | means = 
np.random.random(size=(1000)) 98 | stds = np.random.random(size=(1000)) 99 | inputs = np.vstack([np.random.normal(mean, np.exp(std), 100) for mean, std in zip(means, stds)]) 100 | all_inputs = {'x': inputs, 'y_mean': means.reshape(-1, 1), 'y_std': stds.reshape(-1, 1)} 101 | optimizer.optimize(all_inputs) 102 | if i % 100 == 0: 103 | print(optimizer.loss(all_inputs)) 104 | 105 | means = np.random.random(size=(20)) 106 | stds = np.random.random(size=(20)) 107 | 108 | inputs = np.stack([np.random.normal(mean, np.exp(std), 100) for mean, std in zip(means, stds)], axis=0) 109 | values_dict = OrderedDict({'x': inputs, 'y_mean': means.reshape(-1, 1), 'y_std': stds.reshape(-1, 1)}) 110 | 111 | mean_pred, std_pred = sess.run(joined_network.output, feed_dict=dict(list(zip(input_ph_dict.values(), 112 | values_dict.values())))) 113 | 114 | self.assertTrue(np.mean(np.square(mean_pred - means)) < 0.2) 115 | self.assertTrue(np.mean(np.square(std_pred - stds)) < 0.2) 116 | 117 | 118 | if __name__ == '__main__': 119 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policies.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from meta_policy_search.policies.gaussian_mlp_policy import GaussianMLPPolicy 3 | import numpy as np 4 | import tensorflow as tf 5 | import pickle 6 | import gym 7 | 8 | class DummySpace(object): 9 | def __init__(self, dim): 10 | self._dim = dim 11 | 12 | @property 13 | def shape(self): 14 | return self._dim 15 | 16 | class DummyEnv(object): 17 | def __init__(self, obs_dim, act_dim): 18 | self._observation_space = gym.spaces.Box(low=-np.ones(obs_dim), high=np.ones(obs_dim), dtype=np.float32) 19 | self._action_space = gym.spaces.Box(low=-np.ones(act_dim), high=np.ones(act_dim), dtype=np.float32) 20 | 21 | @property 22 | def observation_space(self): 23 | return self._observation_space 24 | 25 | @property 26 | def action_space(self): 27 | return self._action_space 28 | 29 | def get_obs(self, n=None): 30 | if n is None: 31 | return np.random.uniform(0, 1, size=self.observation_space.shape) 32 | else: 33 | return np.random.uniform(0, 1, size=(n,) + self.observation_space.shape) 34 | 35 | 36 | class TestPolicy(unittest.TestCase): 37 | 38 | def setUp(self): 39 | sess = tf.get_default_session() 40 | if sess is None: 41 | tf.InteractiveSession() 42 | 43 | def test_output_sym(self): 44 | with tf.Session() as sess: 45 | obs_dim = 23 46 | action_dim = 7 47 | self.env = DummyEnv(obs_dim, action_dim) 48 | self.policy = GaussianMLPPolicy(obs_dim, 49 | action_dim, 50 | name='test_policy_output_sym', 51 | hidden_sizes=(64, 64)) 52 | 53 | obs_ph_1 = tf.placeholder(dtype=tf.float32, name="obs_ph_1", 54 | shape=(None,) + self.env.observation_space.shape) 55 | output_sym_1 = self.policy.distribution_info_sym(obs_ph_1) 56 | 57 | sess.run(tf.global_variables_initializer()) 58 | 59 | n_obs = self.env.get_obs(n=100) 60 | action, agent_infos = self.policy.get_actions(n_obs) 61 | agent_infos_output_sym = sess.run(output_sym_1, feed_dict={obs_ph_1: n_obs}) 62 | 63 | for k in agent_infos.keys(): 64 | self.assertTrue(np.allclose(agent_infos[k], agent_infos_output_sym[k], rtol=1e-5, atol=1e-5)) 65 | 66 | def test_get_action(self): 67 | 68 | with tf.Session() as sess: 69 | obs_dim = 23 70 | action_dim = 7 71 | self.env = DummyEnv(obs_dim, action_dim) 72 | self.policy = GaussianMLPPolicy(obs_dim, 73 | action_dim, 74 | name='test_policy_get_action', 75 | hidden_sizes=(64, 64)) 76 | 77 | 
sess.run(tf.global_variables_initializer()) 78 | 79 | obs = self.env.get_obs() 80 | action, agent_infos = self.policy.get_action(obs) 81 | actions, agents_infos = self.policy.get_actions(np.expand_dims(obs, 0)) 82 | for k in agent_infos.keys(): 83 | self.assertTrue(np.allclose(agent_infos[k], agents_infos[k], rtol=1e-5, atol=1e-5)) 84 | 85 | def testSerialize1(self): 86 | obs_dim = 23 87 | action_dim = 7 88 | self.env = DummyEnv(obs_dim, action_dim) 89 | self.policy = GaussianMLPPolicy(obs_dim, 90 | action_dim, 91 | name='test_policy_serialize', 92 | hidden_sizes=(64, 64)) 93 | 94 | sess = tf.get_default_session() 95 | sess.run(tf.global_variables_initializer()) 96 | all_param_values = self.policy.get_param_values() 97 | 98 | self.policy.set_params(all_param_values) 99 | 100 | def testSerialize2(self): 101 | obs_dim = 2 102 | action_dim = 7 103 | env = DummyEnv(obs_dim, action_dim) 104 | policy = GaussianMLPPolicy(obs_dim, 105 | action_dim, 106 | name='test_policy_serialize2', 107 | hidden_sizes=(54, 23)) 108 | 109 | sess = tf.get_default_session() 110 | sess.run(tf.global_variables_initializer()) 111 | 112 | obs = env.get_obs() 113 | _, pre_agent_infos = policy.get_action(obs) 114 | pkl_str = pickle.dumps(policy) 115 | tf.reset_default_graph() 116 | with tf.Session() as sess: 117 | policy_unpickled = pickle.loads(pkl_str) 118 | _, post_agent_infos = policy_unpickled.get_action(obs) 119 | for key in pre_agent_infos.keys(): 120 | self.assertTrue(np.allclose(pre_agent_infos[key], post_agent_infos[key])) 121 | 122 | 123 | if __name__ == '__main__': 124 | unittest.main() 125 | --------------------------------------------------------------------------------
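
Usage note: every run script above follows the same pattern for configuration. When --config_file is passed, the JSON file is loaded with json.load and completely replaces the built-in default dictionary, so it must define every key the script reads; there is no merging with the defaults. The sketch below is a minimal, hypothetical launcher (not part of the repository) that writes such a config and invokes run_scripts/pro-mp_run_mujoco.py. It assumes the package and its MuJoCo dependencies are installed and that it is run from the repository root; the file name launch_promp_with_config.py, the output directory, and the overridden values of n_itr and meta_batch_size are illustrative only, and the JSON list for hidden_sizes is assumed to be an acceptable stand-in for the tuple used in the Python defaults.

# launch_promp_with_config.py -- hypothetical helper, not part of the repository
import json
import os
import subprocess

# Mirror the defaults of run_scripts/pro-mp_run_mujoco.py and override a few entries.
# The run script swaps its defaults for this dict wholesale, so every key must be present.
config = {
    'seed': 1,
    'baseline': 'LinearFeatureBaseline',   # resolved via globals() in the run script
    'env': 'HalfCheetahRandDirecEnv',      # only classes imported by the run script can be named here
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1,
    'normalize_adv': True,
    'hidden_sizes': [64, 64],              # JSON list in place of the (64, 64) tuple
    'learn_std': True,
    'inner_lr': 0.1,
    'learning_rate': 1e-3,
    'num_promp_steps': 5,
    'clip_eps': 0.3,
    'target_inner_step': 0.01,
    'init_inner_kl_penalty': 5e-4,
    'adaptive_inner_kl_penalty': False,
    'n_itr': 101,                          # illustrative: shorter run than the default 1001
    'meta_batch_size': 20,                 # illustrative override of the default 40
    'num_inner_grad_steps': 1,
}

dump_path = 'data/pro-mp/custom_run'       # hypothetical output directory
os.makedirs(dump_path, exist_ok=True)      # created up front so the run script can write params.json into it

config_path = os.path.join(dump_path, 'config.json')
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

# Equivalent to: python run_scripts/pro-mp_run_mujoco.py --config_file <config.json> --dump_path <dir>
subprocess.run(
    ['python', 'run_scripts/pro-mp_run_mujoco.py',
     '--config_file', config_path,
     '--dump_path', dump_path],
    check=True,
)

Because the scripts look up config['env'] and config['baseline'] through globals(), the same JSON cannot simply be pointed at a different run script unless that script imports the named classes; run_scripts/pro-mp_run_point_mass.py, for example, imports MetaPointEnvCorner instead, so 'env' would have to be changed accordingly.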