├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── autodoc_reqs.txt ├── data │ └── MAMLlogic.png ├── requirements.txt └── source │ ├── MAMLlogic.png │ ├── conf.py │ ├── index.rst │ └── modules │ ├── baselines │ └── baselines.rst │ ├── envs │ └── envs.rst │ ├── meta_algos │ └── meta_algos.rst │ ├── meta_policy_search.rst │ ├── optimizers │ └── optimizers.rst │ ├── policies │ ├── policies.distributions.rst │ └── policies.rst │ └── samplers │ └── samplers.rst ├── experiment_utils ├── config.py ├── experiment.py ├── run_sweep.py └── utils.py ├── meta_policy_search ├── __init__.py ├── baselines │ ├── __init__.py │ ├── base.py │ ├── linear_baseline.py │ └── zero_baseline.py ├── envs │ ├── __init__.py │ ├── base.py │ ├── mujoco_envs │ │ ├── ant_rand_direc.py │ │ ├── ant_rand_direc_2d.py │ │ ├── ant_rand_goal.py │ │ ├── half_cheetah_rand_direc.py │ │ ├── half_cheetah_rand_vel.py │ │ ├── humanoid_rand_direc.py │ │ ├── humanoid_rand_direc_2d.py │ │ ├── swimmer_rand_vel.py │ │ ├── walker2d_rand_direc.py │ │ └── walker2d_rand_vel.py │ ├── normalized_env.py │ ├── point_envs │ │ ├── corner_goals_point_env_2d.py │ │ ├── point_env_2d.py │ │ ├── point_env_2d_corner.py │ │ ├── point_env_2d_momentum.py │ │ ├── point_env_2d_v2.py │ │ └── point_env_2d_walls.py │ └── sawyer_envs │ │ ├── sawyer_door.py │ │ ├── sawyer_pick_and_place.py │ │ ├── sawyer_push.py │ │ └── sawyer_push_simple.py ├── meta_algos │ ├── __init__.py │ ├── base.py │ ├── dice_maml.py │ ├── pro_mp.py │ ├── trpo_maml.py │ ├── vpg_dice_maml.py │ └── vpg_maml.py ├── meta_trainer.py ├── optimizers │ ├── __init__.py │ ├── base.py │ ├── conjugate_gradient_optimizer.py │ └── maml_first_order_optimizer.py ├── policies │ ├── __init__.py │ ├── base.py │ ├── distributions │ │ ├── __init__.py │ │ ├── base.py │ │ └── diagonal_gaussian.py │ ├── gaussian_mlp_policy.py │ ├── meta_gaussian_mlp_policy.py │ └── networks │ │ ├── __init__.py │ │ └── mlp.py ├── samplers │ ├── __init__.py │ ├── base.py │ ├── dice_sample_processor.py │ ├── meta_sample_processor.py │ ├── meta_sampler.py │ ├── utils.py │ └── vectorized_env_executor.py └── utils │ ├── __init__.py │ ├── logger.py │ ├── serializable.py │ └── utils.py ├── requirements.txt ├── run_scripts ├── e-maml_run_mujoco.py ├── maml_run_mujoco.py ├── pro-mp_run_mujoco.py └── pro-mp_run_point_mass.py ├── setup.py └── tests ├── __init__.py ├── test_baselines.py ├── test_integration.py ├── test_optimizers.py ├── test_policies.py └── test_samplers.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pycharm 107 | .idea/ 108 | 109 | # sphinx doc 110 | /docs/builds 111 | 112 | # Data 113 | /data 114 | 115 | # mjkey 116 | /docker/mjkey.txt 117 | 118 | #env file 119 | .env 120 | 121 | # sandbox 122 | /sandbox 123 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "3.5" 5 | - "3.5-dev" # 3.5 development branch 6 | - "3.6" 7 | - "3.6-dev" # 3.6 development branch 8 | 9 | before_install: 10 | - sudo apt-get install -y libopenmpi-dev wget unzip 11 | #- sudo apt-get install -y curl git libgl1-mesa-dev libgl1-mesa-glx libglew-dev libosmesa6-dev 12 | #- wget https://www.roboti.us/download/mjpro150_linux.zip 13 | #- unzip mjpro150_linux.zip -d /home/travis/.mujoco 14 | #- echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/travis/.mujoco/mjpro150/bin" >> /home/travis/.bashrc 15 | #- source /home/travis/.bashrc 16 | 17 | # command to install dependencies 18 | install: 19 | - pip install . 20 | - pip install mpi4py click 21 | 22 | # command to run tests 23 | script: 24 | - python -m tests.test_baselines 25 | - python -m tests.test_optimizers 26 | - python -m tests.test_policies 27 | - python -m tests.test_samplers 28 | - python -m tests.test_integration 29 | 30 | notifications: 31 | email: 32 | recipients: 33 | - jonas.rothfuss@gmail.com 34 | on_success: never 35 | on_failure: always -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jonas Rothfuss, Ignasi Clavera, Dennis Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://api.travis-ci.com/jonasrothfuss/ProMP.svg?branch=master)](https://travis-ci.com/jonasrothfuss/ProMP) 2 | [![Docs](https://readthedocs.org/projects/promp/badge/?version=latest)](https://promp.readthedocs.io) 3 | 4 | # ProMP: Proximal Meta-Policy Search 5 | Implementations corresponding to ProMP ([Rothfuss et al., 2018](https://arxiv.org/abs/1810.06784)). 6 | Overall this repository consists of two branches: 7 | 8 | 1) master: lightweight branch that provides the necessary code to run Meta-RL algorithms such as ProMP, E-MAML, and MAML. This branch is meant to provide an easy start with Meta-RL and can be integrated into other projects and setups. 9 | 2) full-code: branch that provides the comprehensive code that was used to produce the experimental results in [Rothfuss et al. (2018)](https://arxiv.org/abs/1810.06784). 10 | This includes experiment scripts and plotting scripts that can be used to reproduce the experimental results in the paper. 11 | 12 | 13 | The code is written in Python 3 and builds on [Tensorflow](https://www.tensorflow.org/). 14 | Many of the provided reinforcement learning environments require the [Mujoco](http://www.mujoco.org/) physics engine. 15 | Overall the code was developed with modularity and computational efficiency in mind. 16 | Many components of the Meta-RL algorithm are parallelized using either [MPI](https://mpi4py.readthedocs.io/en/stable/) 17 | or [Tensorflow](https://www.tensorflow.org/) in order to ensure efficient use of all CPU cores. 18 | 19 | ## Documentation 20 | 21 | An API specification and explanation of the code components can be found [here](https://promp.readthedocs.io/en/latest/). 22 | The documentation can also be built locally by running the following commands: 23 | 24 | ``` 25 | # ensure that you are in the root folder of the project 26 | cd docs 27 | # install the sphinx documentation tool dependencies 28 | pip install -r requirements.txt 29 | # build the documentation 30 | make clean && make html 31 | # now the html documentation can be found under docs/build/html/index.html 32 | ``` 33 | 34 | ## Installation / Dependencies 35 | The provided code can either be run A) in the docker container provided by us or B) using python on 36 | your local machine. The latter requires multiple installation steps in order to set up the dependencies. 37 | 38 | ### A. Docker 39 | If not installed yet, [set up](https://docs.docker.com/install/) docker on your machine. 40 | Pull our docker container ``jonasrothfuss/promp`` from docker-hub: 41 | 42 | ``` 43 | docker pull jonasrothfuss/promp 44 | ``` 45 | 46 | All the necessary dependencies are already installed inside the docker container. 47 | 48 | ### B. Anaconda or Virtualenv 49 | 50 | ##### B.1.
Installing MPI 51 | Ensure that you have a working MPI implementation ([see here](https://mpi4py.readthedocs.io/en/stable/install.html) for more instructions). 52 | 53 | For Ubuntu you can install MPI through the package manager: 54 | 55 | ``` 56 | sudo apt-get install libopenmpi-dev 57 | ``` 58 | 59 | ##### B.2. Create either a venv or a conda environment and activate it 60 | 61 | ###### Virtualenv 62 | ``` 63 | pip install --upgrade virtualenv 64 | virtualenv <venv-name> 65 | source <venv-name>/bin/activate 66 | ``` 67 | 68 | ###### Anaconda 69 | If not done yet, install [anaconda](https://www.anaconda.com/) by following the instructions [here](https://www.anaconda.com/download/#linux). 70 | Then create an anaconda environment, activate it and install the requirements in [`requirements.txt`](requirements.txt). 71 | ``` 72 | conda create -n <env-name> python=3.6 73 | source activate <env-name> 74 | ``` 75 | 76 | ##### B.3. Install the required python dependencies 77 | ``` 78 | pip install -r requirements.txt 79 | ``` 80 | 81 | ##### B.4. Set up the Mujoco physics engine and mujoco-py 82 | For running the majority of the provided Meta-RL environments, the Mujoco physics engine as well as a 83 | corresponding python wrapper are required. 84 | For setting up [Mujoco](http://www.mujoco.org/) and [mujoco-py](https://github.com/openai/mujoco-py), 85 | please follow the instructions [here](https://github.com/openai/mujoco-py). 86 | 87 | 88 | 89 | ## Running ProMP 90 | In order to run the ProMP algorithm in the point environment (no Mujoco needed) with default configurations execute: 91 | ``` 92 | python run_scripts/pro-mp_run_point_mass.py 93 | ``` 94 | 95 | To run the ProMP algorithm in a Mujoco environment with default configurations: 96 | ``` 97 | python run_scripts/pro-mp_run_mujoco.py 98 | ``` 99 | 100 | The run configuration can be changed either in the run script directly or by providing a JSON configuration file with all 101 | the necessary hyperparameters. A JSON configuration file can be provided through the `--config_file` flag. Additionally, the dump path 102 | can be specified through the `--dump_path` flag: 103 | 104 | ``` 105 | python run_scripts/pro-mp_run.py --config_file <config_file_path> --dump_path <dump_path> 106 | ``` 107 | 108 | Additionally, in order to run the gradient-based meta-learning methods MAML and E-MAML ([Finn et al., 2017](https://arxiv.org/abs/1703.03400) and 109 | [Stadie et al., 2018](https://arxiv.org/abs/1803.01118)) in a Mujoco environment with the default configuration, 110 | execute, respectively: 111 | ``` 112 | python run_scripts/maml_run_mujoco.py 113 | python run_scripts/e-maml_run_mujoco.py 114 | ``` 115 | ## Cite 116 | To cite ProMP please use: 117 | ``` 118 | @article{rothfuss2018promp, 119 | title={ProMP: Proximal Meta-Policy Search}, 120 | author={Rothfuss, Jonas and Lee, Dennis and Clavera, Ignasi and Asfour, Tamim and Abbeel, Pieter}, 121 | journal={arXiv preprint arXiv:1810.06784}, 122 | year={2018} 123 | } 124 | ``` 125 | 126 | ## Acknowledgements 127 | This repository includes environments introduced in ([Duan et al., 2016](https://arxiv.org/abs/1611.02779), 128 | [Finn et al., 2017](https://arxiv.org/abs/1703.03400)). 129 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line.
5 | SPHINXOPTS = 6 | SPHINXBUILD = python3 -m sphinx 7 | SPHINXPROJ = maml-zoo 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/autodoc_reqs.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | tensorflow==1.8.0 -------------------------------------------------------------------------------- /docs/data/MAMLlogic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/docs/data/MAMLlogic.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | sphinx-rtd-theme 3 | sphinxcontrib-napoleon 4 | sphinxcontrib-websupport 5 | 6 | joblib==0.12.2 7 | PyPrind 8 | numpy 9 | scipy 10 | gym==0.10.5 11 | python_dateutil 12 | tensorflow -------------------------------------------------------------------------------- /docs/source/MAMLlogic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/docs/source/MAMLlogic.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | sys.path.insert(0, os.path.abspath('../../')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'meta_policy_search' 23 | copyright = '2018, Dennis Lee, Ignasi Clavera, Jonas Rothfuss' 24 | author = 'Dennis Lee, Ignasi Clavera, Jonas Rothfuss' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '' 30 | 31 | 32 | # -- General configuration --------------------------------------------------- 33 | 34 | # If your documentation needs a minimal Sphinx version, state it here. 35 | # 36 | # needs_sphinx = '1.0' 37 | 38 | # Add any Sphinx extension module names here, as strings. They can be 39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 40 | # ones. 
41 | extensions = [ 42 | 'sphinx.ext.autodoc', 43 | 'sphinx.ext.doctest', 44 | 'sphinx.ext.coverage', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.viewcode', 47 | # 'sphinx.ext.githubpages', 48 | 'sphinx.ext.napoleon' 49 | ] 50 | 51 | # Add any paths that contain templates here, relative to this directory. 52 | templates_path = ['.templates'] 53 | 54 | # The suffix(es) of source filenames. 55 | # You can specify multiple suffix as a list of string: 56 | # 57 | # source_suffix = ['.rst', '.md'] 58 | source_suffix = '.rst' 59 | 60 | # The master toctree document. 61 | master_doc = 'index' 62 | 63 | # The language for content autogenerated by Sphinx. Refer to documentation 64 | # for a list of supported languages. 65 | # 66 | # This is also used if you do content translation via gettext catalogs. 67 | # Usually you set "language" from the command line for these cases. 68 | language = None 69 | 70 | # List of patterns, relative to source directory, that match files and 71 | # directories to ignore when looking for source files. 72 | # This pattern also affects html_static_path and html_extra_path . 73 | exclude_patterns = [] 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | pygments_style = 'sphinx' 77 | 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. See the documentation for 82 | # a list of builtin themes. 83 | # 84 | html_theme = 'sphinx_rtd_theme' 85 | 86 | # Theme options are theme-specific and customize the look and feel of a theme 87 | # further. For a list of options available for each theme, see the 88 | # documentation. 89 | # 90 | # html_theme_options = {} 91 | 92 | # Add any paths that contain custom static files (such as style sheets) here, 93 | # relative to this directory. They are copied after the builtin static files, 94 | # so a file named "default.css" will overwrite the builtin "default.css". 95 | html_static_path = ['.static'] 96 | 97 | # Custom sidebar templates, must be a dictionary that maps document names 98 | # to template names. 99 | # 100 | # The default sidebars (for documents that don't match any pattern) are 101 | # defined by theme itself. Builtin themes are using these templates by 102 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 103 | # 'searchbox.html']``. 104 | # 105 | # html_sidebars = {} 106 | 107 | 108 | # -- Options for HTMLHelp output --------------------------------------------- 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'maml-zoodoc' 112 | 113 | 114 | # -- Options for LaTeX output ------------------------------------------------ 115 | 116 | latex_elements = { 117 | # The paper size ('letterpaper' or 'a4paper'). 118 | # 119 | # 'papersize': 'letterpaper', 120 | 121 | # The font size ('10pt', '11pt' or '12pt'). 122 | # 123 | # 'pointsize': '10pt', 124 | 125 | # Additional stuff for the LaTeX preamble. 126 | # 127 | # 'preamble': '', 128 | 129 | # Latex figure (float) alignment 130 | # 131 | # 'figure_align': 'htbp', 132 | } 133 | 134 | # Grouping the document tree into LaTeX files. List of tuples 135 | # (source start file, target name, title, 136 | # author, documentclass [howto, manual, or own class]). 
137 | latex_documents = [ 138 | (master_doc, 'meta_policy_search.tex', 'meta_policy_search Documentation', 139 | 'Dennis Lee, Ignasi Clavera, Jonas Rothfuss', 'manual'), 140 | ] 141 | 142 | 143 | # -- Options for manual page output ------------------------------------------ 144 | 145 | # One entry per manual page. List of tuples 146 | # (source start file, name, description, authors, manual section). 147 | man_pages = [ 148 | (master_doc, 'meta_policy_search', 'meta_policy_search Documentation', 149 | [author], 1) 150 | ] 151 | 152 | 153 | # -- Options for Texinfo output ---------------------------------------------- 154 | 155 | # Grouping the document tree into Texinfo files. List of tuples 156 | # (source start file, target name, title, author, 157 | # dir menu entry, description, category) 158 | texinfo_documents = [ 159 | (master_doc, 'meta_policy_search', 'meta_policy_search Documentation', 160 | author, 'meta_policy_search', 'One line description of project.', 161 | 'Miscellaneous'), 162 | ] 163 | 164 | 165 | # -- Options for Epub output ------------------------------------------------- 166 | 167 | # Bibliographic Dublin Core info. 168 | epub_title = project 169 | epub_author = author 170 | epub_publisher = author 171 | epub_copyright = copyright 172 | 173 | # The unique identifier of the text. This can be a ISBN number 174 | # or the project homepage. 175 | # 176 | # epub_identifier = '' 177 | 178 | # A unique identification for the text. 179 | # 180 | # epub_uid = '' 181 | 182 | # A list of files that should not be packed into the epub file. 183 | epub_exclude_files = ['search.html'] 184 | 185 | 186 | # -- Extension configuration ------------------------------------------------- 187 | 188 | autodoc_mock_imports = ["gym"] -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. maml-zoo documentation master file, created by 2 | sphinx-quickstart on Mon Aug 13 09:57:59 2018. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Meta-Policy Search's documentation! 7 | ============================================== 8 | 9 | Despite recent progress, deep reinforcement learning (RL) still relies heavily on hand-crafted features and reward functions 10 | as well as engineered, problem-specific inductive bias. Meta-RL aims to forego such reliance by acquiring inductive bias 11 | in a data-driven manner. A particular instance of meta-learning that has proven successful in RL is gradient-based meta-learning. 12 | 13 | The code repository provides implementations of various gradient-based Meta-RL methods, including: 14 | 15 | - ProMP: Proximal Meta-Policy Search (`Rothfuss et al., 2018`_) 16 | - MAML: Model Agnostic Meta-Learning (`Finn et al., 2017`_) 17 | - E-MAML: Exploration MAML (`Al-Shedivat et al., 2018`_, `Stadie et al., 2018`_) 18 | 19 | The code was written as part of ProMP_. Further information and experimental results can be found on our website_. 20 | This documentation specifies the API and interaction of the algorithm's components. Overall, one iteration of 21 | gradient-based Meta-RL consists of the following steps (see the sketch below): 22 | 23 | 1. Sample trajectories with the pre-update policy 24 | 2. Perform a gradient step for each task to obtain an updated/adapted policy 25 | 3. Sample trajectories with the updated/adapted policy 26 | 4. Perform a meta-policy optimization step, changing the pre-update policy parameters
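The following self-contained toy sketch mirrors these four steps with a 1-step REINFORCE inner update and
a first-order (FOMAML-style) outer update on a trivial 2D goal-reaching task. It only illustrates the loop
structure; the toy task, names and hyperparameters are made up, and it does not use the ``Trainer``, sampler
or policy classes of this repository.

.. code-block:: python

    import numpy as np

    def sample_tasks(n_tasks):
        # each task is a random 2D goal position
        return [np.random.uniform(-1.0, 1.0, size=2) for _ in range(n_tasks)]

    def sample_batch(theta, goal, n_samples=50, sigma=0.3):
        # toy "policy": action = theta + Gaussian noise, reward = -distance to the goal
        actions = theta + sigma * np.random.randn(n_samples, 2)
        rewards = -np.linalg.norm(actions - goal, axis=1)
        return actions, rewards

    def policy_gradient(theta, actions, rewards, sigma=0.3):
        # REINFORCE estimate of the gradient of the expected reward w.r.t. theta
        advantages = rewards - rewards.mean()
        grad_log_pi = (actions - theta) / sigma ** 2
        return (grad_log_pi * advantages[:, None]).mean(axis=0)

    theta = np.zeros(2)                                  # pre-update policy parameters
    inner_lr, outer_lr, meta_batch_size = 0.1, 0.05, 10

    for iteration in range(100):
        tasks = sample_tasks(meta_batch_size)
        outer_grads = []
        for goal in tasks:
            # 1. sample trajectories with the pre-update policy
            actions, rewards = sample_batch(theta, goal)
            # 2. perform a gradient step per task to obtain the adapted policy
            theta_adapted = theta + inner_lr * policy_gradient(theta, actions, rewards)
            # 3. sample trajectories with the adapted policy
            actions, rewards = sample_batch(theta_adapted, goal)
            # first-order approximation: reuse the post-update gradient for the meta-update
            outer_grads.append(policy_gradient(theta_adapted, actions, rewards))
        # 4. meta-policy optimization step on the pre-update parameters
        theta = theta + outer_lr * np.mean(outer_grads, axis=0)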
27 | 28 | This high-level structure of the algorithm is implemented in the Meta-Trainer class. The overall structure and interaction 29 | of the code components are depicted in the following figure: 30 | 31 | 32 | .. image:: MAMLlogic.png 33 | :width: 600 34 | 35 | .. _ProMP: https://arxiv.org/abs/1810.06784 36 | 37 | .. _Rothfuss et al., 2018: https://arxiv.org/abs/1810.06784 38 | 39 | .. _Finn et al., 2017: https://arxiv.org/abs/1703.03400 40 | 41 | .. _Stadie et al., 2018: https://arxiv.org/pdf/1803.01118.pdf 42 | 43 | .. _Al-Shedivat et al., 2018: https://arxiv.org/abs/1710.03641 44 | 45 | .. _website: https://sites.google.com/view/pro-mp/ 46 | 47 | .. toctree:: 48 | :maxdepth: 3 49 | :caption: Contents: 50 | 51 | modules/meta_policy_search.rst 52 | 53 | Indices and tables 54 | ================== 55 | 56 | * :ref:`genindex` 57 | * :ref:`modindex` 58 | -------------------------------------------------------------------------------- /docs/source/modules/baselines/baselines.rst: -------------------------------------------------------------------------------- 1 | Baselines 2 | =========================== 3 | 4 | .. automodule:: meta_policy_search.baselines 5 | 6 | Baseline (Interface) 7 | --------------------------- 8 | .. autoclass:: Baseline 9 | :members: 10 | 11 | Linear Feature Baseline 12 | --------------------------- 13 | .. autoclass:: LinearFeatureBaseline 14 | :members: 15 | :inherited-members: 16 | 17 | LinearTimeBaseline 18 | --------------------------- 19 | .. autoclass:: LinearTimeBaseline 20 | :members: 21 | :inherited-members: -------------------------------------------------------------------------------- /docs/source/modules/envs/envs.rst: -------------------------------------------------------------------------------- 1 | Environments 2 | ====================== 3 | 4 | .. automodule:: meta_policy_search.envs.base 5 | 6 | MetaEnv (Interface) 7 | -------------------------- 8 | 9 | .. autoclass:: MetaEnv 10 | :members: 11 | 12 | -------------------------------------------------------------------------------- /docs/source/modules/meta_algos/meta_algos.rst: -------------------------------------------------------------------------------- 1 | Meta-Algorithms 2 | ============================= 3 | 4 | .. automodule:: meta_policy_search.meta_algos 5 | 6 | MAML-Algorithm (Interface) 7 | ------------------------------ 8 | 9 | .. autoclass:: MAMLAlgo 10 | :members: 11 | :inherited-members: 12 | :show-inheritance: 13 | 14 | ProMP-Algorithm 15 | ------------------------------ 16 | 17 | .. autoclass:: ProMP 18 | :members: 19 | :show-inheritance: 20 | :inherited-members: 21 | 22 | TRPO-MAML-Algorithm 23 | ------------------------------ 24 | 25 | .. autoclass:: TRPOMAML 26 | :members: 27 | :show-inheritance: 28 | :inherited-members: 29 | 30 | VPG-MAML-Algorithm 31 | ------------------------------ 32 | 33 | .. autoclass:: VPGMAML 34 | :members: 35 | :show-inheritance: 36 | :inherited-members: -------------------------------------------------------------------------------- /docs/source/modules/meta_policy_search.rst: -------------------------------------------------------------------------------- 1 | Meta-Policy Search 2 | ================================= 3 | 4 | .. toctree:: 5 | 6 | baselines/baselines 7 | envs/envs 8 | meta_algos/meta_algos 9 | optimizers/optimizers 10 | policies/policies 11 | samplers/samplers 12 | 13 | 14 | Meta-Trainer 15 | ---------------------------------- 16 | 17 | ..
automodule:: meta_policy_search.meta_trainer 18 | 19 | .. autoclass:: Trainer 20 | :members: 21 | :inherited-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/modules/optimizers/optimizers.rst: -------------------------------------------------------------------------------- 1 | Optimizers 2 | ============================ 3 | 4 | .. automodule:: meta_policy_search.optimizers 5 | 6 | Conjugate Gradient Optimizer 7 | ----------------------------- 8 | .. autoclass:: ConjugateGradientOptimizer 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | MAML First Order Optimizer 14 | ----------------------------- 15 | .. autoclass:: MAMLFirstOrderOptimizer 16 | :members: 17 | :inherited-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/modules/policies/policies.distributions.rst: -------------------------------------------------------------------------------- 1 | maml\_zoo.policies.distributions package 2 | ======================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | maml\_zoo.policies.distributions.base module 8 | -------------------------------------------- 9 | 10 | .. automodule:: meta_policy_search.policies.distributions.base 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | maml\_zoo.policies.distributions.diagonal\_gaussian module 16 | ---------------------------------------------------------- 17 | 18 | .. automodule:: meta_policy_search.policies.distributions.diagonal_gaussian 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: meta_policy_search.policies.distributions 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /docs/source/modules/policies/policies.rst: -------------------------------------------------------------------------------- 1 | Policies 2 | ========================== 3 | 4 | Policy Interfaces 5 | -------------------------- 6 | .. automodule:: meta_policy_search.policies 7 | 8 | .. autoclass:: Policy 9 | :members: 10 | :inherited-members: 11 | :show-inheritance: 12 | 13 | .. autoclass:: MetaPolicy 14 | :members: 15 | :inherited-members: 16 | :show-inheritance: 17 | 18 | 19 | Gaussian-Policies 20 | -------------------------- 21 | 22 | .. autoclass:: GaussianMLPPolicy 23 | :members: 24 | :inherited-members: 25 | :show-inheritance: 26 | 27 | .. autoclass:: MetaGaussianMLPPolicy 28 | :members: 29 | :inherited-members: 30 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/modules/samplers/samplers.rst: -------------------------------------------------------------------------------- 1 | Samplers 2 | ========================== 3 | 4 | .. automodule:: meta_policy_search.samplers 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Sampler 10 | ------------------------- 11 | 12 | .. autoclass:: Sampler 13 | :members: 14 | :inherited-members: 15 | :show-inheritance: 16 | 17 | .. autoclass:: MetaSampler 18 | :members: 19 | :inherited-members: 20 | :show-inheritance: 21 | 22 | Sample Processor 23 | ------------------------- 24 | 25 | .. autoclass:: SampleProcessor 26 | :members: 27 | :inherited-members: 28 | :show-inheritance: 29 | 30 | .. 
autoclass:: DiceSampleProcessor 31 | :members: 32 | :inherited-members: 33 | :show-inheritance: 34 | 35 | .. autoclass:: MetaSampleProcessor 36 | :members: 37 | :inherited-members: 38 | :show-inheritance: 39 | 40 | Vectorized Environment Executor 41 | ------------------------------- 42 | 43 | .. automodule:: meta_policy_search.samplers.vectorized_env_executor 44 | 45 | .. autoclass:: MetaIterativeEnvExecutor 46 | :members: 47 | :inherited-members: 48 | :show-inheritance: 49 | 50 | .. autoclass:: MetaParallelEnvExecutor 51 | :members: 52 | :inherited-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /experiment_utils/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 4 | 5 | DOCKER_MOUNT_DIR = '/root/code/data' 6 | 7 | DATA_DIR = os.path.join(BASE_DIR, 'data') 8 | 9 | DOCKER_IMAGE = 'dennisl88/maml_zoo' 10 | 11 | S3_BUCKET_NAME = 'maml-zoo-experiments' -------------------------------------------------------------------------------- /experiment_utils/experiment.py: -------------------------------------------------------------------------------- 1 | # Copied from doodad/run_experiment_lite_doodad.py 2 | import os 3 | import pickle 4 | import base64 5 | import argparse 6 | 7 | ARGS_DATA = 'DOODAD_ARGS_DATA' 8 | USE_CLOUDPICKLE = 'DOODAD_USE_CLOUDPICKLE' 9 | CLOUDPICKLE_VERSION = 'DOODAD_CLOUDPICKLE_VERSION' 10 | 11 | __ARGS = None 12 | def __get_arg_config(): 13 | """ 14 | global __ARGS 15 | if __ARGS is not None: 16 | return __ARGS 17 | #TODO: use environment variables rather than command-line arguments 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument('--use_cloudpickle', type=bool, default=False) 20 | parser.add_argument('--'+ARGS_DATA, type=str, default='') 21 | parser.add_argument('--output_dir', type=str, default='/tmp/expt/') 22 | args = parser.parse_args() 23 | __ARGS = args 24 | """ 25 | args_data = os.environ.get(ARGS_DATA, {}) 26 | cloudpickle_version = os.environ.get(CLOUDPICKLE_VERSION, 'n/a') 27 | use_cloudpickle = bool(int(os.environ.get(USE_CLOUDPICKLE, '0'))) 28 | 29 | args = lambda : None # hack - use function as namespace 30 | args.args_data = args_data 31 | args.use_cloudpickle = use_cloudpickle 32 | args.cloudpickle_version = cloudpickle_version 33 | return args 34 | 35 | def get_args(key=None, default=None): 36 | args = __get_arg_config() 37 | 38 | if args.args_data: 39 | if args.use_cloudpickle: 40 | import cloudpickle 41 | assert args.cloudpickle_version == cloudpickle.__version__, "Cloudpickle versions do not match! 
(host) %s vs (remote) %s" % (args.cloudpickle_version, cloudpickle.__version__) 42 | data = cloudpickle.loads(base64.b64decode(args.args_data)) 43 | else: 44 | data = pickle.loads(base64.b64decode(args.args_data)) 45 | else: 46 | data = {} 47 | 48 | if key is not None: 49 | return data.get(key, default) 50 | return data 51 | 52 | def encode_args(call_args, cloudpickle=False): 53 | """ 54 | Encode call_args dictionary as a base64 string 55 | """ 56 | assert isinstance(call_args, dict) 57 | 58 | if cloudpickle: 59 | import cloudpickle 60 | cpickle_version = cloudpickle.__version__ 61 | data = base64.b64encode(cloudpickle.dumps(call_args)).decode("utf-8") 62 | else: 63 | data = base64.b64encode(pickle.dumps(call_args)).decode("utf-8") 64 | cpickle_version = 'n/a' 65 | return data, cpickle_version 66 | 67 | # These are arguments passed in from launch_python 68 | args_dict = get_args() 69 | print('My args are:', args_dict) 70 | 71 | -------------------------------------------------------------------------------- /experiment_utils/run_sweep.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import itertools 5 | 6 | from experiment_utils import config 7 | from experiment_utils.utils import query_yes_no 8 | 9 | import doodad as dd 10 | import doodad.mount as mount 11 | import doodad.easy_sweep.launcher as launcher 12 | from doodad.easy_sweep.hyper_sweep import run_sweep_doodad 13 | 14 | def run_sweep(run_experiment, sweep_params, exp_name, instance_type='c4.xlarge'): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--mode', type=str, default='local', 17 | help='Mode for running the experiments - local: runs on local machine, ' 18 | 'ec2: runs on AWS ec2 cluster (requires a proper configuration file)') 19 | 20 | args = parser.parse_args(sys.argv[1:]) 21 | 22 | local_mount = mount.MountLocal(local_dir=config.BASE_DIR, pythonpath=True) 23 | 24 | docker_mount_point = os.path.join(config.DOCKER_MOUNT_DIR, exp_name) 25 | 26 | sweeper = launcher.DoodadSweeper([local_mount], docker_img=config.DOCKER_IMAGE, docker_output_dir=docker_mount_point, 27 | local_output_dir=os.path.join(config.DATA_DIR, 'local', exp_name)) 28 | sweeper.mount_out_s3 = mount.MountS3(s3_path='', mount_point=docker_mount_point, output=True) 29 | 30 | if args.mode == 'ec2': 31 | print("\n" + "**********" * 10 + "\nexp_prefix: {}\nvariants: {}".format(exp_name, len(list(itertools.product(*[value for value in sweep_params.values()]))))) 32 | 33 | if query_yes_no("Continue?"): 34 | sweeper.run_sweep_ec2(run_experiment, sweep_params, bucket_name=config.S3_BUCKET_NAME, instance_type=instance_type, 35 | region='us-west-1', s3_log_name=exp_name, add_date_to_logname=False) 36 | 37 | elif args.mode == 'local_docker': 38 | mode_docker = dd.mode.LocalDocker( 39 | image=sweeper.image, 40 | ) 41 | run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_docker, 42 | mounts=sweeper.mounts) 43 | 44 | elif args.mode == 'local': 45 | sweeper.run_sweep_serial(run_experiment, sweep_params) 46 | 47 | elif args.mode == 'local_singularity': 48 | mode_singularity = dd.mode.LocalSingularity( 49 | image='~/meta_policy_search.simg') 50 | run_sweep_doodad(run_experiment, sweep_params, run_mode=mode_singularity, 51 | mounts=sweeper.mounts) 52 | else: 53 | raise NotImplementedError -------------------------------------------------------------------------------- /experiment_utils/utils.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | 3 | def query_yes_no(question, default="no", allow_skip=False): 4 | """Ask a yes/no question via raw_input() and return their answer. 5 | 6 | "question" is a string that is presented to the user. 7 | "default" is the presumed answer if the user just hits . 8 | It must be "yes" (the default), "no" or None (meaning 9 | an answer is required of the user). 10 | 11 | The "answer" return value is True for "yes" or False for "no". 12 | """ 13 | valid = {"yes": True, "y": True, "ye": True, 14 | "no": False, "n": False} 15 | if allow_skip: 16 | valid["skip"] = "skip" 17 | if default is None: 18 | prompt = " [y/n] " 19 | elif default == "yes": 20 | prompt = " [Y/n] " 21 | elif default == "no": 22 | prompt = " [y/N] " 23 | else: 24 | raise ValueError("invalid default answer: '%s'" % default) 25 | if allow_skip: 26 | prompt += " or skip" 27 | while True: 28 | sys.stdout.write(question + prompt) 29 | choice = input().lower() 30 | if default is not None and choice == '': 31 | return valid[default] 32 | elif choice in valid: 33 | return valid[choice] 34 | else: 35 | sys.stdout.write("Please respond with 'yes' or 'no' " 36 | "(or 'y' or 'n').\n") -------------------------------------------------------------------------------- /meta_policy_search/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class MyTestCase(unittest.TestCase): 5 | def test_something(self): 6 | self.assertEqual(True, False) 7 | 8 | 9 | if __name__ == '__main__': 10 | unittest.main() -------------------------------------------------------------------------------- /meta_policy_search/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 3 | from meta_policy_search.baselines.linear_baseline import LinearTimeBaseline -------------------------------------------------------------------------------- /meta_policy_search/baselines/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Baseline(): 5 | """ 6 | Reward baseline interface 7 | """ 8 | 9 | def get_param_values(self): 10 | """ 11 | Returns the parameter values of the baseline object 12 | 13 | """ 14 | raise NotImplementedError 15 | 16 | def set_params(self, value): 17 | """ 18 | Sets the parameter values of the baseline object 19 | 20 | Args: 21 | value: parameter value to be set 22 | 23 | """ 24 | raise NotImplementedError 25 | 26 | def fit(self, paths): 27 | """ 28 | Fits the baseline model with the provided paths 29 | 30 | Args: 31 | paths: list of paths 32 | 33 | """ 34 | raise NotImplementedError 35 | 36 | def predict(self, path): 37 | """ 38 | Predicts the reward baselines for a provided trajectory / path 39 | 40 | Args: 41 | path: dict of lists/numpy array containing trajectory / path information 42 | such as "observations", "rewards", ... 
43 | 44 | Returns: numpy array of the same length as paths["observations"] specifying the reward baseline 45 | 46 | """ 47 | raise NotImplementedError 48 | 49 | def log_diagnostics(self, paths, prefix): 50 | """ 51 | Log extra information per iteration based on the collected paths 52 | """ 53 | pass -------------------------------------------------------------------------------- /meta_policy_search/baselines/linear_baseline.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | from meta_policy_search.utils.serializable import Serializable 3 | import numpy as np 4 | 5 | 6 | class LinearBaseline(Baseline): 7 | """ 8 | Abstract class providing the functionality for fitting a linear baseline 9 | Don't instantiate this class. Instead use LinearFeatureBaseline or LinearTimeBaseline 10 | """ 11 | 12 | def __init__(self, reg_coeff=1e-5): 13 | super(LinearBaseline, self).__init__() 14 | self._coeffs = None 15 | self._reg_coeff = reg_coeff 16 | 17 | def predict(self, path): 18 | """ 19 | Abstract Class for the LinearFeatureBaseline and the LinearTimeBaseline 20 | Predicts the linear reward baselines estimates for a provided trajectory / path. 21 | If the baseline is not fitted - returns zero baseline 22 | 23 | Args: 24 | path (dict): dict of lists/numpy array containing trajectory / path information 25 | such as "observations", "rewards", ... 26 | 27 | Returns: 28 | (np.ndarray): numpy array of the same length as paths["observations"] specifying the reward baseline 29 | 30 | """ 31 | if self._coeffs is None: 32 | return np.zeros(len(path["observations"])) 33 | return self._features(path).dot(self._coeffs) 34 | 35 | def get_param_values(self, **tags): 36 | """ 37 | Returns the parameter values of the baseline object 38 | 39 | Returns: 40 | numpy array of linear_regression coefficients 41 | 42 | """ 43 | return self._coeffs 44 | 45 | def set_params(self, value, **tags): 46 | """ 47 | Sets the parameter values of the baseline object 48 | 49 | Args: 50 | value: numpy array of linear_regression coefficients 51 | 52 | """ 53 | self._coeffs = value 54 | 55 | def fit(self, paths, target_key='returns'): 56 | """ 57 | Fits the linear baseline model with the provided paths via damped least squares 58 | 59 | Args: 60 | paths (list): list of paths 61 | target_key (str): path dictionary key of the target that shall be fitted (e.g. "returns") 62 | 63 | """ 64 | assert all([target_key in path.keys() for path in paths]) 65 | 66 | featmat = np.concatenate([self._features(path) for path in paths], axis=0) 67 | target = np.concatenate([path[target_key] for path in paths], axis=0) 68 | reg_coeff = self._reg_coeff 69 | for _ in range(5): 70 | self._coeffs = np.linalg.lstsq( 71 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 72 | featmat.T.dot(target), 73 | rcond=-1 74 | )[0] 75 | if not np.any(np.isnan(self._coeffs)): 76 | break 77 | reg_coeff *= 10 78 | 79 | def _features(self, path): 80 | raise NotImplementedError("this is an abstract class, use either LinearFeatureBaseline or LinearTimeBaseline") 81 | 82 | 83 | class LinearFeatureBaseline(LinearBaseline): 84 | """ 85 | Linear (polynomial) time-state dependent return baseline model 86 | (see. Duan et al. 
2016, "Benchmarking Deep Reinforcement Learning for Continuous Control", ICML) 87 | 88 | Fits the following linear model 89 | 90 | reward = b0 + b1*obs + b2*obs^2 + b3*t + b4*t^2+ b5*t^3 91 | 92 | Args: 93 | reg_coeff: list of paths 94 | 95 | """ 96 | def __init__(self, reg_coeff=1e-5): 97 | super(LinearFeatureBaseline, self).__init__() 98 | self._coeffs = None 99 | self._reg_coeff = reg_coeff 100 | 101 | def _features(self, path): 102 | obs = np.clip(path["observations"], -10, 10) 103 | path_length = len(path["observations"]) 104 | time_step = np.arange(path_length).reshape(-1, 1) / 100.0 105 | return np.concatenate([obs, obs ** 2, time_step, time_step ** 2, time_step ** 3, np.ones((path_length, 1))], 106 | axis=1) 107 | 108 | 109 | class LinearTimeBaseline(LinearBaseline): 110 | """ 111 | Linear (polynomial) time-dependent reward baseline model 112 | 113 | Fits the following linear model 114 | 115 | reward = b0 + b3*t + b4*t^2+ b5*t^3 116 | 117 | Args: 118 | reg_coeff: list of paths 119 | 120 | """ 121 | 122 | def _features(self, path): 123 | path_length = len(path["observations"]) 124 | time_step = np.arange(path_length).reshape(-1, 1) / 100.0 125 | return np.concatenate([time_step, time_step ** 2, time_step ** 3, np.ones((path_length, 1))], 126 | axis=1) 127 | 128 | -------------------------------------------------------------------------------- /meta_policy_search/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.base import Baseline 2 | import numpy as np 3 | 4 | 5 | class ZeroBaseline(Baseline): 6 | """ 7 | Dummy baseline 8 | """ 9 | 10 | def __init__(self): 11 | super(ZeroBaseline, self).__init__() 12 | 13 | def get_param_values(self, **kwargs): 14 | """ 15 | Returns the parameter values of the baseline object 16 | 17 | Returns: 18 | (None): coefficients of the baseline 19 | 20 | """ 21 | return None 22 | 23 | def set_param_values(self, value, **kwargs): 24 | """ 25 | Sets the parameter values of the baseline object 26 | 27 | Args: 28 | value (None): coefficients of the baseline 29 | 30 | """ 31 | pass 32 | 33 | def fit(self, paths, **kwargs): 34 | """ 35 | Improves the quality of zeroes output by baseline 36 | 37 | Args: 38 | paths: list of paths 39 | 40 | """ 41 | pass 42 | 43 | def predict(self, path): 44 | """ 45 | Produces some zeroes 46 | 47 | Args: 48 | path (dict): dict of lists/numpy array containing trajectory / path information 49 | such as "observations", "rewards", ... 
50 | 51 | Returns: 52 | (np.ndarray): numpy array of the same length as paths["observations"] specifying the reward baseline 53 | 54 | """ 55 | return np.zeros_like(path["rewards"]) -------------------------------------------------------------------------------- /meta_policy_search/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv -------------------------------------------------------------------------------- /meta_policy_search/envs/base.py: -------------------------------------------------------------------------------- 1 | from gym.core import Env 2 | from gym.envs.mujoco import MujocoEnv 3 | import numpy as np 4 | 5 | 6 | class MetaEnv(Env): 7 | """ 8 | Wrapper around OpenAI gym environments, interface for meta learning 9 | """ 10 | 11 | def sample_tasks(self, n_tasks): 12 | """ 13 | Samples task of the meta-environment 14 | 15 | Args: 16 | n_tasks (int) : number of different meta-tasks needed 17 | 18 | Returns: 19 | tasks (list) : an (n_tasks) length list of tasks 20 | """ 21 | raise NotImplementedError 22 | 23 | def set_task(self, task): 24 | """ 25 | Sets the specified task to the current environment 26 | 27 | Args: 28 | task: task of the meta-learning environment 29 | """ 30 | raise NotImplementedError 31 | 32 | def get_task(self): 33 | """ 34 | Gets the task that the agent is performing in the current environment 35 | 36 | Returns: 37 | task: task of the meta-learning environment 38 | """ 39 | raise NotImplementedError 40 | 41 | def log_diagnostics(self, paths, prefix): 42 | """ 43 | Logs env-specific diagnostic information 44 | 45 | Args: 46 | paths (list) : list of all paths collected with this env during this iteration 47 | prefix (str) : prefix for logger 48 | """ 49 | pass 50 | 51 | class RandomEnv(MetaEnv, MujocoEnv): 52 | """ 53 | This class provides functionality for randomizing the physical parameters of a mujoco model 54 | The following parameters are changed: 55 | - body_mass 56 | - body_inertia 57 | - damping coeff at the joints 58 | """ 59 | RAND_PARAMS = ['body_mass', 'dof_damping', 'body_inertia', 'geom_friction'] 60 | RAND_PARAMS_EXTENDED = RAND_PARAMS + ['geom_size'] 61 | 62 | def __init__(self, log_scale_limit, *args, rand_params=RAND_PARAMS, **kwargs): 63 | super(RandomEnv, self).__init__(*args, **kwargs) 64 | assert set(rand_params) <= set(self.RAND_PARAMS_EXTENDED), \ 65 | "rand_params must be a subset of " + str(self.RAND_PARAMS_EXTENDED) 66 | self.log_scale_limit = log_scale_limit 67 | self.rand_params = rand_params 68 | self.save_parameters() 69 | 70 | def sample_tasks(self, n_tasks): 71 | """ 72 | Generates randomized parameter sets for the mujoco env 73 | 74 | Args: 75 | n_tasks (int) : number of different meta-tasks needed 76 | 77 | Returns: 78 | tasks (list) : an (n_tasks) length list of tasks 79 | """ 80 | param_sets = [] 81 | 82 | for _ in range(n_tasks): 83 | # body mass -> one multiplier for all body parts 84 | 85 | new_params = {} 86 | 87 | if 'body_mass' in self.rand_params: 88 | body_mass_multiplyers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.body_mass.shape) 89 | new_params['body_mass'] = self.init_params['body_mass'] * body_mass_multiplyers 90 | 91 | # body_inertia 92 | if 'body_inertia' in self.rand_params: 93 | body_inertia_multiplyers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.body_inertia.shape) 94 | new_params['body_inertia'] = 
body_inertia_multiplyers * self.init_params['body_inertia'] 95 | 96 | # damping -> different multiplier for different dofs/joints 97 | if 'dof_damping' in self.rand_params: 98 | dof_damping_multipliers = np.array(1.3) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.dof_damping.shape) 99 | new_params['dof_damping'] = np.multiply(self.init_params['dof_damping'], dof_damping_multipliers) 100 | 101 | # friction at the body components 102 | if 'geom_friction' in self.rand_params: 103 | dof_damping_multipliers = np.array(1.5) ** np.random.uniform(-self.log_scale_limit, self.log_scale_limit, size=self.model.geom_friction.shape) 104 | new_params['geom_friction'] = np.multiply(self.init_params['geom_friction'], dof_damping_multipliers) 105 | 106 | param_sets.append(new_params) 107 | 108 | return param_sets 109 | 110 | def set_task(self, task): 111 | for param, param_val in task.items(): 112 | param_variable = getattr(self.model, param) 113 | assert param_variable.shape == param_val.shape, 'shapes of new parameter value and old one must match' 114 | setattr(self.model, param, param_val) 115 | self.cur_params = task 116 | 117 | def get_task(self): 118 | return self.cur_params 119 | 120 | def save_parameters(self): 121 | self.init_params = {} 122 | if 'body_mass' in self.rand_params: 123 | self.init_params['body_mass'] = self.model.body_mass 124 | 125 | # body_inertia 126 | if 'body_inertia' in self.rand_params: 127 | self.init_params['body_inertia'] = self.model.body_inertia 128 | 129 | # damping -> different multiplier for different dofs/joints 130 | if 'dof_damping' in self.rand_params: 131 | self.init_params['dof_damping'] = self.model.dof_damping 132 | 133 | # friction at the body components 134 | if 'geom_friction' in self.rand_params: 135 | self.init_params['geom_friction'] = self.model.geom_friction 136 | self.cur_params = self.init_params -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | from meta_policy_search.utils import logger 5 | import gym 6 | 7 | 8 | class AntRandDirecEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self, goal_direction=None): 10 | self.goal_direction = goal_direction if goal_direction else 1.0 11 | MujocoEnv.__init__(self, 'ant.xml', 5) 12 | gym.utils.EzPickle.__init__(self) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_direction = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_direction 31 | 32 | def step(self, a): 33 | xposbefore = self.get_body_com("torso")[0] 34 | self.do_simulation(a, self.frame_skip) 35 | xposafter = self.get_body_com("torso")[0] 36 | forward_reward = self.goal_direction * (xposafter - xposbefore)/self.dt 37 | ctrl_cost = .5 * np.square(a).sum() 38 | contact_cost = 0.5 * 1e-3 * np.sum( 39 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 40 | survive_reward = 1.0 41 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 42 | state = 
self.state_vector() 43 | notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 44 | done = not notdone 45 | ob = self._get_obs() 46 | return ob, reward, done, dict( 47 | reward_forward=forward_reward, 48 | reward_ctrl=-ctrl_cost, 49 | reward_contact=-contact_cost, 50 | reward_survive=survive_reward) 51 | 52 | def _get_obs(self): 53 | return np.concatenate([ 54 | self.sim.data.qpos.flat[2:], 55 | self.sim.data.qvel.flat, 56 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 57 | ]) 58 | 59 | def reset_model(self): 60 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 61 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 62 | self.set_state(qpos, qvel) 63 | return self._get_obs() 64 | 65 | def viewer_setup(self): 66 | self.viewer.cam.distance = self.model.stat.extent * 0.5 67 | 68 | def log_diagnostics(self, paths, prefix=''): 69 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 70 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 71 | 72 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 73 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 74 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 75 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 76 | 77 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_direc_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | from meta_policy_search.utils import logger 5 | import gym 6 | 7 | 8 | class AntRandDirec2DEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self): 10 | self.set_task(self.sample_tasks(1)[0]) 11 | MujocoEnv.__init__(self, 'ant.xml', 5) 12 | gym.utils.EzPickle.__init__(self) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | directions = np.random.normal(size=(n_tasks, 2)) 17 | directions /= np.linalg.norm(directions, axis=1)[..., np.newaxis] 18 | return directions 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | self.goal_direction = task 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.goal_direction 33 | 34 | def step(self, a): 35 | posbefore = np.copy(self.get_body_com("torso")[:2]) 36 | self.do_simulation(a, self.frame_skip) 37 | posafter = self.get_body_com("torso")[:2] 38 | forward_reward = np.sum(self.goal_direction * (posafter - posbefore))/self.dt 39 | ctrl_cost = .5 * np.square(a).sum() 40 | contact_cost = 0.5 * 1e-3 * np.sum( 41 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 42 | survive_reward = 1.0 43 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 44 | state = self.state_vector() 45 | notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
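        # descriptive note: the episode terminates once the torso height (state[2]) leaves the [0, 1] range or the state becomes non-finite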
46 | done = not notdone 47 | ob = self._get_obs() 48 | return ob, reward, done, dict( 49 | reward_forward=forward_reward, 50 | reward_ctrl=-ctrl_cost, 51 | reward_contact=-contact_cost, 52 | reward_survive=survive_reward) 53 | 54 | def _get_obs(self): 55 | return np.concatenate([ 56 | self.sim.data.qpos.flat[2:], 57 | self.sim.data.qvel.flat, 58 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 59 | ]) 60 | 61 | def reset_model(self): 62 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 63 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 64 | self.set_state(qpos, qvel) 65 | return self._get_obs() 66 | 67 | def viewer_setup(self): 68 | self.viewer.cam.distance = self.model.stat.extent * 0.5 69 | 70 | def log_diagnostics(self, paths, prefix=''): 71 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 72 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 73 | 74 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 75 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 76 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 77 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 78 | 79 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) 80 | 81 | 82 | if __name__ == "__main__": 83 | env = AntRandDirec2DEnv() 84 | while True: 85 | task = env.sample_tasks(1)[0] 86 | env.set_task(task) 87 | env.reset() 88 | for _ in range(100): 89 | env.render() 90 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/ant_rand_goal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class AntRandGoalEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'ant.xml', 5) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | a = np.random.random(n_tasks) * 2 * np.pi 15 | r = 3 * np.random.random(n_tasks) ** 0.5 16 | return np.stack((r * np.cos(a), r * np.sin(a)), axis=-1) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_pos = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_pos 31 | 32 | def step(self, a): 33 | self.do_simulation(a, self.frame_skip) 34 | xposafter = self.get_body_com("torso") 35 | goal_reward = -np.sum(np.abs(xposafter[:2] - self.goal_pos)) # make it happy, not suicidal 36 | ctrl_cost = .1 * np.square(a).sum() 37 | contact_cost = 0.5 * 1e-3 * np.sum(np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 38 | # survive_reward = 1.0 39 | survive_reward = 0.0 40 | reward = goal_reward - ctrl_cost - contact_cost + survive_reward 41 | state = self.state_vector() 42 | # notdone = np.isfinite(state).all() and 1.0 >= state[2] >= 0. 
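        # descriptive note: the usual Ant alive-check is commented out here; the goal-reaching task never terminates early and episodes run for the full horizon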
43 | # done = not notdone 44 | done = False 45 | ob = self._get_obs() 46 | return ob, reward, done, dict( 47 | reward_forward=goal_reward, 48 | reward_ctrl=-ctrl_cost, 49 | reward_contact=-contact_cost, 50 | reward_survive=survive_reward) 51 | 52 | def _get_obs(self): 53 | return np.concatenate([ 54 | self.sim.data.qpos.flat, 55 | self.sim.data.qvel.flat, 56 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 57 | ]) 58 | 59 | def reset_model(self): 60 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 61 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 62 | self.set_state(qpos, qvel) 63 | return self._get_obs() 64 | 65 | def viewer_setup(self): 66 | self.viewer.cam.distance = self.model.stat.extent * 0.5 67 | 68 | def log_diagnostics(self, paths, prefix=''): 69 | progs = [np.mean(path["env_infos"]["reward_forward"]) for path in paths] 70 | ctrl_cost = [-np.mean(path["env_infos"]["reward_ctrl"]) for path in paths] 71 | 72 | logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs)) 73 | logger.logkv(prefix + 'MaxForwardReturn', np.max(progs)) 74 | logger.logkv(prefix + 'MinForwardReturn', np.min(progs)) 75 | logger.logkv(prefix + 'StdForwardReturn', np.std(progs)) 76 | 77 | logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost)) 78 | 79 | 80 | if __name__ == "__main__": 81 | env = AntRandGoalEnv() 82 | while True: 83 | env.reset() 84 | for _ in range(100): 85 | env.render() 86 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/half_cheetah_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | 8 | class HalfCheetahRandDirecEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 9 | def __init__(self, goal_direction=None): 10 | self.goal_direction = goal_direction if goal_direction else 1.0 11 | MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 12 | gym.utils.EzPickle.__init__(self, goal_direction) 13 | 14 | def sample_tasks(self, n_tasks): 15 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 16 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 17 | 18 | def set_task(self, task): 19 | """ 20 | Args: 21 | task: task of the meta-learning environment 22 | """ 23 | self.goal_direction = task 24 | 25 | def get_task(self): 26 | """ 27 | Returns: 28 | task: task of the meta-learning environment 29 | """ 30 | return self.goal_direction 31 | 32 | def step(self, action): 33 | xposbefore = self.sim.data.qpos[0] 34 | self.do_simulation(action, self.frame_skip) 35 | xposafter = self.sim.data.qpos[0] 36 | ob = self._get_obs() 37 | reward_ctrl = - 0.5 * 0.1 * np.square(action).sum() 38 | reward_run = self.goal_direction * (xposafter - xposbefore) / self.dt 39 | reward = reward_ctrl + reward_run 40 | done = False 41 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 42 | 43 | def _get_obs(self): 44 | return np.concatenate([ 45 | self.sim.data.qpos.flat[1:], 46 | self.sim.data.qvel.flat, 47 | ]) 48 | 49 | def reset_model(self): 50 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 51 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 52 | self.set_state(qpos, qvel) 53 | return 
self._get_obs() 54 | 55 | def viewer_setup(self): 56 | self.viewer.cam.distance = self.model.stat.extent * 0.5 57 | 58 | def log_diagnostics(self, paths, prefix=''): 59 | fwrd_vel = [path["env_infos"]['reward_run'] for path in paths] 60 | final_fwrd_vel = [path["env_infos"]['reward_run'][-1] for path in paths] 61 | ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths] 62 | 63 | logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel)) 64 | logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel)) 65 | logger.logkv(prefix + 'AvgCtrlCost', np.std(ctrl_cost)) 66 | 67 | def __str__(self): 68 | return 'HalfCheetahRandDirecEnv' -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/half_cheetah_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | from meta_policy_search.utils import logger 4 | import gym 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class HalfCheetahRandVelEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | return np.random.uniform(0.0, 3.0, (n_tasks, )) 15 | 16 | def set_task(self, task): 17 | """ 18 | Args: 19 | task: task of the meta-learning environment 20 | """ 21 | self.goal_velocity = task 22 | 23 | def get_task(self): 24 | """ 25 | Returns: 26 | task: task of the meta-learning environment 27 | """ 28 | return self.goal_velocity 29 | 30 | def step(self, action): 31 | xposbefore = self.sim.data.qpos[0] 32 | self.do_simulation(action, self.frame_skip) 33 | xposafter = self.sim.data.qpos[0] 34 | ob = self._get_obs() 35 | reward_ctrl = - 0.5 * 0.1 * np.square(action).sum() 36 | forward_vel = (xposafter - xposbefore) / self.dt 37 | reward_run = - np.abs(forward_vel - self.goal_velocity) 38 | reward = reward_ctrl + reward_run 39 | done = False 40 | return ob, reward, done, dict(forward_vel=forward_vel, reward_run=reward_run, reward_ctrl=reward_ctrl) 41 | 42 | def _get_obs(self): 43 | return np.concatenate([ 44 | self.sim.data.qpos.flat[1:], 45 | self.sim.data.qvel.flat, 46 | ]) 47 | 48 | def reset_model(self): 49 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 50 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 51 | self.set_state(qpos, qvel) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.distance = self.model.stat.extent * 0.5 56 | 57 | def log_diagnostics(self, paths, prefix=''): 58 | fwrd_vel = [path["env_infos"]['forward_vel'] for path in paths] 59 | final_fwrd_vel = [path["env_infos"]['forward_vel'][-1] for path in paths] 60 | ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths] 61 | 62 | logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel)) 63 | logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel)) 64 | logger.logkv(prefix + 'AvgCtrlCost', np.std(ctrl_cost)) 65 | -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/humanoid_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | def 
mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 10 | 11 | class HumanoidRandDirecEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 12 | def __init__(self): 13 | self.set_task(self.sample_tasks(1)[0]) 14 | MujocoEnv.__init__(self, 'humanoid.xml', 5) 15 | gym.utils.EzPickle.__init__(self) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | self.goal_direction = task 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.goal_direction 33 | 34 | def _get_obs(self): 35 | data = self.sim.data 36 | return np.concatenate([data.qpos.flat[2:], 37 | data.qvel.flat, 38 | data.cinert.flat, 39 | data.cvel.flat, 40 | data.qfrc_actuator.flat, 41 | data.cfrc_ext.flat]) 42 | 43 | def step(self, a): 44 | pos_before = mass_center(self.model, self.sim)[0] 45 | self.do_simulation(a, self.frame_skip) 46 | pos_after = mass_center(self.model, self.sim)[0] 47 | alive_bonus = 5.0 48 | data = self.sim.data 49 | lin_vel_cost = 0.25 * self.goal_direction * (pos_after - pos_before) / self.model.opt.timestep 50 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 51 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 52 | quad_impact_cost = min(quad_impact_cost, 10) 53 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 54 | qpos = self.sim.data.qpos 55 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 56 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 57 | 58 | def reset_model(self): 59 | c = 0.01 60 | self.set_state( 61 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 62 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 63 | ) 64 | return self._get_obs() 65 | 66 | def viewer_setup(self): 67 | self.viewer.cam.trackbodyid = 1 68 | self.viewer.cam.distance = self.model.stat.extent * 1.0 69 | self.viewer.cam.elevation = -20 70 | 71 | if __name__ == "__main__": 72 | env = HumanoidRandDirecEnv() 73 | while True: 74 | env.reset() 75 | for _ in range(200): 76 | env.render() 77 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/humanoid_rand_direc_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | def mass_center(model, sim): 7 | mass = np.expand_dims(model.body_mass, 1) 8 | xpos = sim.data.xipos 9 | return (np.sum(mass * xpos, 0) / np.sum(mass)) 10 | 11 | class HumanoidRandDirec2DEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 12 | def __init__(self): 13 | self.set_task(self.sample_tasks(1)[0]) 14 | MujocoEnv.__init__(self, 'humanoid.xml', 5) 15 | gym.utils.EzPickle.__init__(self) 16 | 17 | def sample_tasks(self, n_tasks): 18 | directions = np.random.normal(size=(n_tasks, 2)) 19 | directions /= np.linalg.norm(directions, axis=1)[..., np.newaxis] 20 | return directions 21 | 22 | def set_task(self, task): 23 | """ 24 | Args: 25 | task: task of the meta-learning environment 
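(here a 2D unit vector specifying the desired heading in the xy-plane)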
26 | """ 27 | self.goal_direction = task 28 | 29 | def get_task(self): 30 | """ 31 | Returns: 32 | task: task of the meta-learning environment 33 | """ 34 | return self.goal_direction 35 | 36 | def _get_obs(self): 37 | data = self.sim.data 38 | return np.concatenate([data.qpos.flat[2:], 39 | data.qvel.flat, 40 | data.cinert.flat, 41 | data.cvel.flat, 42 | data.qfrc_actuator.flat, 43 | data.cfrc_ext.flat]) 44 | 45 | def step(self, a): 46 | pos_before = np.copy(mass_center(self.model, self.sim)[:2]) 47 | self.do_simulation(a, self.frame_skip) 48 | pos_after = mass_center(self.model, self.sim)[:2] 49 | alive_bonus = 5.0 50 | data = self.sim.data 51 | lin_vel_cost = 0.25 * np.sum(self.goal_direction * (pos_after - pos_before)) / self.model.opt.timestep 52 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 53 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 54 | quad_impact_cost = min(quad_impact_cost, 10) 55 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 56 | qpos = self.sim.data.qpos 57 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 58 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 59 | 60 | def reset_model(self): 61 | c = 0.01 62 | self.set_state( 63 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 64 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 65 | ) 66 | return self._get_obs() 67 | 68 | def viewer_setup(self): 69 | self.viewer.cam.trackbodyid = 1 70 | self.viewer.cam.distance = self.model.stat.extent * 1.0 71 | self.viewer.cam.elevation = -20 72 | 73 | if __name__ == "__main__": 74 | env = HumanoidRandDirec2DEnv() 75 | while True: 76 | env.reset() 77 | for _ in range(200): 78 | env.render() 79 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action 80 | print(reward) -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/swimmer_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from meta_policy_search.utils import logger 4 | from meta_policy_search.envs.base import MetaEnv 5 | from gym.envs.mujoco.mujoco_env import MujocoEnv 6 | 7 | class SwimmerRandVelEnv(MetaEnv, MujocoEnv, gym.utils.EzPickle): 8 | def __init__(self): 9 | self.set_task(self.sample_tasks(1)[0]) 10 | MujocoEnv.__init__(self, 'swimmer.xml', 4) 11 | gym.utils.EzPickle.__init__(self) 12 | 13 | def sample_tasks(self, n_tasks): 14 | # for fwd/bwd env, goal direc is backwards if - 1.0, forwards if + 1.0 15 | return np.random.uniform(0.1, 0.2, (n_tasks, )) 16 | 17 | def set_task(self, task): 18 | """ 19 | Args: 20 | task: task of the meta-learning environment 21 | """ 22 | self.goal_vel = task 23 | 24 | def get_task(self): 25 | """ 26 | Returns: 27 | task: task of the meta-learning environment 28 | """ 29 | return self.goal_vel 30 | 31 | def step(self, a): 32 | ctrl_cost_coeff = 0.0001 33 | xposbefore = self.sim.data.qpos[0] 34 | self.do_simulation(a, self.frame_skip) 35 | xposafter = self.sim.data.qpos[0] 36 | reward_fwd = np.abs((xposafter - xposbefore) / self.dt - self.goal_vel) 37 | reward_ctrl = - ctrl_cost_coeff * np.square(a).sum() 38 | reward = reward_fwd + reward_ctrl 39 | ob = self._get_obs() 40 | return ob, reward, False, dict(reward_fwd=reward_fwd, reward_ctrl=reward_ctrl) 41 | 42 | def _get_obs(self): 43 | qpos = 
self.sim.data.qpos 44 | qvel = self.sim.data.qvel 45 | return np.concatenate([qpos.flat[2:], qvel.flat]) 46 | 47 | def reset_model(self): 48 | self.set_state( 49 | self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq), 50 | self.init_qvel + self.np_random.uniform(low=-.1, high=.1, size=self.model.nv) 51 | ) 52 | return self._get_obs() 53 | 54 | def log_diagnostics(self, paths, prefix=''): 55 | progs = [ 56 | path["observations"][-1][-3] - path["observations"][0][-3] 57 | for path in paths 58 | ] 59 | logger.record_tabular(prefix + 'AverageForwardProgress', np.mean(progs)) 60 | logger.record_tabular(prefix + 'MaxForwardProgress', np.max(progs)) 61 | logger.record_tabular(prefix + 'MinForwardProgress', np.min(progs)) 62 | logger.record_tabular(prefix + 'StdForwardProgress', np.std(progs)) 63 | 64 | -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/walker2d_rand_direc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | class Walker2DRandDirecEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 7 | def __init__(self): 8 | self.set_task(self.sample_tasks(1)[0]) 9 | MujocoEnv.__init__(self, 'walker2d.xml', 8) 10 | gym.utils.EzPickle.__init__(self) 11 | 12 | def sample_tasks(self, n_tasks): 13 | return np.random.choice((-1.0, 1.0), (n_tasks, )) 14 | 15 | def set_task(self, task): 16 | """ 17 | Args: 18 | task: task of the meta-learning environment 19 | """ 20 | self.goal_direction = task 21 | 22 | def get_task(self): 23 | """ 24 | Returns: 25 | task: task of the meta-learning environment 26 | """ 27 | return self.goal_direction 28 | 29 | def step(self, a): 30 | posbefore = self.sim.data.qpos[0] 31 | self.do_simulation(a, self.frame_skip) 32 | posafter, height, ang = self.sim.data.qpos[0:3] 33 | alive_bonus = 1.0 34 | reward = (self.goal_direction * (posafter - posbefore) / self.dt) 35 | reward += alive_bonus 36 | reward -= 1e-3 * np.square(a).sum() 37 | done = not (height > 0.8 and height < 2.0 and 38 | ang > -1.0 and ang < 1.0) 39 | ob = self._get_obs() 40 | return ob, reward, done, {} 41 | 42 | def _get_obs(self): 43 | qpos = self.sim.data.qpos 44 | qvel = self.sim.data.qvel 45 | return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel() 46 | 47 | def reset_model(self): 48 | self.set_state( 49 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 50 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 51 | ) 52 | return self._get_obs() 53 | 54 | def viewer_setup(self): 55 | self.viewer.cam.trackbodyid = 2 56 | self.viewer.cam.distance = self.model.stat.extent * 0.5 57 | 58 | if __name__ == "__main__": 59 | env = Walker2DRandDirecEnv() 60 | while True: 61 | env.reset() 62 | for _ in range(200): 63 | env.render() 64 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/mujoco_envs/walker2d_rand_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.envs.base import MetaEnv 3 | import gym 4 | from gym.envs.mujoco.mujoco_env import MujocoEnv 5 | 6 | class Walker2DRandVelEnv(MetaEnv, gym.utils.EzPickle, MujocoEnv): 7 | def __init__(self): 8 | 
self.set_task(self.sample_tasks(1)[0]) 9 | MujocoEnv.__init__(self, 'walker2d.xml', 8) 10 | gym.utils.EzPickle.__init__(self) 11 | 12 | def sample_tasks(self, n_tasks): 13 | return np.random.uniform(0.0, 10.0, (n_tasks, )) 14 | 15 | def set_task(self, task): 16 | """ 17 | Args: 18 | task: task of the meta-learning environment 19 | """ 20 | self.goal_velocity = task 21 | 22 | def get_task(self): 23 | """ 24 | Returns: 25 | task: task of the meta-learning environment 26 | """ 27 | return self.goal_velocity 28 | 29 | def step(self, a): 30 | posbefore = self.sim.data.qpos[0] 31 | self.do_simulation(a, self.frame_skip) 32 | posafter, height, ang = self.sim.data.qpos[0:3] 33 | alive_bonus = 15.0 34 | forward_vel = (posafter - posbefore) / self.dt 35 | reward = - np.abs(forward_vel - self.goal_velocity) 36 | reward += alive_bonus 37 | reward -= 1e-3 * np.square(a).sum() 38 | done = not (height > 0.8 and height < 2.0 and 39 | ang > -1.0 and ang < 1.0) 40 | ob = self._get_obs() 41 | return ob, reward, done, {} 42 | 43 | def _get_obs(self): 44 | qpos = self.sim.data.qpos 45 | qvel = self.sim.data.qvel 46 | return np.concatenate([qpos[1:], np.clip(qvel, -10, 10)]).ravel() 47 | 48 | def reset_model(self): 49 | self.set_state( 50 | self.init_qpos + self.np_random.uniform(low=-.005, high=.005, size=self.model.nq), 51 | self.init_qvel + self.np_random.uniform(low=-.005, high=.005, size=self.model.nv) 52 | ) 53 | return self._get_obs() 54 | 55 | def viewer_setup(self): 56 | self.viewer.cam.trackbodyid = 2 57 | self.viewer.cam.distance = self.model.stat.extent * 0.5 58 | 59 | if __name__ == "__main__": 60 | env = Walker2DRandVelEnv() 61 | while True: 62 | env.reset() 63 | for _ in range(200): 64 | env.render() 65 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/normalized_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from meta_policy_search.utils.serializable import Serializable 3 | from gym.spaces import Box 4 | from rand_param_envs.gym.spaces import Box as OldBox 5 | 6 | class NormalizedEnv(Serializable): 7 | """ 8 | Normalizes the environment class. 9 | 10 | Args: 11 | Env (gym.Env): class of the unnormalized gym environment 12 | scale_reward (float): scale of the reward 13 | normalize_obs (bool): whether normalize the observations or not 14 | normalize_reward (bool): whether normalize the reward or not 15 | obs_alpha (float): step size of the running mean and variance for the observations 16 | reward_alpha (float): step size of the running mean and variance for the observations 17 | normalization_scale (float): rescaled action magnitude 18 | 19 | """ 20 | def __init__(self, 21 | env, 22 | scale_reward=1., 23 | normalize_obs=False, 24 | normalize_reward=False, 25 | obs_alpha=0.001, 26 | reward_alpha=0.001, 27 | normalization_scale=10., 28 | ): 29 | Serializable.quick_init(self, locals()) 30 | 31 | self._scale_reward = 1 32 | self._wrapped_env = env 33 | 34 | self._normalize_obs = normalize_obs 35 | self._normalize_reward = normalize_reward 36 | self._obs_alpha = obs_alpha 37 | self._obs_mean = np.zeros(self.observation_space.shape) 38 | self._obs_var = np.ones(self.observation_space.shape) 39 | self._reward_alpha = reward_alpha 40 | self._reward_mean = 0. 41 | self._reward_var = 1. 
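# the observation and reward statistics above are running estimates, updated via exponential moving averages in _update_obs_estimate / _update_reward_estimate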
42 | self._normalization_scale = normalization_scale 43 | 44 | 45 | @property 46 | def action_space(self): 47 | if isinstance(self._wrapped_env.action_space, Box): 48 | ub = np.ones(self._wrapped_env.action_space.shape) * self._normalization_scale 49 | return Box(-1 * ub, ub, dtype=np.float32) 50 | return self._wrapped_env.action_space 51 | 52 | def __getattr__(self, attr): 53 | """ 54 | If normalized env does not have the attribute then call the attribute in the wrapped_env 55 | Args: 56 | attr: attribute to get 57 | 58 | Returns: 59 | attribute of the wrapped_env 60 | 61 | """ 62 | orig_attr = self._wrapped_env.__getattribute__(attr) 63 | 64 | if callable(orig_attr): 65 | def hooked(*args, **kwargs): 66 | result = orig_attr(*args, **kwargs) 67 | return result 68 | 69 | return hooked 70 | else: 71 | return orig_attr 72 | 73 | def _update_obs_estimate(self, obs): 74 | o_a = self._obs_alpha 75 | self._obs_mean = (1 - o_a) * self._obs_mean + o_a * obs 76 | self._obs_var = (1 - o_a) * self._obs_var + o_a * np.square(obs - self._obs_mean) 77 | 78 | def _update_reward_estimate(self, reward): 79 | r_a = self._reward_alpha 80 | self._reward_mean = (1 - r_a) * self._reward_mean + r_a * reward 81 | self._reward_var = (1 - r_a) * self._reward_var + r_a * np.square(reward - self._reward_mean) 82 | 83 | def _apply_normalize_obs(self, obs): 84 | self._update_obs_estimate(obs) 85 | return (obs - self._obs_mean) / (np.sqrt(self._obs_var) + 1e-8) 86 | 87 | def _apply_normalize_reward(self, reward): 88 | self._update_reward_estimate(reward) 89 | return reward / (np.sqrt(self._reward_var) + 1e-8) 90 | 91 | def reset(self): 92 | obs = self._wrapped_env.reset() 93 | if self._normalize_obs: 94 | return self._apply_normalize_obs(obs) 95 | else: 96 | return obs 97 | 98 | def __getstate__(self): 99 | d = Serializable.__getstate__(self) 100 | d["_obs_mean"] = self._obs_mean 101 | d["_obs_var"] = self._obs_var 102 | return d 103 | 104 | def __setstate__(self, d): 105 | Serializable.__setstate__(self, d) 106 | self._obs_mean = d["_obs_mean"] 107 | self._obs_var = d["_obs_var"] 108 | 109 | def step(self, action): 110 | if isinstance(self._wrapped_env.action_space, Box) or isinstance(self._wrapped_env.action_space, OldBox): 111 | # rescale the action 112 | lb, ub = self._wrapped_env.action_space.low, self._wrapped_env.action_space.high 113 | scaled_action = lb + (action + self._normalization_scale) * (ub - lb) / (2 * self._normalization_scale) 114 | scaled_action = np.clip(scaled_action, lb, ub) 115 | else: 116 | scaled_action = action 117 | wrapped_step = self._wrapped_env.step(scaled_action) 118 | next_obs, reward, done, info = wrapped_step 119 | if getattr(self, "_normalize_obs", False): 120 | next_obs = self._apply_normalize_obs(next_obs) 121 | if getattr(self, "_normalize_reward", False): 122 | reward = self._apply_normalize_reward(reward) 123 | return next_obs, reward * self._scale_reward, done, info 124 | 125 | 126 | normalize = NormalizedEnv -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/corner_goals_point_env_2d.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | 9 | def step(self, action): 10 | """ 11 | Run one timestep of the environment's dynamics. When end of episode 12 | is reached, reset() should be called to reset the environment's internal state. 
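Actions are clipped to [-0.1, 0.1] per dimension before being added to the 2D state.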
13 | 14 | Args: 15 | action : an action provided by the environment 16 | Returns: 17 | (observation, reward, done, info) 18 | observation : agent's observation of the current environment 19 | reward [Float] : amount of reward due to the previous action 20 | done : a boolean, indicating whether the episode has ended 21 | info : a dictionary containing other diagnostic information from the previous action 22 | """ 23 | prev_state = self._state 24 | self._state = prev_state + np.clip(action, -0.1, 0.1) 25 | reward = self.reward(prev_state, action, self._state) 26 | done = self.done(self._state) 27 | next_observation = np.copy(self._state) 28 | return next_observation, reward, done, {} 29 | 30 | def reset(self): 31 | """ 32 | Resets the state of the environment, returning an initial observation. 33 | Outputs 34 | ------- 35 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 36 | """ 37 | self._state = np.random.uniform(-2, 2, size=(2,)) 38 | observation = np.copy(self._state) 39 | return observation 40 | 41 | @property 42 | def observation_space(self): 43 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 44 | 45 | @property 46 | def action_space(self): 47 | return Box(low=-0.1, high=0.1, shape=(2,)) 48 | 49 | def done(self, obs): 50 | if obs.ndim == 1: 51 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 52 | elif obs.ndim == 2: 53 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 54 | 55 | def reward(self, obs, act, obs_next): 56 | if obs_next.ndim == 1: 57 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 58 | elif obs_next.ndim == 2: 59 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 60 | 61 | def log_diagnostics(self, paths): 62 | pass 63 | 64 | def sample_tasks(self, n_tasks): 65 | return [{}] * n_tasks 66 | 67 | def set_task(self, task): 68 | pass 69 | 70 | def get_task(self): 71 | return {} -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | 9 | def step(self, action): 10 | """ 11 | Run one timestep of the environment's dynamics. When end of episode 12 | is reached, reset() should be called to reset the environment's internal state. 13 | 14 | Args: 15 | action : an action provided by the environment 16 | Returns: 17 | (observation, reward, done, info) 18 | observation : agent's observation of the current environment 19 | reward [Float] : amount of reward due to the previous action 20 | done : a boolean, indicating whether the episode has ended 21 | info : a dictionary containing other diagnostic information from the previous action 22 | """ 23 | prev_state = self._state 24 | self._state = prev_state + np.clip(action, -0.1, 0.1) 25 | reward = self.reward(prev_state, action, self._state) 26 | done = self.done(self._state) 27 | next_observation = np.copy(self._state) 28 | return next_observation, reward, done, {} 29 | 30 | def reset(self): 31 | """ 32 | Resets the state of the environment, returning an initial observation. 33 | Outputs 34 | ------- 35 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 
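The initial state is drawn uniformly at random from [-2, 2]^2.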
36 | """ 37 | self._state = np.random.uniform(-2, 2, size=(2,)) 38 | observation = np.copy(self._state) 39 | return observation 40 | 41 | @property 42 | def observation_space(self): 43 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 44 | 45 | @property 46 | def action_space(self): 47 | return Box(low=-0.1, high=0.1, shape=(2,)) 48 | 49 | def done(self, obs): 50 | if obs.ndim == 1: 51 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 52 | elif obs.ndim == 2: 53 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 54 | 55 | def reward(self, obs, act, obs_next): 56 | if obs_next.ndim == 1: 57 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 58 | elif obs_next.ndim == 2: 59 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 60 | 61 | def log_diagnostics(self, paths): 62 | pass 63 | 64 | def sample_tasks(self, n_tasks): 65 | return [{}] * n_tasks 66 | 67 | def set_task(self, task): 68 | pass 69 | 70 | def get_task(self): 71 | return {} -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_corner.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvCorner(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='sparse', sparse_reward_radius=0.5): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(2,)) 20 | self.action_space = Box(low=-0.2, high=0.2, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state. 26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._state = prev_state + np.clip(action, -0.2, 0.2) 38 | reward = self.reward(prev_state, action, self._state) 39 | done = False # self.done(self._state) 40 | next_observation = np.copy(self._state) 41 | return next_observation, reward, done, {} 42 | 43 | def reset(self): 44 | """ 45 | Resets the state of the environment, returning an initial observation. 46 | Outputs 47 | ------- 48 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 
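The initial state is drawn uniformly at random from [-0.2, 0.2]^2, i.e. close to the origin and roughly equidistant from the four corner goals.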
49 | """ 50 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 51 | observation = np.copy(self._state) 52 | return observation 53 | 54 | def done(self, obs): 55 | if obs.ndim == 1: 56 | return self.done(np.array([obs])) 57 | elif obs.ndim == 2: 58 | goal_distance = np.linalg.norm(obs - self.goal[None,:], axis=1) 59 | return np.max(self._state) > 3 60 | 61 | def reward(self, obs, act, obs_next): 62 | if obs_next.ndim == 2: 63 | goal_distance = np.linalg.norm(obs_next - self.goal[None,:], axis=1)[0] 64 | if self.reward_type == 'dense': 65 | return - goal_distance 66 | elif self.reward_type == 'dense_squared': 67 | return - goal_distance**2 68 | elif self.reward_type == 'sparse': 69 | dist_from_start = np.linalg.norm(obs_next, ord=1, axis=1)[0] 70 | if dist_from_start < self.sparse_reward_radius: 71 | return 0 72 | dists = [np.linalg.norm(obs_next - corner[None, :], axis=1) for corner in self.corners] 73 | if np.min(goal_distance) == min(dists): 74 | return np.linalg.norm(obs - self.goal[None,:], axis=1)[0] - goal_distance 75 | return 0 76 | # return np.maximum(self.sparse_reward_radius - goal_distance, 0) 77 | 78 | elif obs_next.ndim == 1: 79 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 80 | else: 81 | raise NotImplementedError 82 | 83 | def log_diagnostics(self, *args): 84 | pass 85 | 86 | def sample_tasks(self, n_tasks): 87 | return [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 88 | 89 | def set_task(self, task): 90 | self.goal = task 91 | 92 | def get_task(self): 93 | return self.goal 94 | 95 | if __name__ == "__main__": 96 | env = MetaPointEnvCorner() 97 | task = env.sample_tasks(10) 98 | print(task[0]) 99 | while True: 100 | env.set_task(task[0]) 101 | env.reset() 102 | done = False 103 | i = 0 104 | t_r = 0 105 | while not done: 106 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 107 | t_r += reward 108 | i += 1 109 | if reward > 0: 110 | print(obs) 111 | break 112 | if i > 200: 113 | print(obs) 114 | break 115 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_momentum.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvMomentum(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='sparse', sparse_reward_radius=2): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(4,)) 20 | self.action_space = Box(low=-0.1, high=0.1, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state. 
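Here the action acts as an acceleration: it is clipped to [-0.1, 0.1], added to an internal velocity, and the velocity is then integrated into the position.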
26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._velocity += np.clip(action, -0.1, 0.1) 38 | self._state = prev_state + self._velocity 39 | reward = self.reward(prev_state, action, self._state) 40 | done = False # self.done(self._state) 41 | next_observation = np.hstack((self._state, self._velocity)) 42 | return next_observation, reward, done, {} 43 | 44 | def reset(self): 45 | """ 46 | Resets the state of the environment, returning an initial observation. 47 | Outputs 48 | ------- 49 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 50 | """ 51 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 52 | self._velocity = np.random.uniform(-0.1, 0.1, size=(2,)) 53 | observation = np.hstack((self._state, self._velocity)) 54 | return observation 55 | 56 | def done(self, obs): 57 | if obs.ndim == 1: 58 | return self.done(np.array([obs])) 59 | elif obs.ndim == 2: 60 | goal_distance = np.linalg.norm(obs[:2] - self.goal[None,:], axis=1) 61 | return np.max(self._state) > 3 62 | 63 | def reward(self, obs, act, obs_next): 64 | if obs_next.ndim == 2: 65 | goal_distance = np.linalg.norm(obs_next[:2] - self.goal[None,:], axis=1)[0] 66 | if self.reward_type == 'dense': 67 | return - goal_distance 68 | elif self.reward_type == 'dense_squared': 69 | return - goal_distance**2 70 | elif self.reward_type == 'sparse': 71 | return np.maximum(self.sparse_reward_radius - goal_distance, 0) 72 | 73 | elif obs_next.ndim == 1: 74 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 75 | else: 76 | raise NotImplementedError 77 | 78 | def log_diagnostics(self, *args): 79 | pass 80 | 81 | def sample_tasks(self, n_tasks): 82 | return [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 83 | 84 | def set_task(self, task): 85 | self.goal = task 86 | 87 | def get_task(self): 88 | return self.goal 89 | 90 | if __name__ == "__main__": 91 | env = MetaPointEnvMomentum() 92 | while True: 93 | task = env.sample_tasks(10) 94 | env.set_task(task[0]) 95 | env.reset() 96 | done = False 97 | i = 0 98 | t_r = 0 99 | while not done: 100 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 101 | t_r += reward 102 | i += 1 103 | if reward > 0: 104 | break 105 | if np.max(obs) > 300: 106 | break 107 | if i > 200: 108 | break 109 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_v2.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnv(MetaEnv): 8 | def __init__(self): 9 | self.goal = np.random.uniform(-2, 2, size=(2,)) 10 | 11 | def step(self, action): 12 | """ 13 | Run one timestep of the environment's dynamics. When end of episode 14 | is reached, reset() should be called to reset the environment's internal state. 
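The reward is the negative Euclidean distance between the next state and the sampled goal.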
15 | 16 | Args: 17 | action : an action provided by the environment 18 | Returns: 19 | (observation, reward, done, info) 20 | observation : agent's observation of the current environment 21 | reward [Float] : amount of reward due to the previous action 22 | done : a boolean, indicating whether the episode has ended 23 | info : a dictionary containing other diagnostic information from the previous action 24 | """ 25 | prev_state = self._state 26 | self._state = prev_state + np.clip(action, -0.1, 0.1) 27 | reward = self.reward(prev_state, action, self._state) 28 | done = self.done(self._state) 29 | next_observation = np.copy(self._state) 30 | return next_observation, reward, done, {} 31 | 32 | 33 | def reset(self): 34 | """ 35 | Resets the state of the environment, returning an initial observation. 36 | Outputs 37 | ------- 38 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 39 | """ 40 | self._state = np.zeros(2) 41 | observation = np.copy(self._state) 42 | return observation 43 | 44 | @property 45 | def observation_space(self): 46 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 47 | 48 | @property 49 | def action_space(self): 50 | return Box(low=-0.1, high=0.1, shape=(2,)) 51 | 52 | def done(self, obs): 53 | if obs.ndim == 1: 54 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 55 | elif obs.ndim == 2: 56 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 57 | 58 | def reward(self, obs, act, obs_next): 59 | return - np.sqrt(np.sum((self.goal - obs_next) ** 2)) 60 | # if obs_next.ndim == 1: 61 | # return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 62 | # elif obs_next.ndim == 2: 63 | # return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 64 | 65 | def log_diagnostics(self, paths): 66 | pass 67 | 68 | def sample_tasks(self, n_tasks): 69 | return np.random.uniform(-2, 2, size=(n_tasks, 2)) 70 | 71 | def set_task(self, task): 72 | self.goal = task 73 | 74 | def get_task(self): 75 | return self.goal -------------------------------------------------------------------------------- /meta_policy_search/envs/point_envs/point_env_2d_walls.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.envs.base import MetaEnv 2 | 3 | import numpy as np 4 | from gym.spaces import Box 5 | 6 | 7 | class MetaPointEnvWalls(MetaEnv): 8 | """ 9 | Simple 2D point meta environment. Each meta-task corresponds to a different goal / corner 10 | (one of the 4 points (-2,-2), (-2, 2), (2, -2), (2,2)) which are sampled with equal probability 11 | """ 12 | 13 | def __init__(self, reward_type='dense', sparse_reward_radius=2): 14 | assert reward_type in ['dense', 'dense_squared', 'sparse'] 15 | self.reward_type = reward_type 16 | print("Point Env reward type is", reward_type) 17 | self.sparse_reward_radius = sparse_reward_radius 18 | self.corners = [np.array([-2,-2]), np.array([2,-2]), np.array([-2,2]), np.array([2, 2])] 19 | self.observation_space = Box(low=-np.inf, high=np.inf, shape=(2,)) 20 | self.action_space = Box(low=-0.2, high=0.2, shape=(2,)) 21 | 22 | def step(self, action): 23 | """ 24 | Run one timestep of the environment's dynamics. When end of episode 25 | is reached, reset() should be called to reset the environment's internal state.
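After the position update, the agent is pushed back onto the circular walls of radius 1 and 2 unless it crosses them within distance 1 of the task-specific gap points gap_1 / gap_2.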
26 | 27 | Args: 28 | action : an action provided by the environment 29 | Returns: 30 | (observation, reward, done, info) 31 | observation : agent's observation of the current environment 32 | reward [Float] : amount of reward due to the previous action 33 | done : a boolean, indicating whether the episode has ended 34 | info : a dictionary containing other diagnostic information from the previous action 35 | """ 36 | prev_state = self._state 37 | self._state = prev_state + np.clip(action, -0.2, 0.2) 38 | reward = self.reward(prev_state, action, self._state) 39 | done = False # self.done(self._state) 40 | if np.linalg.norm(prev_state) < 1 and np.linalg.norm(self._state) > 1: 41 | gap_1_dist = np.linalg.norm(self._state - self.gap_1[None,:], axis=1)[0] 42 | if gap_1_dist > 1: 43 | self._state = self._state / (np.linalg.norm(self._state) + 1e-6) 44 | assert gap_1_dist < 1 or np.linalg.norm(self._state) < 1 45 | elif np.linalg.norm(prev_state) < 2 and np.linalg.norm(self._state) > 2: 46 | gap_2_dist = np.linalg.norm(self._state - self.gap_2[None,:], axis=1)[0] 47 | if gap_2_dist > 1: 48 | self._state = self._state / (np.linalg.norm(self._state) * 0.5 + 1e-6) 49 | assert gap_2_dist < 1 or np.linalg.norm(self._state) < 2 50 | next_observation = np.copy(self._state) 51 | return next_observation, reward, done, {} 52 | 53 | def reset(self): 54 | """ 55 | Resets the state of the environment, returning an initial observation. 56 | Outputs 57 | ------- 58 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 59 | """ 60 | self._state = np.random.uniform(-0.2, 0.2, size=(2,)) 61 | observation = np.copy(self._state) 62 | return observation 63 | 64 | def done(self, obs): 65 | if obs.ndim == 1: 66 | return self.done(np.array([obs])) 67 | elif obs.ndim == 2: 68 | goal_distance = np.linalg.norm(obs - self.goal[None,:], axis=1) 69 | return np.max(self._state) > 3 70 | 71 | def reward(self, obs, act, obs_next): 72 | if obs_next.ndim == 2: 73 | goal_distance = np.linalg.norm(obs_next - self.goal[None,:], axis=1)[0] 74 | if self.reward_type == 'dense': 75 | return - goal_distance 76 | elif self.reward_type == 'dense_squared': 77 | return - goal_distance**2 78 | elif self.reward_type == 'sparse': 79 | if goal_distance < self.sparse_reward_radius: 80 | return np.linalg.norm(obs - self.goal[None,:], axis=1)[0] - goal_distance 81 | else: 82 | return 83 | # return np.maximum(self.sparse_reward_radius - goal_distance, 0) 84 | 85 | elif obs_next.ndim == 1: 86 | return self.reward(np.array([obs]), np.array([act]), np.array([obs_next])) 87 | else: 88 | raise NotImplementedError 89 | 90 | def log_diagnostics(self, *args): 91 | pass 92 | 93 | def sample_tasks(self, n_tasks): 94 | goals = [self.corners[idx] for idx in np.random.choice(range(len(self.corners)), size=n_tasks)] 95 | gaps_1 = np.random.normal(size=(n_tasks, 2)) 96 | gaps_1 /= np.linalg.norm(gaps_1, axis=1)[..., np.newaxis] 97 | gaps_2 = np.random.normal(size=(n_tasks, 2)) 98 | gaps_2 /= (np.linalg.norm(gaps_2, axis=1) / 2)[..., np.newaxis] 99 | return [dict(goal=goal, gap_1=gap_1, gap_2=gap_2) for goal, gap_1, gap_2 in zip(goals, gaps_1, gaps_2)] 100 | 101 | def set_task(self, task): 102 | self.goal = task['goal'] 103 | self.gap_1 = task['gap_1'] 104 | self.gap_2 = task['gap_2'] 105 | 106 | def get_task(self): 107 | return dict(goal=self.goal, gap_1=self.gap_1, gap_2=self.gap_2) 108 | 109 | if __name__ == "__main__": 110 | env = MetaPointEnvWalls() 111 | while True: 112 | task = env.sample_tasks(10) 113 | 
env.set_task(task[0]) 114 | env.reset() 115 | done = False 116 | i = 0 117 | t_r = 0 118 | while not done: 119 | obs, reward, done, _ = env.step(env.action_space.sample()) # take a random action 120 | t_r += reward 121 | i += 1 122 | if reward > 0: 123 | break 124 | if np.max(obs) > 300: 125 | break 126 | if i > 200: 127 | break 128 | print(i, t_r) -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_door.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 46 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 47 | 48 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 49 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 50 | 51 | if __name__ == "__main__": 52 | env = SawyerPushEnv() 53 | while True: 54 | task = env.sample_tasks(1)[0] 55 | env.set_task(task) 56 | env.reset() 57 | for _ in range(500): 58 | env.render() 59 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_pick_and_place.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPickAndPlaceEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPickAndPlaceEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 
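(forwarded to the wrapped multiworld environment via set_goal)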
24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_rew = [path["env_infos"]['reachRew'] for path in paths] 46 | pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths] 47 | place_rew = [path["env_infos"]['placeRew'] for path in paths] 48 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 49 | placing_dist = [path["env_infos"]['placingDist'] for path in paths] 50 | 51 | logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew)) 52 | logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew)) 53 | logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew)) 54 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 55 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 56 | 57 | if __name__ == "__main__": 58 | env = SawyerPickAndPlaceEnv() 59 | while True: 60 | task = env.sample_tasks(1)[0] 61 | env.set_task(task) 62 | env.reset() 63 | for _ in range(500): 64 | SawyerEnv.render(env) 65 | _, reward, _, _ = env.step(env.action_space.sample()) # take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_push.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv(*args, **kwargs) 15 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 16 | 17 | def sample_tasks(self, n_tasks): 18 | return self.sample_goals(n_tasks) 19 | 20 | def set_task(self, task): 21 | """ 22 | Args: 23 | task: task of the meta-learning environment 24 | """ 25 | return self.set_goal(task) 26 | 27 | def get_task(self): 28 | """ 29 | Returns: 30 | task: task of the meta-learning environment 31 | """ 32 | return self.get_goal() 33 | 34 | def log_diagnostics(self, paths, prefix=''): 35 | self.get_diagnostics(paths) 36 | 37 | @property 38 | def action_space(self): 39 | return FlatGoalEnv.action_space(self) 40 | 41 | def render(self): 42 | SawyerEnv.render(self) 43 | 44 | def log_diagnostics(self, paths, prefix=''): 45 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 46 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 47 | 48 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 49 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 50 | 51 | if __name__ == "__main__": 52 | env = SawyerPushEnv() 53 | while True: 54 | task = env.sample_tasks(1)[0] 55 | env.set_task(task) 56 | env.reset() 57 | for _ in range(500): 58 | env.render() 59 | _, reward, _, _ = env.step(env.action_space.sample()) # 
take a random action -------------------------------------------------------------------------------- /meta_policy_search/envs/sawyer_envs/sawyer_push_simple.py: -------------------------------------------------------------------------------- 1 | from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push_simple import SawyerPushSimpleEnv as SawyerEnv 2 | from multiworld.core.flat_goal_env import FlatGoalEnv 3 | import numpy as np 4 | from meta_policy_search.envs.base import MetaEnv 5 | from meta_policy_search.utils import logger 6 | 7 | 8 | class SawyerPushSimpleEnv(FlatGoalEnv, MetaEnv): 9 | """ 10 | Wrapper for SawyerPushSimpleEnv from multiworld envs, using our method headers 11 | """ 12 | def __init__(self, *args, **kwargs): 13 | self.quick_init(locals()) 14 | sawyer_env = SawyerEnv( 15 | obj_low=(-0.0, 0.5, 0.02), 16 | obj_high=(0.0, 0.5, 0.02), 17 | goal_low=(-0.2, 0.6, 0.02), 18 | goal_high=(0.2, 0.8, 0.02), 19 | rew_mode='posPlace', 20 | *args, **kwargs) 21 | FlatGoalEnv.__init__(self, sawyer_env, obs_keys=['state_observation'], goal_keys=['state_desired_goal']) 22 | 23 | def sample_tasks(self, n_tasks): 24 | return self.sample_goals(n_tasks) 25 | 26 | def set_task(self, task): 27 | """ 28 | Args: 29 | task: task of the meta-learning environment 30 | """ 31 | return self.set_goal(task) 32 | 33 | def get_task(self): 34 | """ 35 | Returns: 36 | task: task of the meta-learning environment 37 | """ 38 | return self.get_goal() 39 | 40 | def log_diagnostics(self, paths, prefix=''): 41 | self.get_diagnostics(paths) 42 | 43 | @property 44 | def action_space(self): 45 | return FlatGoalEnv.action_space(self) 46 | 47 | def render(self): 48 | SawyerEnv.render(self) 49 | 50 | def log_diagnostics(self, paths, prefix=''): 51 | reach_dist = [path["env_infos"]['reachDist'] for path in paths] 52 | placing_dist = [path["env_infos"]['placeDist'] for path in paths] 53 | cos_dist = [path["env_infos"]['cosDist'] for path in paths] 54 | 55 | logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist)) 56 | logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist)) 57 | logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist)) -------------------------------------------------------------------------------- /meta_policy_search/meta_algos/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.meta_algos.base import MAMLAlgo 2 | from meta_policy_search.meta_algos.dice_maml import DICEMAML 3 | from meta_policy_search.meta_algos.pro_mp import ProMP 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_algos.vpg_maml import VPGMAML 6 | from meta_policy_search.meta_algos.vpg_dice_maml import VPG_DICEMAML -------------------------------------------------------------------------------- /meta_policy_search/meta_algos/vpg_dice_maml.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.meta_algos.dice_maml import DICEMAML 2 | 3 | import tensorflow as tf 4 | from collections import OrderedDict 5 | 6 | 7 | class VPG_DICEMAML(DICEMAML): 8 | """ 9 | Algorithm for DICE VPG MAML 10 | 11 | Args: 12 | max_path_length (int): maximum path length 13 | policy (Policy) : policy object 14 | name (str): tf variable scope 15 | learning_rate (float): learning rate for the meta-objective 16 | inner_lr (float) : gradient step size used for inner step 17 | meta_batch_size (int): number of meta-learning tasks 18 | num_inner_grad_steps (int) : number of 
gradient updates taken per maml iteration 19 | trainable_inner_step_size (boolean): whether make the inner step size a trainable variable 20 | """ 21 | def __init__( 22 | self, 23 | max_path_length, 24 | *args, 25 | name="vpg_dice_maml", 26 | **kwargs 27 | ): 28 | super(VPG_DICEMAML, self).__init__(max_path_length, *args, **kwargs) 29 | 30 | self._optimization_keys = ['observations', 'actions', 'advantages', 'adjusted_rewards', 'mask', 'agent_infos'] 31 | self.name = name 32 | 33 | self.build_graph() 34 | 35 | def build_graph(self): 36 | """ 37 | Creates the computation graph for DICE MAML 38 | """ 39 | 40 | """ Build graph for sampling """ 41 | with tf.variable_scope(self.name + '_sampling'): 42 | self.step_sizes = self._create_step_size_vars() 43 | 44 | """ --- Build inner update graph for adapting the policy and sampling trajectories --- """ 45 | # this graph is only used for adapting the policy and not computing the meta-updates 46 | self.adapted_policies_params, self.adapt_input_ph_dict = self._build_inner_adaption() 47 | 48 | 49 | """ Build graph for meta-update """ 50 | meta_update_scope = tf.variable_scope(self.name + '_meta_update') 51 | 52 | with meta_update_scope: 53 | obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders('step0') 54 | self.meta_op_phs_dict = OrderedDict(all_phs_dict) 55 | 56 | distribution_info_vars, current_policy_params, all_surr_objs = [], [], [] 57 | 58 | for i in range(self.meta_batch_size): 59 | obs_stacked = self._reshape_obs_phs(obs_phs[i]) 60 | dist_info_sym = self.policy.distribution_info_sym(obs_stacked, params=None) 61 | distribution_info_vars.append(dist_info_sym) # step 0 62 | current_policy_params.append(self.policy.policy_params) # set to real policy_params (tf.Variable) 63 | 64 | with meta_update_scope: 65 | """ Inner updates""" 66 | for step_id in range(1, self.num_inner_grad_steps+1): 67 | with tf.variable_scope("inner_update_%i"%step_id): 68 | surr_objs, adapted_policy_params = [], [] 69 | 70 | # inner adaptation step for each task 71 | for i in range(self.meta_batch_size): 72 | action_stacked = self._reshape_action_phs(action_phs[i]) 73 | surr_loss = self._adapt_objective_sym(action_stacked, adj_reward_phs[i], mask_phs[i], distribution_info_vars[i]) 74 | 75 | adapted_params_var = self._adapt_sym(surr_loss, current_policy_params[i]) 76 | 77 | adapted_policy_params.append(adapted_params_var) 78 | surr_objs.append(surr_loss) 79 | 80 | all_surr_objs.append(surr_objs) 81 | # Create new placeholders for the next step 82 | obs_phs, action_phs, adj_reward_phs, mask_phs, dist_info_old_phs, all_phs_dict = self._make_dice_input_placeholders('step%i' % step_id) 83 | self.meta_op_phs_dict.update(all_phs_dict) 84 | 85 | # dist_info_vars_for_next_step 86 | distribution_info_vars = [] 87 | for i in range(self.meta_batch_size): 88 | obs_stacked = self._reshape_obs_phs(obs_phs[i]) 89 | distribution_info_vars.append(self.policy.distribution_info_sym(obs_stacked, params=adapted_policy_params[i])) 90 | 91 | current_policy_params = adapted_policy_params 92 | 93 | """ Outer (meta-)objective """ 94 | with tf.variable_scope("outer_update"): 95 | adv_phs, phs_dict = self._make_advantage_phs('step%i' % self.num_inner_grad_steps) 96 | self.meta_op_phs_dict.update(phs_dict) 97 | 98 | surr_objs = [] 99 | 100 | # meta-objective 101 | for i in range(self.meta_batch_size): 102 | action_stacked = self._reshape_action_phs(action_phs[i]) 103 | log_likelihood = 
self.policy.distribution.log_likelihood_sym(action_stacked, distribution_info_vars[i]) 104 | log_likelihood = tf.reshape(log_likelihood, tf.shape(mask_phs[i])) 105 | surr_obj = - tf.reduce_mean(log_likelihood * adv_phs[i] * mask_phs[i]) 106 | surr_objs.append(surr_obj) 107 | 108 | """ Mean over meta tasks """ 109 | meta_objective = tf.reduce_mean(tf.stack(surr_objs, 0)) 110 | 111 | self.optimizer.build_graph( 112 | loss=meta_objective, 113 | target=self.policy, 114 | input_ph_dict=self.meta_op_phs_dict, 115 | ) 116 | 117 | def _make_advantage_phs(self, prefix=''): 118 | adv_phs = [] 119 | all_phs_dict = OrderedDict() 120 | 121 | for task_id in range(self.meta_batch_size): 122 | # advantage ph 123 | ph = tf.placeholder(dtype=tf.float32, shape=[None, self.max_path_length], name='advantage' + '_' + prefix + '_' + str(task_id)) 124 | all_phs_dict['%s_task%i_%s' % (prefix, task_id, 'advantages')] = ph 125 | adv_phs.append(ph) 126 | 127 | return adv_phs, all_phs_dict 128 | -------------------------------------------------------------------------------- /meta_policy_search/meta_trainer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import time 4 | from meta_policy_search.utils import logger 5 | 6 | 7 | class Trainer(object): 8 | """ 9 | Performs steps of meta-policy search. 10 | 11 | Pseudocode:: 12 | 13 | for iter in n_iter: 14 | sample tasks 15 | for task in tasks: 16 | for adapt_step in num_inner_grad_steps 17 | sample trajectories with policy 18 | perform update/adaptation step 19 | sample trajectories with post-update policy 20 | perform meta-policy gradient step(s) 21 | 22 | Args: 23 | algo (Algo) : 24 | env (Env) : 25 | sampler (Sampler) : 26 | sample_processor (SampleProcessor) : 27 | baseline (Baseline) : 28 | policy (Policy) : 29 | n_itr (int) : Number of iterations to train for 30 | start_itr (int) : Number of iterations policy has already trained for, if reloading 31 | num_inner_grad_steps (int) : Number of inner steps per maml iteration 32 | sess (tf.Session) : current tf session (if we loaded policy, for example) 33 | """ 34 | def __init__( 35 | self, 36 | algo, 37 | env, 38 | sampler, 39 | sample_processor, 40 | policy, 41 | n_itr, 42 | start_itr=0, 43 | num_inner_grad_steps=1, 44 | sess=None, 45 | ): 46 | self.algo = algo 47 | self.env = env 48 | self.sampler = sampler 49 | self.sample_processor = sample_processor 50 | self.baseline = sample_processor.baseline 51 | self.policy = policy 52 | self.n_itr = n_itr 53 | self.start_itr = start_itr 54 | self.num_inner_grad_steps = num_inner_grad_steps 55 | if sess is None: 56 | sess = tf.Session() 57 | self.sess = sess 58 | 59 | def train(self): 60 | """ 61 | Trains policy on env using algo 62 | 63 | Pseudocode:: 64 | 65 | for itr in n_itr: 66 | for step in num_inner_grad_steps: 67 | sampler.sample() 68 | algo.compute_updated_dists() 69 | algo.optimize_policy() 70 | sampler.update_goals() 71 | """ 72 | with self.sess.as_default() as sess: 73 | 74 | # initialize uninitialized vars (only initialize vars that were not loaded) 75 | uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] 76 | sess.run(tf.variables_initializer(uninit_vars)) 77 | 78 | start_time = time.time() 79 | for itr in range(self.start_itr, self.n_itr): 80 | itr_start_time = time.time() 81 | logger.log("\n ---------------- Iteration %d ----------------" % itr) 82 | logger.log("Sampling set of tasks/goals for this meta-batch...") 83 | 84 
| self.sampler.update_tasks() 85 | self.policy.switch_to_pre_update() # Switch to pre-update policy 86 | 87 | all_samples_data, all_paths = [], [] 88 | list_sampling_time, list_inner_step_time, list_outer_step_time, list_proc_samples_time = [], [], [], [] 89 | start_total_inner_time = time.time() 90 | for step in range(self.num_inner_grad_steps+1): 91 | logger.log('** Step ' + str(step) + ' **') 92 | 93 | """ -------------------- Sampling --------------------------""" 94 | 95 | logger.log("Obtaining samples...") 96 | time_env_sampling_start = time.time() 97 | paths = self.sampler.obtain_samples(log=True, log_prefix='Step_%d-' % step) 98 | list_sampling_time.append(time.time() - time_env_sampling_start) 99 | all_paths.append(paths) 100 | 101 | """ ----------------- Processing Samples ---------------------""" 102 | 103 | logger.log("Processing samples...") 104 | time_proc_samples_start = time.time() 105 | samples_data = self.sample_processor.process_samples(paths, log='all', log_prefix='Step_%d-' % step) 106 | all_samples_data.append(samples_data) 107 | list_proc_samples_time.append(time.time() - time_proc_samples_start) 108 | 109 | self.log_diagnostics(sum(list(paths.values()), []), prefix='Step_%d-' % step) 110 | 111 | """ ------------------- Inner Policy Update --------------------""" 112 | 113 | time_inner_step_start = time.time() 114 | if step < self.num_inner_grad_steps: 115 | logger.log("Computing inner policy updates...") 116 | self.algo._adapt(samples_data) 117 | # train_writer = tf.summary.FileWriter('/home/ignasi/Desktop/meta_policy_search_graph', 118 | # sess.graph) 119 | list_inner_step_time.append(time.time() - time_inner_step_start) 120 | total_inner_time = time.time() - start_total_inner_time 121 | 122 | time_maml_opt_start = time.time() 123 | """ ------------------ Outer Policy Update ---------------------""" 124 | 125 | logger.log("Optimizing policy...") 126 | # This needs to take all samples_data so that it can construct graph for meta-optimization. 
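                # all_samples_data is a list of length num_inner_grad_steps + 1: entry k holds the
                # processed per-task sample dicts collected with the policy after k inner adaptation
                # steps, which is exactly the data the meta-objective graph is fed with.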
127 | time_outer_step_start = time.time() 128 | self.algo.optimize_policy(all_samples_data) 129 | 130 | """ ------------------- Logging Stuff --------------------------""" 131 | logger.logkv('Itr', itr) 132 | logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled) 133 | 134 | logger.logkv('Time-OuterStep', time.time() - time_outer_step_start) 135 | logger.logkv('Time-TotalInner', total_inner_time) 136 | logger.logkv('Time-InnerStep', np.sum(list_inner_step_time)) 137 | logger.logkv('Time-SampleProc', np.sum(list_proc_samples_time)) 138 | logger.logkv('Time-Sampling', np.sum(list_sampling_time)) 139 | 140 | logger.logkv('Time', time.time() - start_time) 141 | logger.logkv('ItrTime', time.time() - itr_start_time) 142 | logger.logkv('Time-MAMLSteps', time.time() - time_maml_opt_start) 143 | 144 | logger.log("Saving snapshot...") 145 | params = self.get_itr_snapshot(itr) 146 | logger.save_itr_params(itr, params) 147 | logger.log("Saved") 148 | 149 | logger.dumpkvs() 150 | 151 | logger.log("Training finished") 152 | self.sess.close() 153 | 154 | def get_itr_snapshot(self, itr): 155 | """ 156 | Gets the current policy and env for storage 157 | """ 158 | return dict(itr=itr, policy=self.policy, env=self.env, baseline=self.baseline) 159 | 160 | def log_diagnostics(self, paths, prefix): 161 | # TODO: we aren't using it so far 162 | self.env.log_diagnostics(paths, prefix) 163 | self.policy.log_diagnostics(paths, prefix) 164 | self.baseline.log_diagnostics(paths, prefix) 165 | -------------------------------------------------------------------------------- /meta_policy_search/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.optimizers.base import Optimizer 2 | from meta_policy_search.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer -------------------------------------------------------------------------------- /meta_policy_search/optimizers/base.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search import utils 2 | 3 | class Optimizer(object): 4 | def __init__(self): 5 | self._input_ph_dict = None 6 | 7 | def build_graph(self, loss, target, input_ph_dict): 8 | """ 9 | Sets the objective function and target weights for the optimize function 10 | 11 | Args: 12 | loss (tf_op) : minimization objective 13 | target (Policy) : Policy whose values we are optimizing over 14 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 15 | """ 16 | raise NotImplementedError 17 | 18 | def optimize(self, input_val_dict): 19 | """ 20 | Carries out the optimization step 21 | 22 | Args: 23 | input_val_dict (dict): dict containing the values to be fed into the computation graph 24 | 25 | """ 26 | raise NotImplementedError 27 | 28 | def loss(self, input_val_dict): 29 | """ 30 | Computes the value of the loss for given inputs 31 | 32 | Args: 33 | input_val_dict (dict): dict containing the values to be fed into the computation graph 34 | 35 | Returns: 36 | (float): value of the loss 37 | 38 | """ 39 | raise NotImplementedError 40 | 41 | def create_feed_dict(self, input_val_dict): 42 | return utils.create_feed_dict(placeholder_dict=self._input_ph_dict, value_dict=input_val_dict) 43 | -------------------------------------------------------------------------------- 
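# Illustrative usage sketch of the Optimizer interface defined above (not part of the original
# code base). `ToyTarget` and the quadratic loss are made up purely to show the
# build_graph -> optimize -> loss call sequence; in the repository the target is a Policy and
# the loss is the MAML/ProMP meta-objective. Assumes the package and TensorFlow 1.x are importable.

import numpy as np
import tensorflow as tf
from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer


class ToyTarget(object):
    """Minimal stand-in exposing the get_params() method that build_graph() expects."""
    def __init__(self):
        self.w = tf.Variable([1.0, -2.0], dtype=tf.float32, name='toy_w')

    def get_params(self):
        return [self.w]


if __name__ == '__main__':
    x_ph = tf.placeholder(tf.float32, shape=[None, 2], name='x')               # single input placeholder
    target = ToyTarget()
    loss = tf.reduce_mean(tf.square(tf.reduce_sum(x_ph * target.w, axis=-1)))  # toy quadratic loss

    optimizer = MAMLFirstOrderOptimizer(learning_rate=1e-2)
    optimizer.build_graph(loss=loss, target=target, input_ph_dict={'x': x_ph})

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch = {'x': np.random.randn(32, 2)}                                  # keys must match input_ph_dict
        print('loss before step:', optimizer.optimize(input_val_dict=batch))
        print('loss after step: ', optimizer.loss(input_val_dict=batch))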
/meta_policy_search/optimizers/maml_first_order_optimizer.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.utils import logger 2 | from meta_policy_search.optimizers.base import Optimizer 3 | import tensorflow as tf 4 | 5 | class MAMLFirstOrderOptimizer(Optimizer): 6 | """ 7 | Optimizer for first order methods (SGD, Adam) 8 | 9 | Args: 10 | tf_optimizer_cls (tf.train.optimizer): desired tensorflow optimzier for training 11 | tf_optimizer_args (dict or None): arguments for the optimizer 12 | learning_rate (float): learning rate 13 | max_epochs: number of maximum epochs for training 14 | tolerance (float): tolerance for early stopping. If the loss fucntion decreases less than the specified tolerance 15 | after an epoch, then the training stops. 16 | num_minibatches (int): number of mini-batches for performing the gradient step. The mini-batch size is 17 | batch size//num_minibatches. 18 | verbose (bool): Whether to log or not the optimization process 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | tf_optimizer_cls=tf.train.AdamOptimizer, 25 | tf_optimizer_args=None, 26 | learning_rate=1e-3, 27 | max_epochs=1, 28 | tolerance=1e-6, 29 | num_minibatches=1, 30 | verbose=False 31 | ): 32 | 33 | self._target = None 34 | if tf_optimizer_args is None: 35 | tf_optimizer_args = dict() 36 | tf_optimizer_args['learning_rate'] = learning_rate 37 | 38 | self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args) 39 | self._max_epochs = max_epochs 40 | self._tolerance = tolerance 41 | self._num_minibatches = num_minibatches # Unused 42 | self._verbose = verbose 43 | self._all_inputs = None 44 | self._train_op = None 45 | self._loss = None 46 | self._input_ph_dict = None 47 | 48 | def build_graph(self, loss, target, input_ph_dict): 49 | """ 50 | Sets the objective function and target weights for the optimize function 51 | 52 | Args: 53 | loss (tf_op) : minimization objective 54 | target (Policy) : Policy whose values we are optimizing over 55 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 56 | """ 57 | assert isinstance(loss, tf.Tensor) 58 | assert hasattr(target, 'get_params') 59 | assert isinstance(input_ph_dict, dict) 60 | 61 | self._target = target 62 | self._input_ph_dict = input_ph_dict 63 | self._loss = loss 64 | self._train_op = self._tf_optimizer.minimize(loss, var_list=target.get_params()) 65 | 66 | def loss(self, input_val_dict): 67 | """ 68 | Computes the value of the loss for given inputs 69 | 70 | Args: 71 | input_val_dict (dict): dict containing the values to be fed into the computation graph 72 | 73 | Returns: 74 | (float): value of the loss 75 | 76 | """ 77 | sess = tf.get_default_session() 78 | feed_dict = self.create_feed_dict(input_val_dict) 79 | loss = sess.run(self._loss, feed_dict=feed_dict) 80 | return loss 81 | 82 | def optimize(self, input_val_dict): 83 | """ 84 | Carries out the optimization step 85 | 86 | Args: 87 | input_val_dict (dict): dict containing the values to be fed into the computation graph 88 | 89 | Returns: 90 | (float) loss before optimization 91 | 92 | """ 93 | 94 | sess = tf.get_default_session() 95 | feed_dict = self.create_feed_dict(input_val_dict) 96 | 97 | # Overload self._batch size 98 | # dataset = MAMLBatchDataset(inputs, num_batches=self._batch_size, extra_inputs=extra_inputs, meta_batch_size=self.meta_batch_size, num_grad_updates=self.num_grad_updates) 99 | # Todo: reimplement minibatches 100 | 101 | loss_before_opt = None 102 | 
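        # Note: only the loss measured before the first gradient step is recorded and returned;
        # the tolerance-based early stopping and the minibatching mentioned in the constructor
        # docstring are currently disabled (the block below is commented out and the
        # _num_minibatches attribute is unused).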
for epoch in range(self._max_epochs): 103 | if self._verbose: 104 | logger.log("Epoch %d" % epoch) 105 | 106 | loss, _ = sess.run([self._loss, self._train_op], feed_dict) 107 | if not loss_before_opt: loss_before_opt = loss 108 | 109 | # if self._verbose: 110 | # logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss)) 111 | # 112 | # if abs(last_loss - new_loss) < self._tolerance: 113 | # break 114 | # last_loss = new_loss 115 | return loss_before_opt 116 | 117 | 118 | class MAMLPPOOptimizer(MAMLFirstOrderOptimizer): 119 | """ 120 | Adds inner and outer kl terms to first order optimizer #TODO: (Do we really need this?) 121 | 122 | """ 123 | def __init__(self, *args, **kwargs): 124 | # Todo: reimplement minibatches 125 | super(MAMLPPOOptimizer, self).__init__(*args, **kwargs) 126 | self._inner_kl = None 127 | self._outer_kl = None 128 | 129 | def build_graph(self, loss, target, input_ph_dict, inner_kl=None, outer_kl=None): 130 | """ 131 | Sets the objective function and target weights for the optimize function 132 | 133 | Args: 134 | loss (tf.Tensor) : minimization objective 135 | target (Policy) : Policy whose values we are optimizing over 136 | input_ph_dict (dict) : dict containing the placeholders of the computation graph corresponding to loss 137 | inner_kl (list): list with the inner kl loss for each task 138 | outer_kl (list): list with the outer kl loss for each task 139 | """ 140 | super(MAMLPPOOptimizer, self).build_graph(loss, target, input_ph_dict) 141 | assert inner_kl is not None 142 | 143 | self._inner_kl = inner_kl 144 | self._outer_kl = outer_kl 145 | 146 | def compute_stats(self, input_val_dict): 147 | """ 148 | Computes the value the loss, the outer KL and the inner KL-divergence between the current policy and the 149 | provided dist_info_data 150 | 151 | Args: 152 | inputs (list): inputs needed to compute the inner KL 153 | extra_inputs (list): additional inputs needed to compute the inner KL 154 | 155 | Returns: 156 | (float): value of the loss 157 | (ndarray): inner kls - numpy array of shape (num_inner_grad_steps,) 158 | (float): outer_kl 159 | """ 160 | sess = tf.get_default_session() 161 | feed_dict = self.create_feed_dict(input_val_dict) 162 | loss, inner_kl, outer_kl = sess.run([self._loss, self._inner_kl, self._outer_kl], feed_dict=feed_dict) 163 | return loss, inner_kl, outer_kl 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /meta_policy_search/policies/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.policies.base import MetaPolicy 2 | from meta_policy_search.policies.base import Policy 3 | from meta_policy_search.policies.gaussian_mlp_policy import GaussianMLPPolicy 4 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/meta_policy_search/policies/distributions/__init__.py -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/base.py: -------------------------------------------------------------------------------- 1 | class Distribution(object): 2 | """ 3 | General methods for a generic distribution 4 | """ 5 | @property 
6 | def dim(self): 7 | raise NotImplementedError 8 | 9 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 10 | """ 11 | Symbolic KL divergence of two distributions 12 | 13 | Args: 14 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 15 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 16 | 17 | Returns: 18 | (tf.Tensor) : Symbolic representation of kl divergence (tensorflow op) 19 | """ 20 | raise NotImplementedError 21 | 22 | def kl(self, old_dist_info, new_dist_info): 23 | """ 24 | Compute the KL divergence of two distributions 25 | 26 | Args: 27 | old_dist_info (dict): dict of old distribution parameters as numpy array 28 | new_dist_info (dict): dict of new distribution parameters as numpy array 29 | 30 | Returns: 31 | (numpy array): kl divergence of distributions 32 | """ 33 | raise NotImplementedError 34 | 35 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 36 | """ 37 | Symbolic likelihood ratio p_new(x)/p_old(x) of two distributions 38 | 39 | Args: 40 | x_var (tf.Tensor): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 41 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 42 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 43 | 44 | Returns: 45 | (tf.Tensor): likelihood ratio 46 | """ 47 | raise NotImplementedError 48 | 49 | def likelihood_ratio(self, x_var, old_dist_info, new_dist_info): 50 | """ 51 | Compute the likelihood ratio p_new(x)/p_old(x) of two distributions 52 | 53 | Args: 54 | x_var (numpy array): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 55 | old_dist_info_vars (dict) : dict of old distribution parameters as numpy array 56 | new_dist_info_vars (dict) : dict of new distribution parameters as numpy array 57 | 58 | Returns: 59 | (numpy array): likelihood ratio 60 | """ 61 | raise NotImplementedError 62 | 63 | def entropy_sym(self, dist_info_vars): 64 | """ 65 | Symbolic entropy of the distribution 66 | 67 | Args: 68 | dist_info (dict) : dict of distribution parameters as tf.Tensor 69 | 70 | Returns: 71 | (tf.Tensor): entropy 72 | """ 73 | raise NotImplementedError 74 | 75 | def entropy(self, dist_info): 76 | """ 77 | Compute the entropy of the distribution 78 | 79 | Args: 80 | dist_info (dict) : dict of distribution parameters as numpy array 81 | 82 | Returns: 83 | (numpy array): entropy 84 | """ 85 | raise NotImplementedError 86 | 87 | def log_likelihood_sym(self, x_var, dist_info_vars): 88 | """ 89 | Symbolic log likelihood log p(x) of the distribution 90 | 91 | Args: 92 | x_var (tf.Tensor): variable where to evaluate the log likelihood 93 | dist_info_vars (dict) : dict of distribution parameters as tf.Tensor 94 | 95 | Returns: 96 | (numpy array): log likelihood 97 | """ 98 | raise NotImplementedError 99 | 100 | def log_likelihood(self, xs, dist_info): 101 | """ 102 | Compute the log likelihood log p(x) of the distribution 103 | 104 | Args: 105 | x_var (numpy array): variable where to evaluate the log likelihood 106 | dist_info_vars (dict) : dict of distribution parameters as numpy array 107 | 108 | Returns: 109 | (numpy array): log likelihood 110 | """ 111 | raise NotImplementedError 112 | 113 | def sample(self, dist_info): 114 | """ 115 | Draws a sample from the distribution 116 | 117 | Args: 118 | dist_info (dict) : dict of distribution parameter instantiations as numpy array 119 | 120 | Returns: 121 | (obj): sample drawn from the corresponding instantiation 122 | """ 123 
| raise NotImplementedError 124 | 125 | @property 126 | def dist_info_specs(self): 127 | raise NotImplementedError 128 | 129 | @property 130 | def dist_info_keys(self): 131 | return [k for k, _ in self.dist_info_specs] 132 | -------------------------------------------------------------------------------- /meta_policy_search/policies/distributions/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from meta_policy_search.policies.distributions.base import Distribution 4 | 5 | class DiagonalGaussian(Distribution): 6 | """ 7 | General methods for a diagonal gaussian distribution of this size 8 | """ 9 | def __init__(self, dim): 10 | self._dim = dim 11 | 12 | @property 13 | def dim(self): 14 | return self._dim 15 | 16 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 17 | """ 18 | Computes the symbolic representation of the KL divergence of two multivariate 19 | Gaussian distribution with diagonal covariance matrices 20 | 21 | Args: 22 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 23 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 24 | 25 | Returns: 26 | (tf.Tensor) : Symbolic representation of kl divergence (tensorflow op) 27 | """ 28 | old_means = old_dist_info_vars["mean"] 29 | old_log_stds = old_dist_info_vars["log_std"] 30 | new_means = new_dist_info_vars["mean"] 31 | new_log_stds = new_dist_info_vars["log_std"] 32 | 33 | # assert ranks 34 | tf.assert_rank(old_means, 2), tf.assert_rank(old_log_stds, 2) 35 | tf.assert_rank(new_means, 2), tf.assert_rank(new_log_stds, 2) 36 | 37 | old_std = tf.exp(old_log_stds) 38 | new_std = tf.exp(new_log_stds) 39 | 40 | numerator = tf.square(old_means - new_means) + \ 41 | tf.square(old_std) - tf.square(new_std) 42 | denominator = 2 * tf.square(new_std) + 1e-8 43 | return tf.reduce_sum( 44 | numerator / denominator + new_log_stds - old_log_stds, reduction_indices=-1) 45 | 46 | def kl(self, old_dist_info, new_dist_info): 47 | """ 48 | Compute the KL divergence of two multivariate Gaussian distribution with 49 | diagonal covariance matrices 50 | 51 | Args: 52 | old_dist_info (dict): dict of old distribution parameters as numpy array 53 | new_dist_info (dict): dict of new distribution parameters as numpy array 54 | 55 | Returns: 56 | (numpy array): kl divergence of distributions 57 | """ 58 | old_means = old_dist_info["mean"] 59 | old_log_stds = old_dist_info["log_std"] 60 | new_means = new_dist_info["mean"] 61 | new_log_stds = new_dist_info["log_std"] 62 | 63 | old_std = np.exp(old_log_stds) 64 | new_std = np.exp(new_log_stds) 65 | numerator = np.square(old_means - new_means) + \ 66 | np.square(old_std) - np.square(new_std) 67 | denominator = 2 * np.square(new_std) + 1e-8 68 | return np.sum( 69 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 70 | 71 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 72 | """ 73 | Symbolic likelihood ratio p_new(x)/p_old(x) of two distributions 74 | 75 | Args: 76 | x_var (tf.Tensor): variable where to evaluate the likelihood ratio p_new(x)/p_old(x) 77 | old_dist_info_vars (dict) : dict of old distribution parameters as tf.Tensor 78 | new_dist_info_vars (dict) : dict of new distribution parameters as tf.Tensor 79 | 80 | Returns: 81 | (tf.Tensor): likelihood ratio 82 | """ 83 | with tf.variable_scope("log_li_new"): 84 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 85 | with 
tf.variable_scope("log_li_old"): 86 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 87 | return tf.exp(logli_new - logli_old) 88 | 89 | def log_likelihood_sym(self, x_var, dist_info_vars): 90 | """ 91 | Symbolic log likelihood log p(x) of the distribution 92 | 93 | Args: 94 | x_var (tf.Tensor): variable where to evaluate the log likelihood 95 | dist_info_vars (dict) : dict of distribution parameters as tf.Tensor 96 | 97 | Returns: 98 | (numpy array): log likelihood 99 | """ 100 | means = dist_info_vars["mean"] 101 | log_stds = dist_info_vars["log_std"] 102 | 103 | # assert ranks 104 | tf.assert_rank(x_var, 2), tf.assert_rank(means, 2), tf.assert_rank(log_stds, 2) 105 | 106 | zs = (x_var - means) / tf.exp(log_stds) 107 | return - tf.reduce_sum(log_stds, reduction_indices=-1) - \ 108 | 0.5 * tf.reduce_sum(tf.square(zs), reduction_indices=-1) - \ 109 | 0.5 * self.dim * np.log(2 * np.pi) 110 | 111 | def log_likelihood(self, xs, dist_info): 112 | """ 113 | Compute the log likelihood log p(x) of the distribution 114 | 115 | Args: 116 | x_var (numpy array): variable where to evaluate the log likelihood 117 | dist_info_vars (dict) : dict of distribution parameters as numpy array 118 | 119 | Returns: 120 | (numpy array): log likelihood 121 | """ 122 | means = dist_info["mean"] 123 | log_stds = dist_info["log_std"] 124 | zs = (xs - means) / np.exp(log_stds) 125 | return - np.sum(log_stds, axis=-1) - \ 126 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 127 | 0.5 * self.dim * np.log(2 * np.pi) 128 | 129 | def entropy_sym(self, dist_info_vars): 130 | """ 131 | Symbolic entropy of the distribution 132 | 133 | Args: 134 | dist_info (dict) : dict of distribution parameters as tf.Tensor 135 | 136 | Returns: 137 | (tf.Tensor): entropy 138 | """ 139 | log_stds = dist_info_vars["log_std"] 140 | return tf.reduce_sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), reduction_indices=-1) 141 | 142 | def entropy(self, dist_info): 143 | """ 144 | Compute the entropy of the distribution 145 | 146 | Args: 147 | dist_info (dict) : dict of distribution parameters as numpy array 148 | 149 | Returns: 150 | (numpy array): entropy 151 | """ 152 | log_stds = dist_info["log_std"] 153 | return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) 154 | 155 | def sample(self, dist_info): 156 | """ 157 | Draws a sample from the distribution 158 | 159 | Args: 160 | dist_info (dict) : dict of distribution parameter instantiations as numpy array 161 | 162 | Returns: 163 | (obj): sample drawn from the corresponding instantiation 164 | """ 165 | means = dist_info["mean"] 166 | log_stds = dist_info["log_std"] 167 | rnd = np.random.normal(size=means.shape) 168 | return rnd * np.exp(log_stds) + means 169 | 170 | @property 171 | def dist_info_specs(self): 172 | return [("mean", (self.dim,)), ("log_std", (self.dim,))] 173 | -------------------------------------------------------------------------------- /meta_policy_search/policies/networks/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class MyTestCase(unittest.TestCase): 5 | def test_something(self): 6 | self.assertEqual(True, False) 7 | 8 | 9 | if __name__ == '__main__': 10 | unittest.main() -------------------------------------------------------------------------------- /meta_policy_search/policies/networks/mlp.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from meta_policy_search.utils.utils import 
get_original_tf_name, get_last_scope 3 | 4 | 5 | def create_mlp(name, 6 | output_dim, 7 | hidden_sizes, 8 | hidden_nonlinearity, 9 | output_nonlinearity, 10 | input_dim=None, 11 | input_var=None, 12 | w_init=tf.contrib.layers.xavier_initializer(), 13 | b_init=tf.zeros_initializer(), 14 | reuse=False 15 | ): 16 | """ 17 | Creates a MLP network 18 | Args: 19 | name (str): scope of the neural network 20 | output_dim (int): dimension of the output 21 | hidden_sizes (tuple): tuple with the hidden sizes of the fully connected network 22 | hidden_nonlinearity (tf): non-linearity for the activations in the hidden layers 23 | output_nonlinearity (tf or None): output non-linearity. None results in no non-linearity being applied 24 | input_dim (tuple): dimensions of the input variable e.g. (None, action_dim) 25 | input_var (tf.placeholder or tf.Variable or None): Input of the network as a symbolic variable 26 | w_init (tf.initializer): initializer for the weights 27 | b_init (tf.initializer): initializer for the biases 28 | reuse (bool): reuse or not the network 29 | 30 | Returns: 31 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 32 | output_var (tf.Tensor): Output of the network as a symbolic variable 33 | 34 | """ 35 | 36 | assert input_var is not None or input_dim is not None 37 | 38 | if input_var is None: 39 | input_var = tf.placeholder(dtype=tf.float32, shape=input_dim, name='input') 40 | with tf.variable_scope(name): 41 | x = input_var 42 | 43 | for idx, hidden_size in enumerate(hidden_sizes): 44 | x = tf.layers.dense(x, 45 | hidden_size, 46 | name='hidden_%d' % idx, 47 | activation=hidden_nonlinearity, 48 | kernel_initializer=w_init, 49 | bias_initializer=b_init, 50 | reuse=reuse, 51 | ) 52 | 53 | output_var = tf.layers.dense(x, 54 | output_dim, 55 | name='output', 56 | activation=output_nonlinearity, 57 | kernel_initializer=w_init, 58 | bias_initializer=b_init, 59 | reuse=reuse, 60 | ) 61 | 62 | return input_var, output_var 63 | 64 | 65 | def forward_mlp(output_dim, 66 | hidden_sizes, 67 | hidden_nonlinearity, 68 | output_nonlinearity, 69 | input_var, 70 | mlp_params, 71 | ): 72 | """ 73 | Creates the forward pass of an mlp given the input vars and the mlp params. Assumes that the params are passed in 74 | order i.e. [hidden_0/kernel, hidden_0/bias, hidden_1/kernel, hidden_1/bias, ..., output/kernel, output/bias] 75 | Args: 76 | output_dim (int): dimension of the output 77 | hidden_sizes (tuple): tuple with the hidden sizes of the fully connected network 78 | hidden_nonlinearity (tf): non-linearity for the activations in the hidden layers 79 | output_nonlinearity (tf or None): output non-linearity. None results in no non-linearity being applied 80 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 81 | mlp_params (OrderedDict): OrderedDict of the params of the neural network. 
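        Example (illustrative sketch; ``obs_var`` and ``adapted_params`` are hypothetical names,
        e.g. the adapted parameters produced by a MAML inner step)::

            _, adapted_mean = forward_mlp(
                output_dim=action_dim,
                hidden_sizes=(64, 64),
                hidden_nonlinearity=tf.tanh,
                output_nonlinearity=None,
                input_var=obs_var,
                mlp_params=adapted_params,  # keys ordered: hidden_0/kernel, hidden_0/bias, ..., output/kernel, output/bias
            )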
82 | 83 | Returns: 84 | input_var (tf.placeholder or tf.Variable): Input of the network as a symbolic variable 85 | output_var (tf.Tensor): Output of the network as a symbolic variable 86 | 87 | """ 88 | x = input_var 89 | idx = 0 90 | bias_added = False 91 | sizes = tuple(hidden_sizes) + (output_dim,) 92 | 93 | if output_nonlinearity is None: 94 | output_nonlinearity = tf.identity 95 | 96 | for name, param in mlp_params.items(): 97 | assert str(idx) in name or (idx == len(hidden_sizes) and "output" in name) 98 | 99 | if "kernel" in name: 100 | assert param.shape == (x.shape[-1], sizes[idx]) 101 | x = tf.matmul(x, param) 102 | elif "bias" in name: 103 | assert param.shape == (sizes[idx],) 104 | x = tf.add(x, param) 105 | bias_added = True 106 | else: 107 | raise NameError 108 | 109 | if bias_added: 110 | if "hidden" in name: 111 | x = hidden_nonlinearity(x) 112 | elif "output" in name: 113 | x = output_nonlinearity(x) 114 | else: 115 | raise NameError 116 | idx += 1 117 | bias_added = False 118 | output_var = x 119 | return input_var, output_var # Todo why return input_var? 120 | 121 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import Sampler 2 | from meta_policy_search.samplers.base import SampleProcessor 3 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 4 | from meta_policy_search.samplers.meta_sampler import MetaSampler 5 | from meta_policy_search.samplers.dice_sample_processor import DiceSampleProcessor 6 | from meta_policy_search.samplers.meta_sample_processor import DiceMetaSampleProcessor 7 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/meta_sample_processor.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import SampleProcessor 2 | from meta_policy_search.samplers.dice_sample_processor import DiceSampleProcessor 3 | from meta_policy_search.utils import utils 4 | import numpy as np 5 | 6 | class MetaSampleProcessor(SampleProcessor): 7 | 8 | def process_samples(self, paths_meta_batch, log=False, log_prefix=''): 9 | """ 10 | Processes sampled paths. 
This involves: 11 | - computing discounted rewards (returns) 12 | - fitting baseline estimator using the path returns and predicting the return baselines 13 | - estimating the advantages using GAE (+ advantage normalization id desired) 14 | - stacking the path data 15 | - logging statistics of the paths 16 | 17 | Args: 18 | paths_meta_batch (dict): A list of dict of lists, size: [meta_batch_size] x (batch_size) x [5] x (max_path_length) 19 | log (boolean): indicates whether to log 20 | log_prefix (str): prefix for the logging keys 21 | 22 | Returns: 23 | (list of dicts) : Processed sample data among the meta-batch; size: [meta_batch_size] x [7] x (batch_size x max_path_length) 24 | """ 25 | assert isinstance(paths_meta_batch, dict), 'paths must be a dict' 26 | assert self.baseline, 'baseline must be specified' 27 | 28 | samples_data_meta_batch = [] 29 | all_paths = [] 30 | 31 | for meta_task, paths in paths_meta_batch.items(): 32 | 33 | # fits baseline, compute advantages and stack path data 34 | samples_data, paths = self._compute_samples_data(paths) 35 | 36 | samples_data_meta_batch.append(samples_data) 37 | all_paths.extend(paths) 38 | 39 | # 7) compute normalized trajectory-batch rewards (for E-MAML) 40 | overall_avg_reward = np.mean(np.concatenate([samples_data['rewards'] for samples_data in samples_data_meta_batch])) 41 | overall_avg_reward_std = np.std(np.concatenate([samples_data['rewards'] for samples_data in samples_data_meta_batch])) 42 | 43 | for samples_data in samples_data_meta_batch: 44 | samples_data['adj_avg_rewards'] = (samples_data['rewards'] - overall_avg_reward) / (overall_avg_reward_std + 1e-8) 45 | 46 | # 8) log statistics if desired 47 | self._log_path_stats(all_paths, log=log, log_prefix=log_prefix) 48 | 49 | return samples_data_meta_batch 50 | 51 | class DiceMetaSampleProcessor(DiceSampleProcessor): 52 | process_samples = MetaSampleProcessor.process_samples -------------------------------------------------------------------------------- /meta_policy_search/samplers/meta_sampler.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.samplers.base import Sampler 2 | from meta_policy_search.samplers.vectorized_env_executor import MetaParallelEnvExecutor, MetaIterativeEnvExecutor 3 | from meta_policy_search.utils import utils, logger 4 | from collections import OrderedDict 5 | 6 | from pyprind import ProgBar 7 | import numpy as np 8 | import time 9 | import itertools 10 | 11 | 12 | class MetaSampler(Sampler): 13 | """ 14 | Sampler for Meta-RL 15 | 16 | Args: 17 | env (meta_policy_search.envs.base.MetaEnv) : environment object 18 | policy (meta_policy_search.policies.base.Policy) : policy object 19 | batch_size (int) : number of trajectories per task 20 | meta_batch_size (int) : number of meta tasks 21 | max_path_length (int) : max number of steps per trajectory 22 | envs_per_task (int) : number of envs to run vectorized for each task (influences the memory usage) 23 | """ 24 | 25 | def __init__( 26 | self, 27 | env, 28 | policy, 29 | rollouts_per_meta_task, 30 | meta_batch_size, 31 | max_path_length, 32 | envs_per_task=None, 33 | parallel=False 34 | ): 35 | super(MetaSampler, self).__init__(env, policy, rollouts_per_meta_task, max_path_length) 36 | assert hasattr(env, 'set_task') 37 | 38 | self.envs_per_task = rollouts_per_meta_task if envs_per_task is None else envs_per_task 39 | self.meta_batch_size = meta_batch_size 40 | self.total_samples = meta_batch_size * rollouts_per_meta_task * max_path_length 41 | 
self.parallel = parallel 42 | self.total_timesteps_sampled = 0 43 | 44 | # setup vectorized environment 45 | 46 | if self.parallel: 47 | self.vec_env = MetaParallelEnvExecutor(env, self.meta_batch_size, self.envs_per_task, self.max_path_length) 48 | else: 49 | self.vec_env = MetaIterativeEnvExecutor(env, self.meta_batch_size, self.envs_per_task, self.max_path_length) 50 | 51 | def update_tasks(self): 52 | """ 53 | Samples a new goal for each meta task 54 | """ 55 | tasks = self.env.sample_tasks(self.meta_batch_size) 56 | assert len(tasks) == self.meta_batch_size 57 | self.vec_env.set_tasks(tasks) 58 | 59 | def obtain_samples(self, log=False, log_prefix=''): 60 | """ 61 | Collect batch_size trajectories from each task 62 | 63 | Args: 64 | log (boolean): whether to log sampling times 65 | log_prefix (str) : prefix for logger 66 | 67 | Returns: 68 | (dict) : A dict of paths of size [meta_batch_size] x (batch_size) x [5] x (max_path_length) 69 | """ 70 | 71 | # initial setup / preparation 72 | paths = OrderedDict() 73 | for i in range(self.meta_batch_size): 74 | paths[i] = [] 75 | 76 | n_samples = 0 77 | running_paths = [_get_empty_running_paths_dict() for _ in range(self.vec_env.num_envs)] 78 | 79 | pbar = ProgBar(self.total_samples) 80 | policy_time, env_time = 0, 0 81 | 82 | policy = self.policy 83 | 84 | # initial reset of envs 85 | obses = self.vec_env.reset() 86 | 87 | while n_samples < self.total_samples: 88 | 89 | # execute policy 90 | t = time.time() 91 | obs_per_task = np.split(np.asarray(obses), self.meta_batch_size) 92 | actions, agent_infos = policy.get_actions(obs_per_task) 93 | policy_time += time.time() - t 94 | 95 | # step environments 96 | t = time.time() 97 | actions = np.concatenate(actions) # stack meta batch 98 | next_obses, rewards, dones, env_infos = self.vec_env.step(actions) 99 | env_time += time.time() - t 100 | 101 | # stack agent_infos and if no infos were provided (--> None) create empty dicts 102 | agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos) 103 | 104 | new_samples = 0 105 | for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, 106 | rewards, env_infos, agent_infos, 107 | dones): 108 | # append new samples to running paths 109 | running_paths[idx]["observations"].append(observation) 110 | running_paths[idx]["actions"].append(action) 111 | running_paths[idx]["rewards"].append(reward) 112 | running_paths[idx]["env_infos"].append(env_info) 113 | running_paths[idx]["agent_infos"].append(agent_info) 114 | 115 | # if running path is done, add it to paths and empty the running path 116 | if done: 117 | paths[idx // self.envs_per_task].append(dict( 118 | observations=np.asarray(running_paths[idx]["observations"]), 119 | actions=np.asarray(running_paths[idx]["actions"]), 120 | rewards=np.asarray(running_paths[idx]["rewards"]), 121 | env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), 122 | agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), 123 | )) 124 | new_samples += len(running_paths[idx]["rewards"]) 125 | running_paths[idx] = _get_empty_running_paths_dict() 126 | 127 | pbar.update(new_samples) 128 | n_samples += new_samples 129 | obses = next_obses 130 | pbar.stop() 131 | 132 | self.total_timesteps_sampled += self.total_samples 133 | if log: 134 | logger.logkv(log_prefix + "PolicyExecTime", policy_time) 135 | logger.logkv(log_prefix + "EnvExecTime", env_time) 136 | 137 | return paths 138 | 139 | def _handle_info_dicts(self, agent_infos, 
env_infos): 140 | if not env_infos: 141 | env_infos = [dict() for _ in range(self.vec_env.num_envs)] 142 | if not agent_infos: 143 | agent_infos = [dict() for _ in range(self.vec_env.num_envs)] 144 | else: 145 | assert len(agent_infos) == self.meta_batch_size 146 | assert len(agent_infos[0]) == self.envs_per_task 147 | agent_infos = sum(agent_infos, []) # stack agent_infos 148 | 149 | assert len(agent_infos) == self.meta_batch_size * self.envs_per_task == len(env_infos) 150 | return agent_infos, env_infos 151 | 152 | 153 | def _get_empty_running_paths_dict(): 154 | return dict(observations=[], actions=[], rewards=[], env_infos=[], agent_infos=[]) 155 | -------------------------------------------------------------------------------- /meta_policy_search/samplers/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | 5 | def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', ignore_done=False): 6 | observations = [] 7 | actions = [] 8 | rewards = [] 9 | agent_infos = [] 10 | env_infos = [] 11 | images = [] 12 | 13 | ''' get wrapped env ''' 14 | wrapped_env = env 15 | while hasattr(wrapped_env, '_wrapped_env'): 16 | wrapped_env = wrapped_env._wrapped_env 17 | 18 | frame_skip = wrapped_env.frame_skip if hasattr(wrapped_env, 'frame_skip') else 1 19 | assert hasattr(wrapped_env, 'dt'), 'environment must have dt attribute that specifies the timestep' 20 | timestep = wrapped_env.dt 21 | 22 | o = env.reset() 23 | agent.reset() 24 | path_length = 0 25 | if animated: 26 | env.render() 27 | 28 | while path_length < max_path_length: 29 | a, agent_info = agent.get_action([o]) 30 | next_o, r, d, env_info = env.step(a) 31 | observations.append(env.observation_space.flatten(o)) 32 | rewards.append(r) 33 | actions.append(env.action_space.flatten(a)) 34 | agent_infos.append(agent_info) 35 | env_infos.append(env_info) 36 | path_length += 1 37 | if d and not ignore_done: # and not animated: 38 | break 39 | o = next_o 40 | if animated: 41 | env.render() 42 | time.sleep(timestep*frame_skip / speedup) 43 | if save_video: 44 | from PIL import Image 45 | image = env.wrapped_env.wrapped_env.get_viewer().get_image() 46 | pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0]) 47 | images.append(np.flipud(np.array(pil_image))) 48 | 49 | if animated: 50 | if save_video: 51 | import moviepy.editor as mpy 52 | fps = int(speedup/timestep * frame_skip) 53 | clip = mpy.ImageSequenceClip(images, fps=fps) 54 | if video_filename[-3:] == 'gif': 55 | clip.write_gif(video_filename, fps=fps) 56 | else: 57 | clip.write_videofile(video_filename, fps=fps) 58 | #return 59 | 60 | return dict( 61 | observations=observations, 62 | actons=actions, 63 | rewards=rewards, 64 | agent_infos=agent_infos, 65 | env_infos=env_infos 66 | ) -------------------------------------------------------------------------------- /meta_policy_search/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.utils.serializable import Serializable 2 | from meta_policy_search.utils.utils import * -------------------------------------------------------------------------------- /meta_policy_search/utils/serializable.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2016 rllab contributors 3 | https://github.com/rll/rllab 4 | """ 5 | 6 | import inspect 7 | import sys 8 | 9 | 10 | 
class Serializable(object): 11 | 12 | def __init__(self, *args, **kwargs): 13 | self.__args = args 14 | self.__kwargs = kwargs 15 | 16 | def quick_init(self, locals_): 17 | try: 18 | if object.__getattribute__(self, "_serializable_initialized"): 19 | return 20 | except AttributeError: 21 | pass 22 | if sys.version_info >= (3, 0): 23 | spec = inspect.getfullargspec(self.__init__) 24 | # Exclude the first "self" parameter 25 | if spec.varkw: 26 | kwargs = locals_[spec.varkw] 27 | else: 28 | kwargs = dict() 29 | else: 30 | spec = inspect.getargspec(self.__init__) 31 | if spec.keywords: 32 | kwargs = locals_[spec.keywords] 33 | else: 34 | kwargs = dict() 35 | if spec.varargs: 36 | varargs = locals_[spec.varargs] 37 | else: 38 | varargs = tuple() 39 | in_order_args = [locals_[arg] for arg in spec.args][1:] 40 | self.__args = tuple(in_order_args) + varargs 41 | self.__kwargs = kwargs 42 | setattr(self, "_serializable_initialized", True) 43 | 44 | def __getstate__(self): 45 | return {"__args": self.__args, "__kwargs": self.__kwargs} 46 | 47 | def __setstate__(self, d): 48 | out = type(self)(*d["__args"], **d["__kwargs"]) 49 | self.__dict__.update(out.__dict__) 50 | 51 | @classmethod 52 | def clone(cls, obj, **kwargs): 53 | assert isinstance(obj, Serializable) 54 | d = obj.__getstate__() 55 | 56 | # Split the entries in kwargs between positional and keyword arguments 57 | # and update d['__args'] and d['__kwargs'], respectively. 58 | if sys.version_info >= (3, 0): 59 | spec = inspect.getfullargspec(obj.__init__) 60 | else: 61 | spec = inspect.getargspec(obj.__init__) 62 | in_order_args = spec.args[1:] 63 | 64 | d["__args"] = list(d["__args"]) 65 | for kw, val in kwargs.items(): 66 | if kw in in_order_args: 67 | d["__args"][in_order_args.index(kw)] = val 68 | else: 69 | d["__kwargs"][kw] = val 70 | 71 | out = type(obj).__new__(type(obj)) 72 | out.__setstate__(d) 73 | return out 74 | -------------------------------------------------------------------------------- /meta_policy_search/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | import scipy.signal 4 | import json 5 | 6 | def get_original_tf_name(name): 7 | """ 8 | Args: 9 | name (str): full name of the tf variable with all the scopes 10 | 11 | Returns: 12 | (str): name given to the variable when creating it (i.e. 
name of the variable w/o the scope and the colons) 13 | """ 14 | return name.split("/")[-1].split(":")[0] 15 | 16 | 17 | def remove_scope_from_name(name, scope): 18 | """ 19 | Args: 20 | name (str): full name of the tf variable with all the scopes 21 | 22 | Returns: 23 | (str): full name of the variable with the scope removed 24 | """ 25 | result = name.split(scope)[1] 26 | result = result[1:] if result[0] == '/' else result 27 | return result.split(":")[0] 28 | 29 | def remove_first_scope_from_name(name): 30 | return name.replace(name + '/', "").split(":")[0] 31 | 32 | def get_last_scope(name): 33 | """ 34 | Args: 35 | name (str): full name of the tf variable with all the scopes 36 | 37 | Returns: 38 | (str): name of the last scope 39 | """ 40 | return name.split("/")[-2] 41 | 42 | 43 | def extract(x, *keys): 44 | """ 45 | Args: 46 | x (dict or list): dict or list of dicts 47 | 48 | Returns: 49 | (tuple): tuple with the elements of the dict or the dicts of the list 50 | """ 51 | if isinstance(x, dict): 52 | return tuple(x[k] for k in keys) 53 | elif isinstance(x, list): 54 | return tuple([xi[k] for xi in x] for k in keys) 55 | else: 56 | raise NotImplementedError 57 | 58 | 59 | def normalize_advantages(advantages): 60 | """ 61 | Args: 62 | advantages (np.ndarray): np array with the advantages 63 | 64 | Returns: 65 | (np.ndarray): np array with the advantages normalized 66 | """ 67 | return (advantages - np.mean(advantages)) / (advantages.std() + 1e-8) 68 | 69 | 70 | def shift_advantages_to_positive(advantages): 71 | return (advantages - np.min(advantages)) + 1e-8 72 | 73 | 74 | def discount_cumsum(x, discount): 75 | """ 76 | See https://docs.scipy.org/doc/scipy/reference/tutorial/signal.html#difference-equation-filtering 77 | 78 | Returns: 79 | (float) : y[t] - discount*y[t+1] = x[t] or rev(y)[t] - discount*rev(y)[t-1] = rev(x)[t] 80 | """ 81 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 82 | 83 | 84 | def explained_variance_1d(ypred, y): 85 | """ 86 | Args: 87 | ypred (np.ndarray): predicted values of the variable of interest 88 | y (np.ndarray): real values of the variable 89 | 90 | Returns: 91 | (float): variance explained by your estimator 92 | 93 | """ 94 | assert y.ndim == 1 and ypred.ndim == 1 95 | vary = np.var(y) 96 | if np.isclose(vary, 0): 97 | if np.var(ypred) > 0: 98 | return 0 99 | else: 100 | return 1 101 | return 1 - np.var(y - ypred) / (vary + 1e-8) 102 | 103 | 104 | def concat_tensor_dict_list(tensor_dict_list): 105 | """ 106 | Args: 107 | tensor_dict_list (list) : list of dicts of lists of tensors 108 | 109 | Returns: 110 | (dict) : dict of lists of tensors 111 | """ 112 | keys = list(tensor_dict_list[0].keys()) 113 | ret = dict() 114 | for k in keys: 115 | example = tensor_dict_list[0][k] 116 | if isinstance(example, dict): 117 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 118 | else: 119 | v = np.concatenate([x[k] for x in tensor_dict_list]) 120 | ret[k] = v 121 | return ret 122 | 123 | 124 | def stack_tensor_dict_list(tensor_dict_list): 125 | """ 126 | Args: 127 | tensor_dict_list (list) : list of dicts of tensors 128 | 129 | Returns: 130 | (dict) : dict of lists of tensors 131 | """ 132 | keys = list(tensor_dict_list[0].keys()) 133 | ret = dict() 134 | for k in keys: 135 | example = tensor_dict_list[0][k] 136 | if isinstance(example, dict): 137 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 138 | else: 139 | v = np.asarray([x[k] for x in tensor_dict_list]) 140 | ret[k] = v 141 | return ret 142 | 
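# Illustrative worked examples for the helpers above (not part of the original module;
# the values are made up):
#
#   discount_cumsum([1., 1., 1.], 0.9)                    -> [2.71, 1.9, 1.]   since y[t] = x[t] + 0.9 * y[t+1]
#   normalize_advantages(np.array([1., 2., 3.]))          -> zero-mean, unit-std version of the input
#   stack_tensor_dict_list([{'a': 1}, {'a': 2}])          -> {'a': array([1, 2])}
#   concat_tensor_dict_list([{'a': [1]}, {'a': [2, 3]}])  -> {'a': array([1, 2, 3])}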
143 | 144 | def create_feed_dict(placeholder_dict, value_dict): 145 | """ 146 | matches the placeholders with their values given a placeholder and value_dict. 147 | The keys in both dicts must match 148 | 149 | Args: 150 | placeholder_dict (dict): dict of placeholders 151 | value_dict (dict): dict of values to be fed to the placeholders 152 | 153 | Returns: feed dict 154 | 155 | """ 156 | assert set(placeholder_dict.keys()) <= set(value_dict.keys()), \ 157 | "value dict must provide the necessary data to serve all placeholders in placeholder_dict" 158 | # match the placeholders with their values 159 | return dict([(placeholder_dict[key], value_dict[key]) for key in placeholder_dict.keys()]) 160 | 161 | def set_seed(seed): 162 | """ 163 | Set the random seed for all random number generators 164 | 165 | Args: 166 | seed (int) : seed to use 167 | 168 | Returns: 169 | None 170 | """ 171 | import random 172 | import tensorflow as tf 173 | seed %= 4294967294 174 | random.seed(seed) 175 | np.random.seed(seed) 176 | tf.set_random_seed(seed) 177 | print('using seed %s' % (str(seed))) 178 | 179 | class ClassEncoder(json.JSONEncoder): 180 | def default(self, o): 181 | if isinstance(o, type): 182 | return {'$class': o.__module__ + "." + o.__name__} 183 | if callable(o): 184 | return {'function': o.__name__} 185 | return json.JSONEncoder.default(self, o) 186 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | joblib 3 | gym==0.10.5 4 | scipy 5 | PyPrind 6 | Pillow 7 | moviepy 8 | mpi4py 9 | click 10 | tensorflow>=1.4.0 11 | cloudpickle 12 | matplotlib 13 | git+https://github.com/dennisl88/rand_param_envs.git 14 | git+https://github.com/dennisl88/multiworld.git@russell 15 | -------------------------------------------------------------------------------- /run_scripts/e-maml_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import os 14 | import json 15 | import argparse 16 | import time 17 | 18 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 19 | 20 | def main(config): 21 | set_seed(config['seed']) 22 | 23 | 24 | baseline = globals()[config['baseline']]() #instantiate baseline 25 | 26 | env = globals()[config['env']]() # instantiate env 27 | env = normalize(env) # apply normalize wrapper to env 28 | 29 | policy = MetaGaussianMLPPolicy( 30 | name="meta-policy", 31 | obs_dim=np.prod(env.observation_space.shape), 32 | action_dim=np.prod(env.action_space.shape), 33 | meta_batch_size=config['meta_batch_size'], 34 | hidden_sizes=config['hidden_sizes'], 35 | ) 36 | 37 | sampler = MetaSampler( 38 | env=env, 39 | 
policy=policy, 40 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 41 | meta_batch_size=config['meta_batch_size'], 42 | max_path_length=config['max_path_length'], 43 | parallel=config['parallel'], 44 | ) 45 | 46 | sample_processor = MetaSampleProcessor( 47 | baseline=baseline, 48 | discount=config['discount'], 49 | gae_lambda=config['gae_lambda'], 50 | normalize_adv=config['normalize_adv'], 51 | ) 52 | 53 | algo = TRPOMAML( 54 | policy=policy, 55 | step_size=config['step_size'], 56 | inner_type=config['inner_type'], 57 | inner_lr=config['inner_lr'], 58 | meta_batch_size=config['meta_batch_size'], 59 | num_inner_grad_steps=config['num_inner_grad_steps'], 60 | exploration=True, 61 | ) 62 | 63 | trainer = Trainer( 64 | algo=algo, 65 | policy=policy, 66 | env=env, 67 | sampler=sampler, 68 | sample_processor=sample_processor, 69 | n_itr=config['n_itr'], 70 | num_inner_grad_steps=config['num_inner_grad_steps'], 71 | ) 72 | 73 | trainer.train() 74 | 75 | if __name__=="__main__": 76 | idx = int(time.time()) 77 | 78 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 79 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 80 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 81 | 82 | args = parser.parse_args() 83 | 84 | 85 | if args.config_file: # load configuration from json file 86 | with open(args.config_file, 'r') as f: 87 | config = json.load(f) 88 | 89 | else: # use default config 90 | 91 | config = { 92 | 'seed': 1, 93 | 94 | 'baseline': 'LinearFeatureBaseline', 95 | 96 | 'env': 'HalfCheetahRandDirecEnv', 97 | 98 | # sampler config 99 | 'rollouts_per_meta_task': 20, 100 | 'max_path_length': 100, 101 | 'parallel': True, 102 | 103 | # sample processor config 104 | 'discount': 0.99, 105 | 'gae_lambda': 1, 106 | 'normalize_adv': True, 107 | 108 | # policy config 109 | 'hidden_sizes': (64, 64), 110 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 111 | 112 | # E-MAML config 113 | 'inner_lr': 0.1, # adaptation step size 114 | 'learning_rate': 1e-3, # meta-policy gradient step size 115 | 'step_size': 0.01, # size of the TRPO trust-region 116 | 'n_itr': 1001, # number of overall training iterations 117 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 118 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 119 | 'inner_type' : 'log_likelihood', # type of inner loss function used 120 | 121 | } 122 | 123 | # configure logger 124 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 125 | snapshot_mode='last_gap') 126 | 127 | # dump run configuration before starting training 128 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 129 | 130 | # start the actual algorithm 131 | main(config) -------------------------------------------------------------------------------- /run_scripts/maml_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.trpo_maml import TRPOMAML 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler 
import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import os 14 | import json 15 | import argparse 16 | import time 17 | 18 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 19 | 20 | def main(config): 21 | set_seed(config['seed']) 22 | 23 | 24 | baseline = globals()[config['baseline']]() #instantiate baseline 25 | 26 | env = globals()[config['env']]() # instantiate env 27 | env = normalize(env) # apply normalize wrapper to env 28 | 29 | policy = MetaGaussianMLPPolicy( 30 | name="meta-policy", 31 | obs_dim=np.prod(env.observation_space.shape), 32 | action_dim=np.prod(env.action_space.shape), 33 | meta_batch_size=config['meta_batch_size'], 34 | hidden_sizes=config['hidden_sizes'], 35 | ) 36 | 37 | sampler = MetaSampler( 38 | env=env, 39 | policy=policy, 40 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 41 | meta_batch_size=config['meta_batch_size'], 42 | max_path_length=config['max_path_length'], 43 | parallel=config['parallel'], 44 | ) 45 | 46 | sample_processor = MetaSampleProcessor( 47 | baseline=baseline, 48 | discount=config['discount'], 49 | gae_lambda=config['gae_lambda'], 50 | normalize_adv=config['normalize_adv'], 51 | ) 52 | 53 | algo = TRPOMAML( 54 | policy=policy, 55 | step_size=config['step_size'], 56 | inner_type=config['inner_type'], 57 | inner_lr=config['inner_lr'], 58 | meta_batch_size=config['meta_batch_size'], 59 | num_inner_grad_steps=config['num_inner_grad_steps'], 60 | exploration=False, 61 | ) 62 | 63 | trainer = Trainer( 64 | algo=algo, 65 | policy=policy, 66 | env=env, 67 | sampler=sampler, 68 | sample_processor=sample_processor, 69 | n_itr=config['n_itr'], 70 | num_inner_grad_steps=config['num_inner_grad_steps'], 71 | ) 72 | 73 | trainer.train() 74 | 75 | if __name__=="__main__": 76 | idx = int(time.time()) 77 | 78 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 79 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 80 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 81 | 82 | args = parser.parse_args() 83 | 84 | 85 | if args.config_file: # load configuration from json file 86 | with open(args.config_file, 'r') as f: 87 | config = json.load(f) 88 | 89 | else: # use default config 90 | 91 | config = { 92 | 'seed': 1, 93 | 94 | 'baseline': 'LinearFeatureBaseline', 95 | 96 | 'env': 'HalfCheetahRandDirecEnv', 97 | 98 | # sampler config 99 | 'rollouts_per_meta_task': 20, 100 | 'max_path_length': 100, 101 | 'parallel': True, 102 | 103 | # sample processor config 104 | 'discount': 0.99, 105 | 'gae_lambda': 1, 106 | 'normalize_adv': True, 107 | 108 | # policy config 109 | 'hidden_sizes': (64, 64), 110 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 111 | 112 | # E-MAML config 113 | 'inner_lr': 0.1, # adaptation step size 114 | 'learning_rate': 1e-3, # meta-policy gradient step size 115 | 'step_size': 0.01, # size of the TRPO trust-region 116 | 'n_itr': 1001, # number of overall training iterations 117 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 118 | 'num_inner_grad_steps': 1, # 
number of inner / adaptation gradient steps 119 | 'inner_type' : 'log_likelihood', # type of inner loss function used 120 | 121 | } 122 | 123 | # configure logger 124 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 125 | snapshot_mode='last_gap') 126 | 127 | # dump run configuration before starting training 128 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 129 | 130 | # start the actual algorithm 131 | main(config) -------------------------------------------------------------------------------- /run_scripts/pro-mp_run_mujoco.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.mujoco_envs.half_cheetah_rand_direc import HalfCheetahRandDirecEnv 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.pro_mp import ProMP 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | import os 15 | import json 16 | import argparse 17 | import time 18 | 19 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 20 | 21 | def main(config): 22 | set_seed(config['seed']) 23 | 24 | 25 | baseline = globals()[config['baseline']]() #instantiate baseline 26 | 27 | env = globals()[config['env']]() # instantiate env 28 | env = normalize(env) # apply normalize wrapper to env 29 | 30 | policy = MetaGaussianMLPPolicy( 31 | name="meta-policy", 32 | obs_dim=np.prod(env.observation_space.shape), 33 | action_dim=np.prod(env.action_space.shape), 34 | meta_batch_size=config['meta_batch_size'], 35 | hidden_sizes=config['hidden_sizes'], 36 | ) 37 | 38 | sampler = MetaSampler( 39 | env=env, 40 | policy=policy, 41 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 42 | meta_batch_size=config['meta_batch_size'], 43 | max_path_length=config['max_path_length'], 44 | parallel=config['parallel'], 45 | ) 46 | 47 | sample_processor = MetaSampleProcessor( 48 | baseline=baseline, 49 | discount=config['discount'], 50 | gae_lambda=config['gae_lambda'], 51 | normalize_adv=config['normalize_adv'], 52 | ) 53 | 54 | algo = ProMP( 55 | policy=policy, 56 | inner_lr=config['inner_lr'], 57 | meta_batch_size=config['meta_batch_size'], 58 | num_inner_grad_steps=config['num_inner_grad_steps'], 59 | learning_rate=config['learning_rate'], 60 | num_ppo_steps=config['num_promp_steps'], 61 | clip_eps=config['clip_eps'], 62 | target_inner_step=config['target_inner_step'], 63 | init_inner_kl_penalty=config['init_inner_kl_penalty'], 64 | adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'], 65 | ) 66 | 67 | trainer = Trainer( 68 | algo=algo, 69 | policy=policy, 70 | env=env, 71 | sampler=sampler, 72 | sample_processor=sample_processor, 73 | n_itr=config['n_itr'], 74 | num_inner_grad_steps=config['num_inner_grad_steps'], 75 | ) 76 | 77 | trainer.train() 78 | 79 | if __name__=="__main__": 80 | idx = int(time.time()) 81 | 82 | parser = argparse.ArgumentParser(description='ProMP: Proximal 
Meta-Policy Search') 83 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 84 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 85 | 86 | args = parser.parse_args() 87 | 88 | 89 | if args.config_file: # load configuration from json file 90 | with open(args.config_file, 'r') as f: 91 | config = json.load(f) 92 | 93 | else: # use default config 94 | 95 | config = { 96 | 'seed': 1, 97 | 98 | 'baseline': 'LinearFeatureBaseline', 99 | 100 | 'env': 'HalfCheetahRandDirecEnv', 101 | 102 | # sampler config 103 | 'rollouts_per_meta_task': 20, 104 | 'max_path_length': 100, 105 | 'parallel': True, 106 | 107 | # sample processor config 108 | 'discount': 0.99, 109 | 'gae_lambda': 1, 110 | 'normalize_adv': True, 111 | 112 | # policy config 113 | 'hidden_sizes': (64, 64), 114 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 115 | 116 | # ProMP config 117 | 'inner_lr': 0.1, # adaptation step size 118 | 'learning_rate': 1e-3, # meta-policy gradient step size 119 | 'num_promp_steps': 5, # number of ProMp steps without re-sampling 120 | 'clip_eps': 0.3, # clipping range 121 | 'target_inner_step': 0.01, 122 | 'init_inner_kl_penalty': 5e-4, 123 | 'adaptive_inner_kl_penalty': False, # whether to use an adaptive or fixed KL-penalty coefficient 124 | 'n_itr': 1001, # number of overall training iterations 125 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 126 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 127 | 128 | } 129 | 130 | # configure logger 131 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 132 | snapshot_mode='last_gap') 133 | 134 | # dump run configuration before starting training 135 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 136 | 137 | # start the actual algorithm 138 | main(config) -------------------------------------------------------------------------------- /run_scripts/pro-mp_run_point_mass.py: -------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.envs.point_envs.point_env_2d_corner import MetaPointEnvCorner 3 | from meta_policy_search.envs.normalized_env import normalize 4 | from meta_policy_search.meta_algos.pro_mp import ProMP 5 | from meta_policy_search.meta_trainer import Trainer 6 | from meta_policy_search.samplers.meta_sampler import MetaSampler 7 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 8 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 9 | from meta_policy_search.utils import logger 10 | from meta_policy_search.utils.utils import set_seed, ClassEncoder 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | import os 15 | import json 16 | import argparse 17 | import time 18 | 19 | meta_policy_search_path = '/'.join(os.path.realpath(os.path.dirname(__file__)).split('/')[:-1]) 20 | 21 | def main(config): 22 | set_seed(config['seed']) 23 | 24 | 25 | baseline = globals()[config['baseline']]() #instantiate baseline 26 | 27 | env = globals()[config['env']]() # instantiate env 28 | env = normalize(env) # apply normalize wrapper to env 29 | 30 | policy = MetaGaussianMLPPolicy( 31 | name="meta-policy", 32 | obs_dim=np.prod(env.observation_space.shape), 33 | action_dim=np.prod(env.action_space.shape), 34 | 
meta_batch_size=config['meta_batch_size'], 35 | hidden_sizes=config['hidden_sizes'], 36 | ) 37 | 38 | sampler = MetaSampler( 39 | env=env, 40 | policy=policy, 41 | rollouts_per_meta_task=config['rollouts_per_meta_task'], # This batch_size is confusing 42 | meta_batch_size=config['meta_batch_size'], 43 | max_path_length=config['max_path_length'], 44 | parallel=config['parallel'], 45 | ) 46 | 47 | sample_processor = MetaSampleProcessor( 48 | baseline=baseline, 49 | discount=config['discount'], 50 | gae_lambda=config['gae_lambda'], 51 | normalize_adv=config['normalize_adv'], 52 | ) 53 | 54 | algo = ProMP( 55 | policy=policy, 56 | inner_lr=config['inner_lr'], 57 | meta_batch_size=config['meta_batch_size'], 58 | num_inner_grad_steps=config['num_inner_grad_steps'], 59 | learning_rate=config['learning_rate'], 60 | num_ppo_steps=config['num_promp_steps'], 61 | clip_eps=config['clip_eps'], 62 | target_inner_step=config['target_inner_step'], 63 | init_inner_kl_penalty=config['init_inner_kl_penalty'], 64 | adaptive_inner_kl_penalty=config['adaptive_inner_kl_penalty'], 65 | ) 66 | 67 | trainer = Trainer( 68 | algo=algo, 69 | policy=policy, 70 | env=env, 71 | sampler=sampler, 72 | sample_processor=sample_processor, 73 | n_itr=config['n_itr'], 74 | num_inner_grad_steps=config['num_inner_grad_steps'], 75 | ) 76 | 77 | trainer.train() 78 | 79 | if __name__=="__main__": 80 | idx = int(time.time()) 81 | 82 | parser = argparse.ArgumentParser(description='ProMP: Proximal Meta-Policy Search') 83 | parser.add_argument('--config_file', type=str, default='', help='json file with run specifications') 84 | parser.add_argument('--dump_path', type=str, default=meta_policy_search_path + '/data/pro-mp/run_%d' % idx) 85 | 86 | args = parser.parse_args() 87 | 88 | 89 | if args.config_file: # load configuration from json file 90 | with open(args.config_file, 'r') as f: 91 | config = json.load(f) 92 | 93 | else: # use default config 94 | 95 | config = { 96 | 'seed': 1, 97 | 98 | 'baseline': 'LinearFeatureBaseline', 99 | 100 | 'env': 'MetaPointEnvCorner', 101 | 102 | # sampler config 103 | 'rollouts_per_meta_task': 20, 104 | 'max_path_length': 100, 105 | 'parallel': True, 106 | 107 | # sample processor config 108 | 'discount': 0.99, 109 | 'gae_lambda': 1, 110 | 'normalize_adv': True, 111 | 112 | # policy config 113 | 'hidden_sizes': (64, 64), 114 | 'learn_std': True, # whether to learn the standard deviation of the gaussian policy 115 | 116 | # ProMP config 117 | 'inner_lr': 0.1, # adaptation step size 118 | 'learning_rate': 1e-3, # meta-policy gradient step size 119 | 'num_promp_steps': 5, # number of ProMp steps without re-sampling 120 | 'clip_eps': 0.3, # clipping range 121 | 'target_inner_step': 0.01, 122 | 'init_inner_kl_penalty': 5e-4, 123 | 'adaptive_inner_kl_penalty': False, # whether to use an adaptive or fixed KL-penalty coefficient 124 | 'n_itr': 1001, # number of overall training iterations 125 | 'meta_batch_size': 40, # number of sampled meta-tasks per iterations 126 | 'num_inner_grad_steps': 1, # number of inner / adaptation gradient steps 127 | 128 | } 129 | 130 | # configure logger 131 | logger.configure(dir=args.dump_path, format_strs=['stdout', 'log', 'csv'], 132 | snapshot_mode='last_gap') 133 | 134 | # dump run configuration before starting training 135 | json.dump(config, open(args.dump_path + '/params.json', 'w'), cls=ClassEncoder) 136 | 137 | # start the actual algorithm 138 | main(config) -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name="meta_policy_search", 4 | version='0.1', 5 | description='Framework that provides multiple gradient-based Meta-RL algorithms', 6 | url='https://github.com/jonasrothfuss/maml-zoo', 7 | author='Dennis Lee, Ignasi Clavera, Jonas Rothfuss', 8 | author_email='jonas.rothfuss@berkeley.edu', 9 | license='MIT', 10 | packages=['meta_policy_search'], 11 | test_suite='nose.collector', 12 | tests_require=['nose'], 13 | install_requires=[ 14 | 'joblib==0.12.2', 15 | 'PyPrind', 16 | 'numpy', 17 | 'scipy', 18 | 'gym==0.10.5', 19 | 'python_dateutil', 20 | 'tensorflow' 21 | ], 22 | zip_safe=False) -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonasrothfuss/ProMP/93ae339e23dfc6e1133f9538f2c7cc0ccee89d19/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_baselines.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import pickle 4 | from meta_policy_search.utils import utils 5 | from meta_policy_search.policies.base import Policy 6 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline, LinearTimeBaseline 7 | from meta_policy_search.samplers.meta_sampler import MetaSampler 8 | from gym import Env 9 | 10 | 11 | class RandomEnv(Env): 12 | def __init__(self): 13 | self.state = np.zeros(1) 14 | self.goal = 0 15 | 16 | def sample_tasks(self, n_tasks): 17 | """ 18 | Args: 19 | n_tasks (int) : number of different meta-tasks needed 20 | Returns: 21 | tasks (list) : an (n_tasks) length list of reset args 22 | """ 23 | return np.random.choice(100, n_tasks, replace=False) # Ensure every env has a different goal 24 | 25 | def set_task(self, task): 26 | """ 27 | Args: 28 | task: task of the meta-learning environment 29 | """ 30 | self.goal = task 31 | 32 | def get_task(self): 33 | """ 34 | Returns: 35 | task: task of the meta-learning environment 36 | """ 37 | return self.goal 38 | 39 | def step(self, action): 40 | self.state += (self.goal - action) * np.random.random() 41 | return self.state * 100 + self.goal, (self.goal - action)[0], 0, {} 42 | 43 | def reset(self): 44 | self.state = np.zeros(1) 45 | return self.state 46 | 47 | def env_spec(self): 48 | return None 49 | 50 | 51 | class RandomPolicy(Policy): 52 | def get_actions(self, observations): 53 | return [[np.random.random() + obs / 100 for obs in task] for task in observations], None 54 | 55 | 56 | class TestLinearFeatureBaseline(unittest.TestCase): 57 | def setUp(self): 58 | self.random_env = RandomEnv() 59 | self.random_policy = RandomPolicy(1, 1) 60 | self.meta_batch_size = 2 61 | self.batch_size = 10 62 | self.path_length = 100 63 | self.linear = LinearFeatureBaseline() 64 | self.sampler = MetaSampler(self.random_env, self.random_policy, self.batch_size, 65 | self.meta_batch_size, self.path_length, parallel=True) 66 | 67 | def testFit(self): 68 | paths = self.sampler.obtain_samples() 69 | for task in paths.values(): 70 | unfit_error = 0 71 | for path in task: 72 | path["returns"] = utils.discount_cumsum(path["rewards"], 0.99) 73 | unfit_pred = self.linear.predict(path) 74 | unfit_error += sum([np.square(pred - actual) for pred, actual in zip(unfit_pred, path['returns'])]) 75 | self.linear.fit(task) 
76 | fit_error = 0 77 | for path in task: 78 | fit_pred = self.linear.predict(path) 79 | fit_error += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 80 | self.assertTrue(fit_error < unfit_error) 81 | 82 | def testSerialize(self): 83 | paths = self.sampler.obtain_samples() 84 | for task in paths.values(): 85 | for path in task: 86 | path["returns"] = utils.discount_cumsum(path["rewards"], 0.99) 87 | self.linear.fit(task) 88 | fit_error_pre = 0 89 | for path in task: 90 | fit_pred = self.linear.predict(path) 91 | fit_error_pre += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 92 | pkl = pickle.dumps(self.linear) 93 | self.linear = pickle.loads(pkl) 94 | fit_error_post = 0 95 | for path in task: 96 | fit_pred = self.linear.predict(path) 97 | fit_error_post += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['returns'])]) 98 | self.assertEqual(fit_error_pre, fit_error_post) 99 | 100 | 101 | class TestLinearTimeBaseline(unittest.TestCase): 102 | def setUp(self): 103 | self.random_env = RandomEnv() 104 | self.random_policy = RandomPolicy(1, 1) 105 | self.meta_batch_size = 2 106 | self.batch_size = 10 107 | self.path_length = 100 108 | self.linear = LinearTimeBaseline() 109 | self.sampler = MetaSampler(self.random_env, self.random_policy, self.batch_size, 110 | self.meta_batch_size, self.path_length, parallel=True) 111 | 112 | def testFit(self): 113 | base_path = np.arange(-4.0, 22.0, step=.6) 114 | task1 = [{'discounted_rewards': base_path + np.random.normal(scale=2, size=base_path.shape), 115 | 'observations': base_path} for i in range(10)] 116 | task2 = [{'discounted_rewards': base_path**3 + np.random.normal(scale=2, size=base_path.shape), 117 | 'observations': base_path} for i in range(10)] 118 | 119 | 120 | for task in [task1, task2]: 121 | unfit_error = np.sum([np.sum(path['discounted_rewards']**2) for path in task]) 122 | print('unfit_error', unfit_error) 123 | self.linear.fit(task, target_key='discounted_rewards') 124 | fit_error = 0 125 | for path in task: 126 | fit_pred = self.linear.predict(path) 127 | fit_error += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 128 | print('fit_error', fit_error) 129 | self.assertTrue(2*fit_error < unfit_error) 130 | 131 | def testSerialize(self): 132 | base_path = np.arange(-4.0, 22.0, step=.6) 133 | task1 = [{'discounted_rewards': base_path + np.random.normal(scale=2, size=base_path.shape), 134 | 'observations': base_path} for i in range(10)] 135 | task2 = [{'discounted_rewards': base_path**3 + np.random.normal(scale=2, size=base_path.shape), 136 | 'observations': base_path} for i in range(10)] 137 | 138 | for task in [task1, task2]: 139 | self.linear.fit(task, target_key='discounted_rewards') 140 | fit_error_pre = 0 141 | for path in task: 142 | fit_pred = self.linear.predict(path) 143 | fit_error_pre += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 144 | pkl = pickle.dumps(self.linear) 145 | self.linear = pickle.loads(pkl) 146 | fit_error_post = 0 147 | for path in task: 148 | fit_pred = self.linear.predict(path) 149 | fit_error_post += sum([np.square(pred - actual) for pred, actual in zip(fit_pred, path['discounted_rewards'])]) 150 | self.assertEqual(fit_error_pre, fit_error_post) 151 | 152 | if __name__ == '__main__': 153 | unittest.main() 154 | -------------------------------------------------------------------------------- /tests/test_integration.py:
-------------------------------------------------------------------------------- 1 | from meta_policy_search.baselines.linear_baseline import LinearFeatureBaseline 2 | from meta_policy_search.meta_algos.pro_mp import ProMP 3 | from meta_policy_search.samplers.meta_sampler import MetaSampler 4 | from meta_policy_search.samplers.meta_sample_processor import MetaSampleProcessor 5 | from meta_policy_search.policies.meta_gaussian_mlp_policy import MetaGaussianMLPPolicy 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import unittest 10 | 11 | from gym.spaces import Box 12 | 13 | 14 | class MetaPointEnv(): 15 | 16 | def step(self, action): 17 | """ 18 | Run one timestep of the environment's dynamics. When end of episode 19 | is reached, reset() should be called to reset the environment's internal state. 20 | 21 | Args: 22 | action : an action provided by the agent 23 | Returns: 24 | (observation, reward, done, info) 25 | observation : agent's observation of the current environment 26 | reward [Float] : amount of reward due to the previous action 27 | done : a boolean, indicating whether the episode has ended 28 | info : a dictionary containing other diagnostic information from the previous action 29 | """ 30 | prev_state = self._state 31 | self._state = prev_state + np.clip(action, -0.1, 0.1) 32 | reward = self.reward(prev_state, action, self._state) 33 | done = self.done(self._state) 34 | next_observation = np.copy(self._state) 35 | return next_observation, reward, done, {} 36 | 37 | def reset(self): 38 | """ 39 | Resets the state of the environment, returning an initial observation. 40 | Outputs 41 | ------- 42 | observation : the initial observation of the space. (Initial reward is assumed to be 0.) 43 | """ 44 | self._state = np.random.uniform(-2, 2, size=(2,)) 45 | observation = np.copy(self._state) 46 | return observation 47 | 48 | @property 49 | def observation_space(self): 50 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 51 | 52 | @property 53 | def action_space(self): 54 | return Box(low=-0.1, high=0.1, shape=(2,)) 55 | 56 | def done(self, obs): 57 | if obs.ndim == 1: 58 | return abs(obs[0]) < 0.01 and abs(obs[1]) < 0.01 59 | elif obs.ndim == 2: 60 | return np.logical_and(np.abs(obs[:, 0]) < 0.01, np.abs(obs[:, 1]) < 0.01) 61 | 62 | def reward(self, obs, act, obs_next): 63 | if obs_next.ndim == 1: 64 | return - np.sqrt(obs_next[0]**2 + obs_next[1]**2) 65 | elif obs_next.ndim == 2: 66 | return - np.sqrt(obs_next[:, 0] ** 2 + obs_next[:, 1] ** 2) 67 | 68 | def log_diagnostics(self, paths): 69 | pass 70 | 71 | def sample_tasks(self, n_tasks): 72 | return [{}] * n_tasks 73 | 74 | def set_task(self, task): 75 | pass 76 | 77 | class TestLikelihoodRatio(unittest.TestCase): 78 | """ 79 | Assert that the likelihood ratio at first gradient step is approx.
one since pi_old = pi_new 80 | """ 81 | 82 | def setUp(self): 83 | self.env = env = MetaPointEnv() 84 | 85 | self.baseline = baseline = LinearFeatureBaseline() 86 | 87 | self.policy = policy = MetaGaussianMLPPolicy( 88 | name="meta-policy", 89 | obs_dim=np.prod(env.observation_space.shape), 90 | action_dim=np.prod(env.action_space.shape), 91 | meta_batch_size=10, 92 | hidden_sizes=(16, 16), 93 | learn_std=True, 94 | hidden_nonlinearity=tf.tanh, 95 | output_nonlinearity=None, 96 | ) 97 | 98 | self.sampler = MetaSampler( 99 | env=env, 100 | policy=policy, 101 | rollouts_per_meta_task=2, 102 | meta_batch_size=10, 103 | max_path_length=50, 104 | parallel=False, 105 | ) 106 | 107 | self.sample_processor = MetaSampleProcessor( 108 | baseline=baseline, 109 | discount=0.99, 110 | gae_lambda=1.0, 111 | normalize_adv=True, 112 | positive_adv=False, 113 | ) 114 | 115 | self.algo = ProMP( 116 | policy=policy, 117 | inner_lr=0.1, 118 | meta_batch_size=10, 119 | num_inner_grad_steps=2, 120 | learning_rate=1e-3, 121 | num_ppo_steps=5, 122 | num_minibatches=1, 123 | clip_eps=0.5, 124 | target_inner_step=2e-2, 125 | init_inner_kl_penalty=1e-3, 126 | ) 127 | 128 | def test_likelihood_ratio(self): 129 | with tf.Session() as sess: 130 | 131 | # initialize uninitialized vars (only initialize vars that were not loaded) 132 | uninit_vars = [var for var in tf.global_variables() if not sess.run(tf.is_variable_initialized(var))] 133 | sess.run(tf.variables_initializer(uninit_vars)) 134 | 135 | self.sampler.update_tasks() 136 | self.policy.switch_to_pre_update() # Switch to pre-update policy 137 | 138 | all_samples_data, all_paths = [], [] 139 | for step in range(1): 140 | 141 | """ -------------------- Sampling --------------------------""" 142 | paths = self.sampler.obtain_samples(log_prefix=str(step)) 143 | all_paths.append(paths) 144 | 145 | """ ----------------- Processing Samples ---------------------""" 146 | samples_data = self.sample_processor.process_samples(paths, log=False) 147 | all_samples_data.append(samples_data) 148 | 149 | """ ------------------- Inner Policy Update --------------------""" 150 | obs_phs, action_phs, adv_phs, dist_info_phs, all_phs = self.algo._make_input_placeholders('') 151 | 152 | for i in range(self.algo.meta_batch_size): 153 | obs = samples_data[i]['observations'] 154 | actions = samples_data[i]['actions'] 155 | agent_infos = samples_data[i]['agent_infos'] 156 | param_vals = self.policy.get_param_values() 157 | 158 | likelihood_ratio_sym = self.policy.likelihood_ratio_sym(obs_phs[i], action_phs[i], 159 | dist_info_phs[i], 160 | self.policy.policies_params_phs[i]) 161 | 162 | feed_dict_params = dict(zip(self.policy.policies_params_phs[i].values(), param_vals.values())) 163 | 164 | feed_dict_dist_infos = dict(zip(dist_info_phs[i].values(), agent_infos.values())) 165 | 166 | feed_dict = {obs_phs[i]: obs, 167 | action_phs[i]: actions 168 | } 169 | 170 | feed_dict.update(feed_dict_params) 171 | feed_dict.update(feed_dict_dist_infos) 172 | 173 | lr = sess.run(likelihood_ratio_sym, feed_dict=feed_dict) 174 | 175 | self.assertTrue(np.allclose(lr, 1)) 176 | -------------------------------------------------------------------------------- /tests/test_optimizers.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from meta_policy_search.optimizers.maml_first_order_optimizer import MAMLFirstOrderOptimizer 4 | from collections import OrderedDict 5 | import tensorflow as tf 6 | 7 | 8 | def fc(x, scope, nh, *, 
init_scale=1.0, init_bias=0.0): 9 | with tf.variable_scope(scope): 10 | nin = x.get_shape()[1].value 11 | w = tf.get_variable("w", [nin, nh], initializer=tf.orthogonal_initializer(init_scale)) 12 | b = tf.get_variable("b", [nh], initializer=tf.constant_initializer(init_bias)) 13 | return tf.matmul(x, w)+b 14 | 15 | 16 | class Mlp(object): 17 | def __init__(self, inputs, output_size, hidden_size=(32, 32), name='mlp'): 18 | activ = tf.tanh 19 | curr_output = inputs 20 | self.name = name 21 | with tf.variable_scope(self.name): 22 | for i, size in enumerate(hidden_size): 23 | curr_output = activ(fc(curr_output, str(i), nh=size, init_scale=np.sqrt(2))) 24 | self.output = fc(curr_output, 'y_pred', nh=output_size, init_scale=np.sqrt(2)) 25 | self.params = tf.trainable_variables(scope=self.name) 26 | 27 | def get_params(self): 28 | return self.params 29 | 30 | 31 | class CombinedMlp(object): 32 | def __init__(self, mlps): 33 | self.params = sum([mlp.params for mlp in mlps], []) 34 | self.output = [mlp.output for mlp in mlps] 35 | 36 | def get_params(self): 37 | return self.params 38 | 39 | 40 | class TestOptimizer(unittest.TestCase): #TODO add test for ConjugateGradientOptimizer 41 | 42 | def testSine(self): 43 | np.random.seed(65) 44 | for optimizer in [MAMLFirstOrderOptimizer()]: 45 | tf.reset_default_graph() 46 | with tf.Session(): 47 | input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 48 | target_phs = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 49 | network = Mlp(input_phs, 1, hidden_size=(32,32), name='sin') 50 | loss = tf.reduce_mean(tf.square(network.output - target_phs)) 51 | input_ph_dict = OrderedDict({'x': input_phs, 'y': target_phs}) 52 | optimizer.build_graph(loss, network, input_ph_dict) 53 | sess = tf.get_default_session() 54 | sess.run(tf.global_variables_initializer()) 55 | 56 | for i in range(5000): 57 | xs = np.random.normal(0, 3, (1000, 1)) 58 | ys = np.sin(xs) 59 | inputs = {'x': xs, 'y': ys} 60 | optimizer.optimize(inputs) 61 | if i % 100 == 0: 62 | print(optimizer.loss(inputs)) 63 | 64 | xs = np.random.normal(0, 3, (100, 1)) 65 | ys = np.sin(xs) 66 | y_pred = sess.run(network.output, feed_dict=dict(list(zip(input_ph_dict.values(), (xs, ys))))) 67 | self.assertLessEqual(np.mean((ys-y_pred)**2), 0.02) 68 | 69 | def testGauss(self): 70 | np.random.seed(65) 71 | for optimizer in [MAMLFirstOrderOptimizer()]: 72 | tf.reset_default_graph() 73 | with tf.Session(): 74 | input_phs = tf.placeholder(dtype=tf.float32, shape=[None, 100]) 75 | target_mean_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 76 | target_std_ph = tf.placeholder(dtype=tf.float32, shape=[None, 1]) 77 | 78 | mean_network = Mlp(input_phs, 1, hidden_size=(8,8), name='mean') 79 | std_network = Mlp(input_phs, 1, hidden_size=(8,8), name='std') 80 | 81 | target_std = tf.exp(target_std_ph) 82 | pred_std = tf.exp(std_network.output) 83 | 84 | numerator = tf.square(target_mean_ph - mean_network.output) + tf.square(target_std) - tf.square(pred_std) 85 | denominator = 2 * tf.square(pred_std) + 1e-8 86 | loss = tf.reduce_mean(tf.reduce_sum(numerator / denominator + std_network.output - target_std_ph, axis=-1)) 87 | 88 | joined_network = CombinedMlp([mean_network, std_network]) 89 | input_ph_dict = OrderedDict({'x': input_phs, 'y_mean': target_mean_ph, 'y_std': target_std_ph}) 90 | 91 | optimizer.build_graph(loss, joined_network, input_ph_dict) 92 | 93 | sess = tf.get_default_session() 94 | sess.run(tf.global_variables_initializer()) 95 | 96 | for i in range(2000): 97 | means = 
np.random.random(size=(1000)) 98 | stds = np.random.random(size=(1000)) 99 | inputs = np.vstack([np.random.normal(mean, np.exp(std), 100) for mean, std in zip(means, stds)]) 100 | all_inputs = {'x': inputs, 'y_mean': means.reshape(-1, 1), 'y_std': stds.reshape(-1, 1)} 101 | optimizer.optimize(all_inputs) 102 | if i % 100 == 0: 103 | print(optimizer.loss(all_inputs)) 104 | 105 | means = np.random.random(size=(20)) 106 | stds = np.random.random(size=(20)) 107 | 108 | inputs = np.stack([np.random.normal(mean, np.exp(std), 100) for mean, std in zip(means, stds)], axis=0) 109 | values_dict = OrderedDict({'x': inputs, 'y_mean': means.reshape(-1, 1), 'y_std': stds.reshape(-1, 1)}) 110 | 111 | mean_pred, std_pred = sess.run(joined_network.output, feed_dict=dict(list(zip(input_ph_dict.values(), 112 | values_dict.values())))) 113 | 114 | self.assertTrue(np.mean(np.square(mean_pred - means)) < 0.2) 115 | self.assertTrue(np.mean(np.square(std_pred - stds)) < 0.2) 116 | 117 | 118 | if __name__ == '__main__': 119 | unittest.main() -------------------------------------------------------------------------------- /tests/test_policies.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from meta_policy_search.policies.gaussian_mlp_policy import GaussianMLPPolicy 3 | import numpy as np 4 | import tensorflow as tf 5 | import pickle 6 | import gym 7 | 8 | class DummySpace(object): 9 | def __init__(self, dim): 10 | self._dim = dim 11 | 12 | @property 13 | def shape(self): 14 | return self._dim 15 | 16 | class DummyEnv(object): 17 | def __init__(self, obs_dim, act_dim): 18 | self._observation_space = gym.spaces.Box(low=-np.ones(obs_dim), high=np.ones(obs_dim), dtype=np.float32) 19 | self._action_space = gym.spaces.Box(low=-np.ones(act_dim), high=np.ones(act_dim), dtype=np.float32) 20 | 21 | @property 22 | def observation_space(self): 23 | return self._observation_space 24 | 25 | @property 26 | def action_space(self): 27 | return self._action_space 28 | 29 | def get_obs(self, n=None): 30 | if n is None: 31 | return np.random.uniform(0, 1, size=self.observation_space.shape) 32 | else: 33 | return np.random.uniform(0, 1, size=(n,) + self.observation_space.shape) 34 | 35 | 36 | class TestPolicy(unittest.TestCase): 37 | 38 | def setUp(self): 39 | sess = tf.get_default_session() 40 | if sess is None: 41 | tf.InteractiveSession() 42 | 43 | def test_output_sym(self): 44 | with tf.Session() as sess: 45 | obs_dim = 23 46 | action_dim = 7 47 | self.env = DummyEnv(obs_dim, action_dim) 48 | self.policy = GaussianMLPPolicy(obs_dim, 49 | action_dim, 50 | name='test_policy_output_sym', 51 | hidden_sizes=(64, 64)) 52 | 53 | obs_ph_1 = tf.placeholder(dtype=tf.float32, name="obs_ph_1", 54 | shape=(None,) + self.env.observation_space.shape) 55 | output_sym_1 = self.policy.distribution_info_sym(obs_ph_1) 56 | 57 | sess.run(tf.global_variables_initializer()) 58 | 59 | n_obs = self.env.get_obs(n=100) 60 | action, agent_infos = self.policy.get_actions(n_obs) 61 | agent_infos_output_sym = sess.run(output_sym_1, feed_dict={obs_ph_1: n_obs}) 62 | 63 | for k in agent_infos.keys(): 64 | self.assertTrue(np.allclose(agent_infos[k], agent_infos_output_sym[k], rtol=1e-5, atol=1e-5)) 65 | 66 | def test_get_action(self): 67 | 68 | with tf.Session() as sess: 69 | obs_dim = 23 70 | action_dim = 7 71 | self.env = DummyEnv(obs_dim, action_dim) 72 | self.policy = GaussianMLPPolicy(obs_dim, 73 | action_dim, 74 | name='test_policy_get_action', 75 | hidden_sizes=(64, 64)) 76 | 77 | 
sess.run(tf.global_variables_initializer()) 78 | 79 | obs = self.env.get_obs() 80 | action, agent_infos = self.policy.get_action(obs) 81 | actions, agents_infos = self.policy.get_actions(np.expand_dims(obs, 0)) 82 | for k in agent_infos.keys(): 83 | self.assertTrue(np.allclose(agent_infos[k], agents_infos[k], rtol=1e-5, atol=1e-5)) 84 | 85 | def testSerialize1(self): 86 | obs_dim = 23 87 | action_dim = 7 88 | self.env = DummyEnv(obs_dim, action_dim) 89 | self.policy = GaussianMLPPolicy(obs_dim, 90 | action_dim, 91 | name='test_policy_serialize', 92 | hidden_sizes=(64, 64)) 93 | 94 | sess = tf.get_default_session() 95 | sess.run(tf.global_variables_initializer()) 96 | all_param_values = self.policy.get_param_values() 97 | 98 | self.policy.set_params(all_param_values) 99 | 100 | def testSerialize2(self): 101 | obs_dim = 2 102 | action_dim = 7 103 | env = DummyEnv(obs_dim, action_dim) 104 | policy = GaussianMLPPolicy(obs_dim, 105 | action_dim, 106 | name='test_policy_serialize2', 107 | hidden_sizes=(54, 23)) 108 | 109 | sess = tf.get_default_session() 110 | sess.run(tf.global_variables_initializer()) 111 | 112 | obs = env.get_obs() 113 | _, pre_agent_infos = policy.get_action(obs) 114 | pkl_str = pickle.dumps(policy) 115 | tf.reset_default_graph() 116 | with tf.Session() as sess: 117 | policy_unpickled = pickle.loads(pkl_str) 118 | _, post_agent_infos = policy_unpickled.get_action(obs) 119 | for key in pre_agent_infos.keys(): 120 | self.assertTrue(np.allclose(pre_agent_infos[key], post_agent_infos[key])) 121 | 122 | 123 | if __name__ == '__main__': 124 | unittest.main() 125 | --------------------------------------------------------------------------------
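
Usage note: every run script above follows the same pattern for configuration. When --config_file is passed, the JSON file is loaded with json.load and completely replaces the built-in default dictionary, so it must define every key the script reads; there is no merging with the defaults. The sketch below is a minimal, hypothetical launcher (not part of the repository) that writes such a config and invokes run_scripts/pro-mp_run_mujoco.py. It assumes the package and its MuJoCo dependencies are installed and that it is run from the repository root; the file name launch_promp_with_config.py, the output directory, and the overridden values of n_itr and meta_batch_size are illustrative only, and the JSON list for hidden_sizes is assumed to be an acceptable stand-in for the tuple used in the Python defaults.

# launch_promp_with_config.py -- hypothetical helper, not part of the repository
import json
import os
import subprocess

# Mirror the defaults of run_scripts/pro-mp_run_mujoco.py and override a few entries.
# The run script swaps its defaults for this dict wholesale, so every key must be present.
config = {
    'seed': 1,
    'baseline': 'LinearFeatureBaseline',   # resolved via globals() in the run script
    'env': 'HalfCheetahRandDirecEnv',      # only classes imported by the run script can be named here
    'rollouts_per_meta_task': 20,
    'max_path_length': 100,
    'parallel': True,
    'discount': 0.99,
    'gae_lambda': 1,
    'normalize_adv': True,
    'hidden_sizes': [64, 64],              # JSON list in place of the (64, 64) tuple
    'learn_std': True,
    'inner_lr': 0.1,
    'learning_rate': 1e-3,
    'num_promp_steps': 5,
    'clip_eps': 0.3,
    'target_inner_step': 0.01,
    'init_inner_kl_penalty': 5e-4,
    'adaptive_inner_kl_penalty': False,
    'n_itr': 101,                          # illustrative: shorter run than the default 1001
    'meta_batch_size': 20,                 # illustrative override of the default 40
    'num_inner_grad_steps': 1,
}

dump_path = 'data/pro-mp/custom_run'       # hypothetical output directory
os.makedirs(dump_path, exist_ok=True)      # created up front so the run script can write params.json into it

config_path = os.path.join(dump_path, 'config.json')
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

# Equivalent to: python run_scripts/pro-mp_run_mujoco.py --config_file <config.json> --dump_path <dir>
subprocess.run(
    ['python', 'run_scripts/pro-mp_run_mujoco.py',
     '--config_file', config_path,
     '--dump_path', dump_path],
    check=True,
)

Because the scripts look up config['env'] and config['baseline'] through globals(), the same JSON cannot simply be pointed at a different run script unless that script imports the named classes; run_scripts/pro-mp_run_point_mass.py, for example, imports MetaPointEnvCorner instead, so 'env' would have to be changed accordingly.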