├── .dockerignore
├── src
    └── imitation
    │   ├── py.typed
    │   ├── scripts
    │       ├── config
    │       │   ├── __init__.py
    │       │   ├── tuned_hps
    │       │   │   ├── fast_dagger_seals_cartpole.json
    │       │   │   ├── bc_seals_ant_best_hp_eval.json
    │       │   │   ├── bc_seals_hopper_best_hp_eval.json
    │       │   │   ├── bc_seals_swimmer_best_hp_eval.json
    │       │   │   ├── bc_seals_walker_best_hp_eval.json
    │       │   │   ├── bc_seals_half_cheetah_best_hp_eval.json
    │       │   │   ├── dagger_seals_ant_best_hp_eval.json
    │       │   │   ├── dagger_seals_swimmer_best_hp_eval.json
    │       │   │   ├── dagger_seals_hopper_best_hp_eval.json
    │       │   │   ├── dagger_seals_walker_best_hp_eval.json
    │       │   │   ├── dagger_seals_half_cheetah_best_hp_eval.json
    │       │   │   ├── gail_seals_ant_best_hp_eval.json
    │       │   │   ├── airl_seals_ant_best_hp_eval.json
    │       │   │   ├── gail_seals_half_cheetah_best_hp_eval.json
    │       │   │   ├── airl_seals_half_cheetah_best_hp_eval.json
    │       │   │   ├── gail_seals_hopper_best_hp_eval.json
    │       │   │   ├── airl_seals_hopper_best_hp_eval.json
    │       │   │   ├── gail_seals_swimmer_best_hp_eval.json
    │       │   │   ├── gail_seals_walker_best_hp_eval.json
    │       │   │   ├── airl_seals_swimmer_best_hp_eval.json
    │       │   │   └── airl_seals_walker_best_hp_eval.json
    │       │   ├── analyze.py
    │       │   ├── parallel.py
    │       │   ├── train_imitation.py
    │       │   └── eval_policy.py
    │       ├── ingredients
    │       │   ├── __init__.py
    │       │   ├── sqil.py
    │       │   ├── wb.py
    │       │   ├── expert.py
    │       │   ├── policy.py
    │       │   ├── bc.py
    │       │   └── policy_evaluation.py
    │       ├── __init__.py
    │       └── convert_trajs.py
    │   ├── util
    │       ├── __init__.py
    │       ├── sacred_file_parsing.py
    │       ├── video_wrapper.py
    │       └── registry.py
    │   ├── algorithms
    │       ├── __init__.py
    │       └── adversarial
    │       │   └── __init__.py
    │   ├── regularization
    │       └── __init__.py
    │   ├── rewards
    │       ├── __init__.py
    │       └── reward_function.py
    │   ├── policies
    │       ├── __init__.py
    │       ├── exploration_wrapper.py
    │       └── replay_buffer_wrapper.py
    │   ├── testing
    │       ├── __init__.py
    │       ├── reward_nets.py
    │       └── reward_improvement.py
    │   ├── data
    │       ├── __init__.py
    │       └── serialize.py
    │   └── __init__.py
├── docs
    ├── .gitignore
    ├── main-concepts
    │   └── benchmarks.md
    ├── _static
    │   └── css
    │   │   └── custom.css
    ├── _templates
    │   └── autosummary
    │   │   ├── base.rst
    │   │   ├── class.rst
    │   │   └── module.rst
    ├── development
    │   ├── license.rst
    │   ├── release-notes.rst
    │   └── contributing
    │   │   ├── index.rst
    │   │   └── code-of-conduct.rst
    ├── Makefile
    ├── make.bat
    ├── getting-started
    │   ├── installation.rst
    │   ├── first_steps.rst
    │   └── what_is_imitation.rst
    ├── algorithms
    │   ├── sqil.rst
    │   ├── bc.rst
    │   ├── mce_irl.rst
    │   ├── dagger.rst
    │   ├── density.rst
    │   ├── gail.rst
    │   └── airl.rst
    └── index.rst
├── experiments
    ├── .gitignore
    ├── imit_table_cheetahs.csv
    ├── imit_table_mvp_seals_config.csv
    ├── rollouts_from_policies_config.csv
    ├── common.sh
    ├── imit_benchmark_config.csv
    ├── convert_traj.py
    ├── README.md
    ├── rollouts_from_policies.sh
    ├── bc_benchmark.sh
    ├── benchmark_and_table.sh
    ├── transfer_learn_benchmark.sh
    └── dagger_benchmark.sh
├── tests
    ├── testdata
    │   ├── rollouts_from_policies_config.csv
    │   ├── imit_benchmark_config.csv
    │   ├── expert_models
    │   │   ├── cartpole_0
    │   │   │   ├── rollouts
    │   │   │   │   └── final.npz
    │   │   │   └── policies
    │   │   │   │   └── final
    │   │   │   │       └── model.zip
    │   │   └── pendulum_0
    │   │   │   └── rollouts
    │   │   │       └── final.npz
    │   ├── npz_format_rollout.npz
    │   └── pickle_format_rollout.pkl
    ├── algorithms
    │   ├── __init__.py
    │   └── conftest.py
    ├── generate_test_data.sh
    ├── util
    │   ├── test_registry.py
    │   └── test_sacred_file_parsing.py
    ├── conftest.py
    ├── rewards
    │   ├── test_reward_fn.py
    │   └── test_reward_wrapper.py
    ├── test_benchmarking.py
    ├── scripts
    │   └── ingredients
    │   │   └── test_rewards.py
    └── test_examples.py
├── .coveragerc
├── .github
    ├── pull_request_template.md
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   └── publish-to-pypi.yml
├── .gitattributes
├── mypy.ini
├── ci
    ├── build_and_activate_venv.ps1
    ├── build_and_activate_venv.sh
    ├── Xdummy-entrypoint.py
    └── check_typeignore.py
├── .readthedocs.yml
├── CITATION.bib
├── pyproject.toml
├── examples
    ├── quickstart.sh
    ├── train_dagger_atari_interactive_policy.py
    └── quickstart.py
├── .codecov.yml
├── benchmarking
    ├── run_benchmark_on_slurm.sh
    ├── sacred_output_to_csv.py
    └── run_all_benchmarks_on_slurm.sh
├── LICENSE
├── runners
    ├── launch_docker-dev.sh
    └── build_push_image.sh
├── setup.cfg
├── Dockerfile
├── .gitignore
└── .pre-commit-config.yaml


/.dockerignore:
--------------------------------------------------------------------------------
1 | .gitignore


--------------------------------------------------------------------------------
/src/imitation/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _api
2 | 


--------------------------------------------------------------------------------
/experiments/.gitignore:
--------------------------------------------------------------------------------
1 | figures/
2 | 


--------------------------------------------------------------------------------
/docs/main-concepts/benchmarks.md:
--------------------------------------------------------------------------------
1 | ../../benchmarking/README.md


--------------------------------------------------------------------------------
/src/imitation/scripts/config/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration settings for scripts."""
2 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/__init__.py:
--------------------------------------------------------------------------------
1 | """Ingredients for Sacred experiments."""
2 | 


--------------------------------------------------------------------------------
/experiments/imit_table_cheetahs.csv:
--------------------------------------------------------------------------------
1 | env_config_name,n_expert_demos
2 | seals_half_cheetah,40
3 | 


--------------------------------------------------------------------------------
/src/imitation/util/__init__.py:
--------------------------------------------------------------------------------
1 | """General utility functions: e.g. logging, configuration, etc."""
2 | 


--------------------------------------------------------------------------------
/src/imitation/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | """Implementations of imitation and reward learning algorithms."""
2 | 


--------------------------------------------------------------------------------
/tests/testdata/rollouts_from_policies_config.csv:
--------------------------------------------------------------------------------
1 | env_config_name,n_demonstrations
2 | seals_cartpole,1
3 | 


--------------------------------------------------------------------------------
/tests/algorithms/__init__.py:
--------------------------------------------------------------------------------
1 | """This is just here to make mypy stop complaining about duplicate conftests."""
2 | 


--------------------------------------------------------------------------------
/src/imitation/algorithms/adversarial/__init__.py:
--------------------------------------------------------------------------------
1 | """Adversarial imitation learning algorithms, AIRL and GAIL."""
2 | 


--------------------------------------------------------------------------------
/src/imitation/regularization/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements a variety of regularization techniques for NN weights."""
2 | 


--------------------------------------------------------------------------------
/src/imitation/rewards/__init__.py:
--------------------------------------------------------------------------------
1 | """Reward models: neural network modules, serialization, preprocessing, etc."""
2 | 


--------------------------------------------------------------------------------
/docs/_static/css/custom.css:
--------------------------------------------------------------------------------
1 | .sidebar-container, .toc-drawer{
2 |     box-sizing:border-box;
3 |     width:20em;
4 | }
5 | 


--------------------------------------------------------------------------------
/src/imitation/policies/__init__.py:
--------------------------------------------------------------------------------
1 | """Classes defining policies and methods to manipulate them (e.g. serialization)."""
2 | 


--------------------------------------------------------------------------------
/experiments/imit_table_mvp_seals_config.csv:
--------------------------------------------------------------------------------
1 | env_config_name,n_expert_demos
2 | seals_cartpole,40
3 | seals_mountain_car,40
4 | 


--------------------------------------------------------------------------------
/src/imitation/testing/__init__.py:
--------------------------------------------------------------------------------
1 | """Helper methods for unit tests.
2 | 
3 | May also be useful for users of imitation.
4 | """
5 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | exclude_lines =
3 |     pragma: no cover
4 |     @overload
5 |     @typing.overload
6 |     raise NotImplementedError
7 | 


--------------------------------------------------------------------------------
/tests/testdata/imit_benchmark_config.csv:
--------------------------------------------------------------------------------
1 | env_config_name,gen_batch_size,n_epochs,n_expert_demos
2 | seals_cartpole,10,3,1
3 | pendulum,10,3,1
4 | 


--------------------------------------------------------------------------------
/docs/_templates/autosummary/base.rst:
--------------------------------------------------------------------------------
1 | {{ fullname | escape | underline}}
2 | 
3 | .. currentmodule:: {{ module }}
4 | 
5 | .. auto{{ objtype }}:: {{ objname }}
6 | 


--------------------------------------------------------------------------------
/tests/testdata/expert_models/cartpole_0/rollouts/final.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HumanCompatibleAI/imitation/HEAD/tests/testdata/expert_models/cartpole_0/rollouts/final.npz


--------------------------------------------------------------------------------
/tests/testdata/expert_models/pendulum_0/rollouts/final.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HumanCompatibleAI/imitation/HEAD/tests/testdata/expert_models/pendulum_0/rollouts/final.npz


--------------------------------------------------------------------------------
/tests/testdata/npz_format_rollout.npz:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:87ba63200068de7128849e9dc29e50be2daaec805bff51aad06229a747a2e5d5
3 | size 247960
4 | 


--------------------------------------------------------------------------------
/tests/testdata/pickle_format_rollout.pkl:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:89a8370f84db04078e736115be0774d20785334f81e9afc8252d1985e7de9f62
3 | size 15058
4 | 


--------------------------------------------------------------------------------
/tests/testdata/expert_models/cartpole_0/policies/final/model.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HumanCompatibleAI/imitation/HEAD/tests/testdata/expert_models/cartpole_0/policies/final/model.zip


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## Description
2 | 
3 | Description of changes and what issue this PR is solving.
4 | 
5 | ## Testing
6 | 
7 | Description of how you've tested your changes.
8 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-vendored
2 | tests/testdata/pickle_format_rollout.pkl filter=lfs diff=lfs merge=lfs -text
3 | tests/testdata/npz_format_rollout.npz filter=lfs diff=lfs merge=lfs -text
4 | 


--------------------------------------------------------------------------------
/src/imitation/data/__init__.py:
--------------------------------------------------------------------------------
1 | """Modules handling environment data.
2 | 
3 | For example: types for transitions/trajectories; methods to compute rollouts;
4 | buffers to store transitions; helpers for these modules.
5 | """
6 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = true
3 | exclude = output
4 | 
5 | # torch had some type errors, we ignore them because they're not our fault
6 | [mypy-torch._dynamo.*]
7 | follow_imports = skip
8 | follow_imports_for_stubs = True
9 | 


--------------------------------------------------------------------------------
/docs/development/license.rst:
--------------------------------------------------------------------------------
1 | .. _License:
2 | 
3 | License
4 | =======
5 | 
6 | This license is also available on the `project repository <https://github.com/HumanCompatibleAI/imitation/blob/master/LICENSE>`_.
7 | 
8 | .. include:: ../../LICENSE
9 | 


--------------------------------------------------------------------------------
/experiments/rollouts_from_policies_config.csv:
--------------------------------------------------------------------------------
 1 | env_config_name,n_demonstrations
 2 | seals_cartpole,40
 3 | seals_mountain_car,40
 4 | seals_half_cheetah,40
 5 | seals_hopper,40
 6 | seals_walker,40
 7 | seals_swimmer,40
 8 | seals_ant,40
 9 | seals_humanoid,240
10 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/__init__.py:
--------------------------------------------------------------------------------
 1 | """Command-line scripts."""
 2 | 
 3 | # Add our custom environments to Gym registry, if `seals` is installed.
 4 | 
 5 | try:
 6 |     # pytype: disable=import-error
 7 |     import seals  # noqa: F401
 8 | 
 9 |     # pytype: enable=import-error
10 | except ImportError:
11 |     pass
12 | 


--------------------------------------------------------------------------------
/ci/build_and_activate_venv.ps1:
--------------------------------------------------------------------------------
 1 | Param(
 2 |    $venv
 3 | )
 4 | $ErrorActionPreference = "Stop"  # exit immediately on any error
 5 | 
 6 | If ($venv -eq $null) {
 7 |    $venv = "venv"
 8 | }
 9 | 
10 | virtualenv -p python3.9 $venv
11 | & $venv\Scripts\activate
12 | pip install ".[docs,parallel,test]"
13 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | sphinx:
 4 |   configuration: docs/conf.py
 5 | 
 6 | formats: all
 7 | 
 8 | build:
 9 |   os: ubuntu-22.04
10 |   tools:
11 |     python: "3.8"
12 | 
13 | python:
14 |   install:
15 |     - method: pip
16 |       path: .
17 |       extra_requirements:
18 |         - docs
19 | 


--------------------------------------------------------------------------------
/docs/development/release-notes.rst:
--------------------------------------------------------------------------------
1 | Release Notes
2 | =============
3 | 
4 | .. changelog::
5 |     :changelog-url: https://imitation.readthedocs.io/en/latest/development/release-notes.html
6 |     :github: https://github.com/HumanCompatibleAI/imitation/releases
7 |     :pypi: https://pypi.org/project/imitation/
8 | 


--------------------------------------------------------------------------------
/src/imitation/__init__.py:
--------------------------------------------------------------------------------
 1 | """imitation: implementations of imitation and reward learning algorithms."""
 2 | 
 3 | from importlib import metadata
 4 | 
 5 | try:
 6 |     __version__ = metadata.version("imitation")
 7 | except metadata.PackageNotFoundError:  # pragma: no cover
 8 |     # Package is not installed: `__version__` is left undefined.
 9 |     pass
10 | 


--------------------------------------------------------------------------------
/tests/generate_test_data.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # This script regenerates tests/testdata.
 3 | set -e
 4 | 
 5 | # Regenerate tests/testdata/expert_models (for various tests).
 6 | experiments/train_experts.sh -r
 7 | 
 8 | mkdir -p tests/testdata/expert_models/cartpole_0/policies/final_without_vecnorm
 9 | ln -sf ../final/model.zip tests/testdata/expert_models/cartpole_0/policies/final_without_vecnorm/model.zip
10 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Bug description
11 | Description of what the bug is.
12 | 
13 | ## Steps to reproduce
14 | Code or a description of how to reproduce the bug.
15 | 
16 | ## Environment
17 | - Operating system and version:
18 | - Python version:
19 | - Output of `pip freeze --all`:
20 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/fast_dagger_seals_cartpole.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "bc": {"train_kwargs": {"n_batches": 50}},
 3 |     "dagger": {"total_timesteps": 50},
 4 |     "demonstrations": {"n_expert_demos": 10},
 5 |     "policy_evaluation": {"n_episodes_eval": 50},
 6 |     "environment": {
 7 |         "gym_id": "seals/CartPole-v0",
 8 |         "num_vec": 2,
 9 |         "parallel": false,
10 |         "max_episode_steps": 5
11 |     }
12 | }
13 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Problem
11 | Description of what problem this feature request aims to solve.
12 | 
13 | ## Solution
14 | Description of proposed solution or change.
15 | 
16 | ## Possible alternative solutions
17 | Description of any alternative solutions or features you've considered.
18 | 


--------------------------------------------------------------------------------
/CITATION.bib:
--------------------------------------------------------------------------------
 1 | @misc{gleave2022imitation,
 2 |   author = {Gleave, Adam and Taufeeque, Mohammad and Rocamonde, Juan and Jenner, Erik and Wang, Steven H. and Toyer, Sam and Ernestus, Maximilian and Belrose, Nora and Emmons, Scott and Russell, Stuart},
 3 |   title = {imitation: Clean Imitation Learning Implementations},
 4 |   year = {2022},
 5 |   howPublished = {arXiv:2211.11972v1 [cs.LG]},
 6 |   archivePrefix = {arXiv},
 7 |   eprint = {2211.11972},
 8 |   primaryClass = {cs.LG},
 9 |   url = {https://arxiv.org/abs/2211.11972},
10 | }
11 | 


--------------------------------------------------------------------------------
/docs/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname | escape | underline}}
 2 | 
 3 | .. currentmodule:: {{ module }}
 4 | 
 5 | .. autoclass:: {{ objname }}
 6 | 
 7 |    {% block methods %}
 8 |    .. automethod:: __init__
 9 | 
10 |    {% if methods %}
11 |    .. rubric:: {{ _('Methods') }}
12 | 
13 |    .. autosummary::
14 |    {% for item in methods %}
15 |       ~{{ name }}.{{ item }}
16 |    {%- endfor %}
17 |    {% endif %}
18 |    {% endblock %}
19 | 
20 |    {% block attributes %}
21 |    {% if attributes %}
22 |    .. rubric:: {{ _('Attributes') }}
23 | 
24 |    .. autosummary::
25 |    {% for item in attributes %}
26 |       ~{{ name }}.{{ item }}
27 |    {%- endfor %}
28 |    {% endif %}
29 |    {% endblock %}
30 | 


--------------------------------------------------------------------------------
/experiments/common.sh:
--------------------------------------------------------------------------------
 1 | # shellcheck shell=bash
 2 | 
 3 | # Common variables for experiment scripts
 4 | 
 5 | export GNU_DATE=date
 6 | export GNU_GETOPT=getopt
 7 | if [[ "$OSTYPE" == "darwin"* ]]; then
 8 |   export GNU_DATE=gdate
 9 |   if [[ $(uname -m) == 'arm64' ]]; then
10 |     export GNU_GETOPT=/opt/homebrew/opt/gnu-getopt/bin/getopt
11 |   else
12 |     export GNU_GETOPT=/usr/local/opt/gnu-getopt/bin/getopt
13 |   fi
14 | fi
15 | 
16 | TIMESTAMP=$($GNU_DATE --iso-8601=seconds)
17 | export TIMESTAMP
18 | 
19 | # Set OMP_NUM_THREADS=2 if it is unset or empty.
20 | # This is important because parallel runs of PyTorch often throttle due to
21 | # CPU contention unless this is set to a low number.
22 | export OMP_NUM_THREADS=${OMP_NUM_THREADS:-2}
23 | 


--------------------------------------------------------------------------------
/docs/development/contributing/index.rst:
--------------------------------------------------------------------------------
 1 | .. _Contributing:
 2 | 
 3 | Contributing
 4 | ============
 5 | 
 6 | .. toctree::
 7 |    :maxdepth: 2
 8 |    :caption: Contributing
 9 |    :hidden:
10 | 
11 |    code-of-conduct
12 |    ways-to-contribute
13 | 
14 | Thank you for your interest in imitation!
15 | 
16 | As an open-source project, we welcome contributions from all users, and are always open to any feedback or suggestions. This section of the documentation is intended to help you understand the process of contributing to the project.
17 | 
18 | 
19 | To keep the community open and inclusive, we have developed a :ref:`Code of Conduct <Code of Conduct>`. If you are not
20 | familiar with our Code of Conduct, take a minute to read it before starting your first contribution.
21 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?= -j auto -W
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
22 | livehtml:
23 | 	sphinx-autobuild "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --open-browser
24 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | # Note: setuptools 66.1.1 is the last version that supports installing gym==0.21.0
 3 | requires = ["setuptools==66.1.1", "setuptools_scm[toml]>=6.2"]
 4 | build-backend = "setuptools.build_meta"
 5 | 
 6 | # TODO(juan): we've commented this out because currently
 7 | # there's no way (that we could find in the docs) to point
 8 | # the local_scheme and version_scheme to a function from the
 9 | # pyproject.toml file, so setup.py has to be used.
10 | # [tool.setuptools_scm]
11 | # # Disable local scheme to allow uploads to Test PyPI.
12 | # # See https://github.com/pypa/setuptools_scm/issues/342
13 | # local_scheme = "imitation.version:get_version"
14 | 
15 | [tool.black]
16 | target-version = ["py38"]
17 | 
18 | [tool.pytype]
19 | inputs = [
20 |   "src/",
21 |   "tests/",
22 |   "experiments/",
23 |   "setup.py"
24 | ]
25 | python_version = "3.8"
26 | 


--------------------------------------------------------------------------------
/examples/quickstart.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Train PPO agent on pendulum and collect expert demonstrations. Tensorboard logs saved in quickstart/rl/
 4 | python -m imitation.scripts.train_rl with pendulum environment.fast policy_evaluation.fast rl.fast fast logging.log_dir=quickstart/rl/
 5 | 
 6 | # Train GAIL from demonstrations. Tensorboard logs saved in output/ (default log directory).
 7 | python -m imitation.scripts.train_adversarial gail with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.path=quickstart/rl/rollouts/final.npz demonstrations.source=local
 8 | 
 9 | # Train AIRL from demonstrations. Tensorboard logs saved in output/ (default log directory).
10 | python -m imitation.scripts.train_adversarial airl with pendulum environment.fast demonstrations.fast policy_evaluation.fast rl.fast fast demonstrations.path=quickstart/rl/rollouts/final.npz demonstrations.source=local
11 | 


--------------------------------------------------------------------------------
/experiments/imit_benchmark_config.csv:
--------------------------------------------------------------------------------
 1 | env_config_name,gen_batch_size,n_expert_demos
 2 | seals_cartpole,5000,1
 3 | seals_cartpole,5000,4
 4 | seals_cartpole,5000,7
 5 | seals_cartpole,5000,10
 6 | seals_mountain_car,5000,1
 7 | seals_mountain_car,5000,4
 8 | seals_mountain_car,5000,7
 9 | seals_mountain_car,5000,10
10 | seals_half_cheetah,50000,4
11 | seals_half_cheetah,50000,11
12 | seals_half_cheetah,50000,18
13 | seals_half_cheetah,50000,25
14 | seals_hopper,50000,4
15 | seals_hopper,50000,11
16 | seals_hopper,50000,18
17 | seals_hopper,50000,25
18 | seals_walker,50000,4
19 | seals_walker,50000,11
20 | seals_walker,50000,18
21 | seals_walker,50000,25
22 | seals_swimmer,50000,4
23 | seals_swimmer,50000,11
24 | seals_swimmer,50000,18
25 | seals_swimmer,50000,25
26 | seals_ant,50000,4
27 | seals_ant,50000,11
28 | seals_ant,50000,18
29 | seals_ant,50000,25
30 | seals_humanoid,50000,80
31 | seals_humanoid,50000,160
32 | seals_humanoid,50000,240
33 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/.codecov.yml:
--------------------------------------------------------------------------------
 1 | coverage:
 2 |   status:
 3 |     project:
 4 |       default: false
 5 |       main:
 6 |         paths:
 7 |           - "src/imitation/"
 8 |           - "!src/imitation/envs/examples/"
 9 |           - "!src/imitation/scripts/"
10 |       auxiliary:
11 |         target: 0%
12 |         paths:
13 |           - "src/imitation/envs/examples/"
14 |           - "src/imitation/scripts/"
15 |       tests:
16 |         # Should not have dead code in our tests
17 |         target: 100%
18 |         paths:
19 |           - "tests/"
20 |     patch:
21 |       default: false
22 |       main:
23 |         paths:
24 |           - "src/imitation/"
25 |           - "!src/imitation/envs/examples/"
26 |           - "!src/imitation/scripts/"
27 |       auxiliary:
28 |         paths:
29 |           - "examples/"
30 |           - "src/imitation/envs/examples/"
31 |           - "src/imitation/scripts/"
32 |           - "!src/imitation/scripts/config"
33 |       tests:
34 |         target: 100%
35 |         paths:
36 |           - "tests/"
37 | 


--------------------------------------------------------------------------------
/benchmarking/run_benchmark_on_slurm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --array=1-10
 3 | # Avoid cluttering the root directory with log files:
 4 | #SBATCH --output=slurm/%A_%a.out
 5 | #SBATCH --cpus-per-task=8
 6 | #SBATCH --gpus=0
 7 | #SBATCH --mem=8gb
 8 | #SBATCH --time=70:00:00
 9 | #SBATCH --qos=scavenger
10 | 
11 | # This script will run an imitation algorithm on an environment for 10 seeds.
12 | 
13 | # This script assumes that you set up imitation in your NAS home directory and
14 | # installed it in a venv located in the imitation directory.
15 | 
16 | # Call this script with <script> <algo> <env>, where
17 | #  <script> is either 'train_imitation' (then <algo> must be 'bc' or 'dagger') or
18 | #  'train_adversarial' (then <algo> must be 'gail' or 'airl').
19 | #  <env> can be any of 'seals_ant', 'seals_half_cheetah', 'seals_hopper',
20 | #  'seals_swimmer', 'seals_walker'.
21 | 
22 | cd "/nas/ucb/$(whoami)/imitation" || exit
23 | source venv/bin/activate
24 | srun python -m "imitation.scripts.$1" "$2" with "$2_$3" "seed=$SLURM_ARRAY_TASK_ID"
25 | 


--------------------------------------------------------------------------------
/src/imitation/rewards/reward_function.py:
--------------------------------------------------------------------------------
 1 | """Reward function protocol shared by reward-related code."""
 2 | 
 3 | import abc
 4 | from typing import Protocol
 5 | 
 6 | import numpy as np
 7 | 
 8 | 
 9 | class RewardFn(Protocol):
10 |     """Protocol for callables that map a batch of transitions to rewards.
11 | 
12 |     Requires implementation of __call__() to compute the reward given a batch of
13 |     states, actions, next states and dones.
14 |     """
15 | 
16 |     @abc.abstractmethod
17 |     def __call__(
18 |         self,
19 |         state: np.ndarray,
20 |         action: np.ndarray,
21 |         next_state: np.ndarray,
22 |         done: np.ndarray,
23 |     ) -> np.ndarray:
24 |         """Compute rewards for a batch of transitions.
25 | 
26 |         Args:
27 |             state: Current states of shape `(batch_size,) + state_shape`.
28 |             action: Actions of shape `(batch_size,) + action_shape`.
29 |             next_state: Successor states of shape `(batch_size,) + state_shape`.
30 |             done: End-of-episode (terminal state) indicator of shape `(batch_size,)`.
31 | 
32 |         Returns:
33 |             Computed rewards of shape `(batch_size,)`.
34 |         """  # noqa: DAR202
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019-2022 Center for Human-Compatible AI and Google LLC
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ci/build_and_activate_venv.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Usage: ./build_and_activate_venv.sh [venv_path] [python_version]
 4 | #   venv_path: Path at which the virtualenv directory should be created. Defaults to
 5 | #     'venv'.
 6 | #   python_version: Version of python to be used in the virtualenv. Defaults to
 7 | #     'python3.8'.
 8 | 
 9 | set -e  # exit immediately on any error
10 | 
11 | venv=$1
12 | if [[ ${venv} == "" ]]; then
13 |   venv="venv"
14 | fi
15 | python_version=$2
16 | if [[ ${python_version} == "" ]]; then
17 |   python_version="python3.8"
18 | fi
19 | # Quote the expansions below so a path containing spaces is passed as one word.
20 | virtualenv -p "${python_version}" "${venv}"
21 | # shellcheck disable=SC1090,SC1091
22 | source "${venv}/bin/activate"
23 | 
24 | # Update pip to the latest version.
25 | pip install --upgrade pip
26 | 
27 | # If platform is linux, install pytorch CPU version.
28 | # This will prevent installing the CUDA version in the pip install ".[docs,parallel,test]" command.
29 | # The CUDA version is a couple of gigabytes larger than the CPU version.
30 | # Since we don't need the CUDA version for testing, we can save some time by not installing it.
31 | if [[ "$OSTYPE" == "linux-gnu"* ]]; then
32 |   pip install torch --index-url https://download.pytorch.org/whl/cpu
33 | fi
34 | pip install ".[docs,parallel,test]"
35 | 


--------------------------------------------------------------------------------
/tests/util/test_registry.py:
--------------------------------------------------------------------------------
 1 | """Tests for `imitation.util.registry`."""
 2 | 
 3 | import pytest
 4 | 
 5 | from imitation.util import registry
 6 | 
 7 | 
 8 | def test_lazy():
 9 |     """Test indirect/lazy loading of registered values."""
10 |     reg = registry.Registry()
11 | 
12 |     reg.register("nomodule", indirect="this.module.does.not.exist:foobar")
13 |     with pytest.raises(ImportError):
14 |         reg.get("nomodule")
15 | 
16 |     reg.register("noattribute", indirect="imitation:attr_does_not_exist")
17 |     with pytest.raises(AttributeError):
18 |         reg.get("noattribute")
19 | 
20 |     with pytest.raises(ValueError, match="exactly one of"):
21 |         reg.register(key="wrongargs", value=3.14, indirect="math:pi")
22 | 
23 |     reg.register("exists", indirect="math:pi")
24 |     val = reg.get("exists")
25 |     import math
26 | 
27 |     assert val == math.pi
28 | 
29 | 
30 | def test_keys():
31 |     reg = registry.Registry()
32 | 
33 |     with pytest.raises(KeyError, match="not registered"):
34 |         reg.get("foobar")
35 | 
36 |     reg.register(key="foobar", value="fizzbuzz")
37 |     assert reg.get("foobar") == "fizzbuzz"
38 | 
39 |     with pytest.raises(KeyError, match="Duplicate registration"):
40 |         reg.register(key="foobar", value="fizzbuzz")
41 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/analyze.py:
--------------------------------------------------------------------------------
 1 | """Configuration settings for analyze, inspecting results from completed experiments."""
 2 | 
 3 | import os.path as osp
 4 | 
 5 | import sacred
 6 | 
 7 | analysis_ex = sacred.Experiment("analyze")
 8 | 
 9 | 
10 | @analysis_ex.config
11 | def config():
12 |     # Recursively search in this directory for sacred logs
13 |     source_dir_str = "output/sacred/train_adversarial"
14 |     skip_failed_runs = True  # Skip analysis for logs that have FAILED status
15 |     run_name = None  # Restrict analysis to sacred logs with a certain run name
16 |     env_name = None  # Restrict analysis to sacred logs with a certain env name
17 |     csv_output_path = None  # Write CSV output to this path
18 |     tex_output_path = None  # Write LaTeX output to this path
19 |     print_table = True  # Set to True to print analysis to stdout
20 |     split_str = ","  # str used to split source_dir_str into multiple source dirs
21 |     table_verbosity = 1  # Choose from 0, 1, 2 or 3
22 |     source_dirs = None
23 | 
24 | 
25 | @analysis_ex.config
26 | def convert_source_dirs(source_dir_str, split_str, source_dirs):
27 |     if source_dirs is None:  # not given explicitly: derive it from source_dir_str
28 |         source_dirs = source_dir_str.split(split_str)
29 |     # Apply `osp.expanduser` to every entry.
30 |     source_dirs = [osp.expanduser(p) for p in source_dirs]
31 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/bc_seals_ant_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 2.350251568550711e-5,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.0017601048183920826
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 5
15 |     }
16 |   },
17 |   "dagger": {
18 |     "rollout_round_min_episodes": null,
19 |     "total_timesteps": 100000,
20 |     "use_offline_rollouts": false
21 |   },
22 |   "demonstrations": {
23 |     "source": "huggingface",
24 |     "algo_name": "ppo",
25 |     "n_expert_demos": null
26 |   },
27 |   "policy": {
28 |     "policy_cls": {
29 |       "py/type": "imitation.policies.base.FeedForward32Policy"
30 |     },
31 |     "policy_kwargs": {
32 |       "features_extractor_class": {
33 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
34 |       },
35 |       "features_extractor_kwargs": {
36 |         "normalize_class": {
37 |           "py/type": "imitation.util.networks.RunningNorm"
38 |         }
39 |       }
40 |     }
41 |   },
42 |   "policy_evaluation": {
43 |     "n_episodes_eval": 50
44 |   },
45 |   "environment": {
46 |     "gym_id": "seals/Ant-v1"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/bc_seals_hopper_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 64,
 4 |     "l2_weight": 1.3610189916104634e-6,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.0007172435323620212
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 20
15 |     }
16 |   },
17 |   "dagger": {
18 |     "rollout_round_min_episodes": null,
19 |     "total_timesteps": 100000,
20 |     "use_offline_rollouts": false
21 |   },
22 |   "demonstrations": {
23 |     "source": "huggingface",
24 |     "algo_name": "ppo",
25 |     "n_expert_demos": null
26 |   },
27 |   "policy": {
28 |     "policy_cls": {
29 |       "py/type": "imitation.policies.base.FeedForward32Policy"
30 |     },
31 |     "policy_kwargs": {
32 |       "features_extractor_class": {
33 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
34 |       },
35 |       "features_extractor_kwargs": {
36 |         "normalize_class": {
37 |           "py/type": "imitation.util.networks.RunningNorm"
38 |         }
39 |       }
40 |     }
41 |   },
42 |   "policy_evaluation": {
43 |     "n_episodes_eval": 50
44 |   },
45 |   "environment": {
46 |     "gym_id": "seals/Hopper-v1"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/bc_seals_swimmer_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 4.37857842825771e-5,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.0016370547173923296
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 10
15 |     }
16 |   },
17 |   "dagger": {
18 |     "rollout_round_min_episodes": null,
19 |     "total_timesteps": 100000,
20 |     "use_offline_rollouts": false
21 |   },
22 |   "demonstrations": {
23 |     "source": "huggingface",
24 |     "algo_name": "ppo",
25 |     "n_expert_demos": null
26 |   },
27 |   "policy": {
28 |     "policy_cls": {
29 |       "py/type": "imitation.policies.base.FeedForward32Policy"
30 |     },
31 |     "policy_kwargs": {
32 |       "features_extractor_class": {
33 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
34 |       },
35 |       "features_extractor_kwargs": {
36 |         "normalize_class": {
37 |           "py/type": "imitation.util.networks.RunningNorm"
38 |         }
39 |       }
40 |     }
41 |   },
42 |   "policy_evaluation": {
43 |     "n_episodes_eval": 50
44 |   },
45 |   "environment": {
46 |     "gym_id": "seals/Swimmer-v1"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/bc_seals_walker_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 32,
 4 |     "l2_weight": 0.0014680228143404998,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.0003034620018780926
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 20
15 |     }
16 |   },
17 |   "dagger": {
18 |     "rollout_round_min_episodes": null,
19 |     "total_timesteps": 100000,
20 |     "use_offline_rollouts": false
21 |   },
22 |   "demonstrations": {
23 |     "source": "huggingface",
24 |     "algo_name": "ppo",
25 |     "n_expert_demos": null
26 |   },
27 |   "policy": {
28 |     "policy_cls": {
29 |       "py/type": "imitation.policies.base.FeedForward32Policy"
30 |     },
31 |     "policy_kwargs": {
32 |       "features_extractor_class": {
33 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
34 |       },
35 |       "features_extractor_kwargs": {
36 |         "normalize_class": {
37 |           "py/type": "imitation.util.networks.RunningNorm"
38 |         }
39 |       }
40 |     }
41 |   },
42 |   "policy_evaluation": {
43 |     "n_episodes_eval": 50
44 |   },
45 |   "environment": {
46 |     "gym_id": "seals/Walker2d-v1"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/bc_seals_half_cheetah_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 64,
 4 |     "l2_weight": 0.005728455628518169,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.008056922426724927
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 20
15 |     }
16 |   },
17 |   "dagger": {
18 |     "rollout_round_min_episodes": null,
19 |     "total_timesteps": 60000,
20 |     "use_offline_rollouts": false
21 |   },
22 |   "demonstrations": {
23 |     "source": "huggingface",
24 |     "algo_name": "ppo",
25 |     "n_expert_demos": null
26 |   },
27 |   "policy": {
28 |     "policy_cls": {
29 |       "py/type": "imitation.policies.base.FeedForward32Policy"
30 |     },
31 |     "policy_kwargs": {
32 |       "features_extractor_class": {
33 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
34 |       },
35 |       "features_extractor_kwargs": {
36 |         "normalize_class": {
37 |           "py/type": "imitation.util.networks.RunningNorm"
38 |         }
39 |       }
40 |     }
41 |   },
42 |   "policy_evaluation": {
43 |     "n_episodes_eval": 50
44 |   },
45 |   "environment": {
46 |     "gym_id": "seals/HalfCheetah-v1"
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/benchmarking/sacred_output_to_csv.py:
--------------------------------------------------------------------------------
 1 | """Converts a directory of Sacred output to a CSV file."""
 2 | import pathlib
 3 | import sys
 4 | 
 5 | from imitation.util.sacred_file_parsing import find_sacred_runs
 6 | 
 7 | 
 8 | def main(path: pathlib.Path, only_completed_runs: bool = True):
 9 |     if not path.exists():
10 |         raise NotADirectoryError(f"Path {path} does not exist.")
11 | 
12 |     # Print header
13 |     if only_completed_runs:
14 |         print("algo, env, score, expert_score")
15 |     else:
16 |         print("algo, env, score, expert_score, status")
17 | 
18 |     # Print data
19 |     for config, run in find_sacred_runs(path, only_completed_runs):
20 |         algo = run["command"]
21 |         env = config["environment"]["gym_id"]
22 |         score = run["result"]["imit_stats"]["monitor_return_mean"]
23 |         expert_score = run["result"]["expert_stats"]["monitor_return_mean"]
24 | 
25 |         if only_completed_runs:
26 |             print(f"{algo}, {env}, {score}, {expert_score}")
27 |         else:
28 |             status = run["status"]
29 |             print(f"{algo}, {env}, {score}, {expert_score}, {status}")
30 | 
31 | 
32 | if __name__ == "__main__":
33 |     if len(sys.argv) != 2:
34 |         print(f"Usage: {sys.argv[0]} <path to sacred output directory>")
35 |         sys.exit(1)
36 | 
37 |     main(pathlib.Path(sys.argv[1]))
38 | 


--------------------------------------------------------------------------------
/examples/train_dagger_atari_interactive_policy.py:
--------------------------------------------------------------------------------
 1 | """Training DAgger with an interactive policy that queries the user for actions.
 2 | 
 3 | Note that this is a toy example that does not lead to training a reasonable policy.
 4 | """
 5 | 
 6 | import tempfile
 7 | 
 8 | import gymnasium as gym
 9 | import numpy as np
10 | from stable_baselines3.common import vec_env
11 | 
12 | from imitation.algorithms import bc, dagger
13 | from imitation.policies import interactive
14 | 
15 | if __name__ == "__main__":
16 |     rng = np.random.default_rng(0)
17 | 
18 |     env = vec_env.DummyVecEnv([lambda: gym.wrappers.TimeLimit(gym.make("Pong-v4"), 10)])
19 |     env.seed(0)
20 | 
21 |     expert = interactive.AtariInteractivePolicy(env)
22 | 
23 |     bc_trainer = bc.BC(
24 |         observation_space=env.observation_space,
25 |         action_space=env.action_space,
26 |         rng=rng,
27 |     )
28 | 
29 |     with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
30 |         dagger_trainer = dagger.SimpleDAggerTrainer(
31 |             venv=env,
32 |             scratch_dir=tmpdir,
33 |             expert_policy=expert,
34 |             bc_trainer=bc_trainer,
35 |             rng=rng,
36 |         )
37 |         dagger_trainer.train(
38 |             total_timesteps=20,
39 |             rollout_round_min_episodes=1,
40 |             rollout_round_min_timesteps=10,
41 |         )
42 | 


--------------------------------------------------------------------------------
/ci/Xdummy-entrypoint.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | """This script starts an X server and sets DISPLAY, then runs wrapped command."""
 4 | 
 5 | # Usage: ./Xdummy-entrypoint.py [command]
 6 | #
 7 | # Adapted from https://github.com/openai/mujoco-py/blob/master/vendor/Xdummy-entrypoint
 8 | # Copyright OpenAI; MIT License
 9 | 
10 | import argparse
11 | import os
12 | import subprocess
13 | import sys
14 | 
15 | if __name__ == "__main__":
16 |     parser = argparse.ArgumentParser()
17 |     args, extra_args = parser.parse_known_args()
18 | 
19 |     subprocess.Popen(
20 |         [
21 |             "nohup",
22 |             "Xorg",
23 |             "-noreset",
24 |             "+extension",
25 |             "GLX",
26 |             "+extension",
27 |             "RANDR",
28 |             "+extension",
29 |             "RENDER",
30 |             "-logfile",
31 |             "/tmp/xdummy.log",
32 |             "-config",
33 |             "/etc/dummy_xorg.conf",
34 |             ":0",
35 |         ],
36 |     )
37 |     os.environ["DISPLAY"] = ":0"
38 | 
39 |     if not extra_args:
40 |         argv = ["/bin/bash"]
41 |     else:
42 |         argv = extra_args
43 | 
44 |     # Explicitly flush right before the exec since otherwise things might get
45 |     # lost in Python's buffers around stdout/stderr (!).
46 |     sys.stdout.flush()
47 |     sys.stderr.flush()
48 | 
49 |     os.execvpe(argv[0], argv, os.environ)
50 | 


--------------------------------------------------------------------------------
/experiments/convert_traj.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Convert trajectories from `imitation` format to openai/baselines GAIL format."""
 3 | 
 4 | import argparse
 5 | import os
 6 | from pathlib import Path
 7 | from typing import Sequence
 8 | 
 9 | import numpy as np
10 | 
11 | from imitation.data import rollout, serialize, types
12 | 
13 | 
14 | def convert_trajs_to_sb(trajs: Sequence[types.TrajectoryWithRew]) -> dict:
15 |     """Converts Trajectories into the dict format used by Stable Baselines GAIL."""
16 |     trans = rollout.flatten_trajectories_with_rew(trajs)
17 |     return dict(
18 |         acs=trans.acts,
19 |         rews=trans.rews,
20 |         obs=trans.obs,
21 |         ep_rets=np.array([np.sum(t.rews) for t in trajs]),
22 |     )
23 | 
24 | 
25 | def main():
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument("src_path", type=str)
28 |     parser.add_argument("dst_path", type=str)
29 |     args = parser.parse_args()
30 | 
31 |     src_path = Path(args.src_path)
32 |     dst_path = Path(args.dst_path)
33 | 
34 |     assert src_path.is_file()
35 |     src_trajs = serialize.load_with_rewards(src_path)
36 |     dst_trajs = convert_trajs_to_sb(src_trajs)
37 |     os.makedirs(dst_path.parent, exist_ok=True)
38 |     with open(dst_path, "wb") as f:
39 |         np.savez_compressed(f, **dst_trajs)
40 | 
41 |     print(f"Dumped rollouts to {dst_path}")
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     main()
46 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/dagger_seals_ant_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 0.0001,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.001
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 10
15 |     }
16 |   },
17 |   "dagger": {
18 |     "beta_schedule": {
19 |       "py/object": "imitation.algorithms.dagger.LinearBetaSchedule",
20 |       "rampdown_rounds": 15
21 |     },
22 |     "rollout_round_min_episodes": 5,
23 |     "total_timesteps": 100000,
24 |     "use_offline_rollouts": false
25 |   },
26 |   "demonstrations": {
27 |     "source": "huggingface",
28 |     "algo_name": "ppo",
29 |     "n_expert_demos": null
30 |   },
31 |   "policy": {
32 |     "policy_cls": {
33 |       "py/type": "imitation.policies.base.FeedForward32Policy"
34 |     },
35 |     "policy_kwargs": {
36 |       "features_extractor_class": {
37 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
38 |       },
39 |       "features_extractor_kwargs": {
40 |         "normalize_class": {
41 |           "py/type": "imitation.util.networks.RunningNorm"
42 |         }
43 |       }
44 |     }
45 |   },
46 |   "policy_evaluation": {
47 |     "n_episodes_eval": 50
48 |   },
49 |   "environment": {
50 |     "gym_id": "seals/Ant-v1"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/dagger_seals_swimmer_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 0.0001,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.001
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 1
15 |     }
16 |   },
17 |   "dagger": {
18 |     "beta_schedule": {
19 |       "py/object": "imitation.algorithms.dagger.LinearBetaSchedule",
20 |       "rampdown_rounds": 15
21 |     },
22 |     "rollout_round_min_episodes": 3,
23 |     "total_timesteps": 100000,
24 |     "use_offline_rollouts": false
25 |   },
26 |   "demonstrations": {
27 |     "source": "huggingface",
28 |     "algo_name": "ppo",
29 |     "n_expert_demos": null
30 |   },
31 |   "policy": {
32 |     "policy_cls": {
33 |       "py/type": "imitation.policies.base.FeedForward32Policy"
34 |     },
35 |     "policy_kwargs": {
36 |       "features_extractor_class": {
37 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
38 |       },
39 |       "features_extractor_kwargs": {
40 |         "normalize_class": {
41 |           "py/type": "imitation.util.networks.RunningNorm"
42 |         }
43 |       }
44 |     }
45 |   },
46 |   "policy_evaluation": {
47 |     "n_episodes_eval": 50
48 |   },
49 |   "environment": {
50 |     "gym_id": "seals/Swimmer-v1"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/dagger_seals_hopper_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 0.0001,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.001
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 1
15 |     }
16 |   },
17 |   "dagger": {
18 |     "beta_schedule": {
19 |       "decay_probability": 0.7,
20 |       "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
21 |     },
22 |     "rollout_round_min_episodes": 10,
23 |     "total_timesteps": 100000,
24 |     "use_offline_rollouts": false
25 |   },
26 |   "demonstrations": {
27 |     "source": "huggingface",
28 |     "algo_name": "ppo",
29 |     "n_expert_demos": null
30 |   },
31 |   "policy": {
32 |     "policy_cls": {
33 |       "py/type": "imitation.policies.base.FeedForward32Policy"
34 |     },
35 |     "policy_kwargs": {
36 |       "features_extractor_class": {
37 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
38 |       },
39 |       "features_extractor_kwargs": {
40 |         "normalize_class": {
41 |           "py/type": "imitation.util.networks.RunningNorm"
42 |         }
43 |       }
44 |     }
45 |   },
46 |   "policy_evaluation": {
47 |     "n_episodes_eval": 50
48 |   },
49 |   "environment": {
50 |     "gym_id": "seals/Hopper-v1"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/docs/getting-started/installation.rst:
--------------------------------------------------------------------------------
 1 | ============
 2 | Installation
 3 | ============
 4 | 
 5 | Prerequisites
 6 | -------------
 7 | 
 8 | - Python 3.8+
 9 | - pip (it helps to make sure this is up-to-date: ``pip install -U pip``)
10 | - (on ARM64 Macs) you need to set environment variables due to \
11 |   `a bug in grpcio <https://stackoverflow.com/questions/66640705/how-can-i-install-grpcio-on-an-apple-m1-silicon-laptop>`_:
12 | 
13 | .. code-block:: bash
14 | 
15 |     export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
16 |     export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1
17 | 
18 | - (Optional) OpenGL (to render gym environments)
19 | - (Optional) FFmpeg (to encode videos of renders)
20 | 
21 | Installation from PyPI
22 | ----------------------
23 | 
24 | To install the latest PyPI release, simply run:
25 | 
26 | .. code-block:: bash
27 | 
28 |     pip install imitation
29 | 
30 | 
31 | Installation from source
32 | ------------------------
33 | 
34 | Installation from source is useful if you wish to contribute to the development of ``imitation``, or if you need features that have not yet been made available in a stable release:
35 | 
36 | .. code-block:: bash
37 | 
38 |     git clone https://github.com/HumanCompatibleAI/imitation
39 |     cd imitation
40 |     pip install -e .
41 | 
42 | There are also a number of dependencies used for running tests and building the documentation, which can be installed with:
43 | 
44 | .. code-block:: bash
45 | 
46 |     pip install -e ".[dev]"
47 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/dagger_seals_walker_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 0.0001,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.001
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 5
15 |     }
16 |   },
17 |   "dagger": {
18 |     "beta_schedule": {
19 |       "decay_probability": 0.7,
20 |       "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
21 |     },
22 |     "rollout_round_min_episodes": 5,
23 |     "total_timesteps": 100000,
24 |     "use_offline_rollouts": false
25 |   },
26 |   "demonstrations": {
27 |     "source": "huggingface",
28 |     "algo_name": "ppo",
29 |     "n_expert_demos": null
30 |   },
31 |   "policy": {
32 |     "policy_cls": {
33 |       "py/type": "imitation.policies.base.FeedForward32Policy"
34 |     },
35 |     "policy_kwargs": {
36 |       "features_extractor_class": {
37 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
38 |       },
39 |       "features_extractor_kwargs": {
40 |         "normalize_class": {
41 |           "py/type": "imitation.util.networks.RunningNorm"
42 |         }
43 |       }
44 |     }
45 |   },
46 |   "policy_evaluation": {
47 |     "n_episodes_eval": 50
48 |   },
49 |   "environment": {
50 |     "gym_id": "seals/Walker2d-v1"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/dagger_seals_half_cheetah_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "bc": {
 3 |     "batch_size": 16,
 4 |     "l2_weight": 0.0001,
 5 |     "optimizer_cls": {
 6 |       "py/type": "torch.optim.adam.Adam"
 7 |     },
 8 |     "optimizer_kwargs": {
 9 |       "lr": 0.001
10 |     },
11 |     "train_kwargs": {
12 |       "log_interval": 500,
13 |       "n_batches": null,
14 |       "n_epochs": 5
15 |     }
16 |   },
17 |   "dagger": {
18 |     "beta_schedule": {
19 |       "decay_probability": 0.7,
20 |       "py/object": "imitation.algorithms.dagger.ExponentialBetaSchedule"
21 |     },
22 |     "rollout_round_min_episodes": 5,
23 |     "total_timesteps": 60000,
24 |     "use_offline_rollouts": false
25 |   },
26 |   "demonstrations": {
27 |     "source": "huggingface",
28 |     "algo_name": "ppo",
29 |     "n_expert_demos": null
30 |   },
31 |   "policy": {
32 |     "policy_cls": {
33 |       "py/type": "imitation.policies.base.FeedForward32Policy"
34 |     },
35 |     "policy_kwargs": {
36 |       "features_extractor_class": {
37 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
38 |       },
39 |       "features_extractor_kwargs": {
40 |         "normalize_class": {
41 |           "py/type": "imitation.util.networks.RunningNorm"
42 |         }
43 |       }
44 |     }
45 |   },
46 |   "policy_evaluation": {
47 |     "n_episodes_eval": 50
48 |   },
49 |   "environment": {
50 |     "gym_id": "seals/HalfCheetah-v1"
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/sqil.py:
--------------------------------------------------------------------------------
 1 | """This ingredient provides a SQIL algorithm instance."""
 2 | import sacred
 3 | from stable_baselines3 import dqn as dqn_algorithm
 4 | 
 5 | from imitation.policies import base
 6 | from imitation.scripts.ingredients import policy, rl
 7 | 
 8 | sqil_ingredient = sacred.Ingredient(
 9 |     "sqil",
10 |     ingredients=[rl.rl_ingredient, policy.policy_ingredient],
11 | )
12 | 
13 | 
14 | @sqil_ingredient.config
15 | def config():
16 |     total_timesteps = 3e5  # NOTE(review): float, not int -- confirm consumer casts
17 |     train_kwargs = dict(
18 |         log_interval=4,  # Number of updates between Tensorboard/stdout logs
19 |         progress_bar=True,
20 |     )
21 | 
22 |     locals()  # quieten flake8 unused variable warning
23 | 
24 | 
@rl.rl_ingredient.config_hook
def override_rl_cls(config, command_name, logger):
    """Config hook that defaults the RL algorithm class to DQN for `sqil`.

    Args:
        config: The configuration; only `config["rl"]["rl_cls"]` is read, and
            only when the command is `sqil`.
        command_name: Name of the command being run.
        logger: Unused.

    Returns:
        `{"rl_cls": DQN}` if the command is `sqil` and no `rl_cls` has been
        set (it is `None`); an empty dict otherwise.
    """
    del logger

    # Leave the config alone for other commands or when a class was chosen.
    if command_name != "sqil" or config["rl"]["rl_cls"] is not None:
        return {}

    return {"rl_cls": dqn_algorithm.DQN}
36 | 
37 | 
@policy.policy_ingredient.config_hook
def override_policy_cls(config, command_name, logger):  # noqa
    """Config hook that swaps the `FeedForward32Policy` for "MlpPolicy" in `sqil`.

    Args:
        config: The configuration; only `config["policy"]["policy_cls"]` is
            read, and only when the command is `sqil`.
        command_name: Name of the command being run.
        logger: Unused.

    Returns:
        `{"policy_cls": "MlpPolicy"}` if the command is `sqil` and the policy
        class equals `base.FeedForward32Policy`; an empty dict otherwise.
    """
    del logger

    if command_name != "sqil":
        return {}
    if config["policy"]["policy_cls"] != base.FeedForward32Policy:
        return {}

    return {"policy_cls": "MlpPolicy"}
50 | 


--------------------------------------------------------------------------------
/docs/_templates/autosummary/module.rst:
--------------------------------------------------------------------------------
 1 | {{ fullname | escape | underline}}
 2 | 
 3 | .. automodule:: {{ fullname }}
 4 | 
 5 |    {% block attributes %}
 6 |    {% if attributes %}
 7 |    .. rubric:: {{ _('Module Attributes') }}
 8 | 
 9 |    .. autosummary::
10 |    {% for item in attributes %}
11 |       {{ item }}
12 |    {%- endfor %}
13 |    {% endif %}
14 |    {% endblock %}
15 | 
16 |    {% block functions %}
17 |    {% if functions %}
18 |    .. rubric:: {{ _('Functions') }}
19 |    .. testsetup::
20 |       :skipif: skip_doctests
21 | 
22 |       # import all functions from module since examples don't import them
23 |       from {{ fullname }} import *
24 | 
25 |    .. autosummary::
26 |    {% for item in functions %}
27 |       {{ item }}
28 |    {%- endfor %}
29 |    {% endif %}
30 |    {% endblock %}
31 | 
32 |    {% block classes %}
33 |    {% if classes %}
34 |    .. rubric:: {{ _('Classes') }}
35 | 
36 |    .. autosummary::
37 |    {% for item in classes %}
38 |       {{ item }}
39 |    {%- endfor %}
40 |    {% endif %}
41 |    {% endblock %}
42 | 
43 |    {% block exceptions %}
44 |    {% if exceptions %}
45 |    .. rubric:: {{ _('Exceptions') }}
46 | 
47 |    .. autosummary::
48 |    {% for item in exceptions %}
49 |       {{ item }}
50 |    {%- endfor %}
51 |    {% endif %}
52 |    {% endblock %}
53 | 
54 | {% block modules %}
55 | {% if modules %}
56 | .. rubric:: Modules
57 | 
58 | .. autosummary::
59 |    :toctree:
60 |    :recursive:
61 | {% for item in modules %}
62 |    {{ item }}
63 | {%- endfor %}
64 | {% endif %}
65 | {% endblock %}
66 | 


--------------------------------------------------------------------------------
/runners/launch_docker-dev.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e # Exit immediately if a command exits with a non-zero status.
 4 | set -x # echo on
 5 | 
 6 | __usage="launch_docker-dev.sh - Launching humancompatibleai/imitation:python-req
 7 | 
 8 | Usage: launch_docker-dev.sh [options]
 9 | 
10 | options:
11 |   -p, --pull                pull the image to DockerHub
12 | 
13 | Note: You can specify IMIT_LOCAL_MNT environment variables to mount local
14 |   repository and MuJoCo license key respectively.
15 | "
16 | 
17 | PULL=0
18 | 
19 | while test $# -gt 0; do
20 |   case "$1" in
21 |   -p | --pull)
22 |     PULL=1 # Pull the image from Docker Hub
23 |     shift
24 |     ;;
25 |   -h | --help)
26 |     echo "${__usage}"
27 |     exit 0
28 |     ;;
29 |   *)
30 |     echo "Unrecognized flag $1" >&2
31 |     exit 1
32 |     ;;
33 |   esac
34 | done
35 | 
36 | DOCKER_IMAGE="humancompatibleai/imitation:python-req"
37 | # Specify IMIT_LOCAL_MNT if you want to mount a local directory to the docker container
38 | if [[ ${IMIT_LOCAL_MNT} == "" ]]; then
39 |   IMIT_LOCAL_MNT="${HOME}/imitation"
40 | fi
41 | 
42 | # install imitation in developer mode
43 | CMD="pip install -e .[docs,parallel,test] gym[mujoco]" # copied from ci/build_and_activate_venv.sh
44 | 
45 | # Pull image from DockerHub if prompted
46 | if [[ $PULL == 1 ]]; then
47 |   echo "Pulling ${DOCKER_IMAGE} from DockerHub"
48 |   docker pull ${DOCKER_IMAGE}
49 | fi
50 | 
51 | 
52 | docker run -it --rm --init \
53 |   -v "${IMIT_LOCAL_MNT}:/imitation" \
54 |   ${DOCKER_IMAGE} \
55 |   /bin/bash -c "${CMD} && exec bash"
56 | 


--------------------------------------------------------------------------------
/runners/build_push_image.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | set -e # Exit immediately if a command exits with a non-zero status.
 4 | 
 5 | __usage="build_push_image.sh - Building and pushing Docker image
 6 | 
 7 | Usage: build_push_image.sh [options] [tags]
 8 | 
 9 | options:
10 |   -h, --help                show brief help
11 |   -p, --push                push the image to DockerHub
12 | tags:
13 |   base                      base stage image
14 |   python-req                python-req stage image
15 | "
16 | 
17 | KEYS=""
18 | PUSH=0
19 | 
20 | while test $# -gt 0; do
21 |   case "$1" in
22 |   -p | --push)
23 |     PUSH=1 # Push the image to Docker Hub
24 |     shift
25 |     ;;
26 |   base)
27 |     KEYS+="base "
28 |     shift
29 |     ;;
30 |   python-req)
31 |     KEYS+="python-req "
32 |     shift
33 |     ;;
34 |   -h | --help)
35 |     echo "${__usage}"
36 |     exit 0
37 |     ;;
38 |   *)
39 |     echo "Unrecognized flag $1" >&2
40 |     exit 1
41 |     ;;
42 |   esac
43 | done
44 | 
45 | if [[ -z $KEYS ]]; then
46 |   KEYS="base"
47 |   echo "No tag found in the arguments! Building default image humancompatibleai/imitation:${KEYS}"
48 | fi
49 | 
50 | for key in $KEYS; do
51 |   echo "----- Building humancompatibleai/imitation:${key} ..."
52 |   BUILD_CMD="docker build --target ${key} -t humancompatibleai/imitation:${key} ."
53 |   PUSH_CMD="docker push humancompatibleai/imitation:${key}"
54 | 
55 |   # Build image
56 |   ${BUILD_CMD}
57 | 
58 |   # Push image if prompted
59 |   if [[ $PUSH == 1 ]]; then
60 |     echo "----- Pushing humancompatibleai/imitation:${key} ..."
61 |     ${PUSH_CMD}
62 |   fi
63 | done
64 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | """Fixtures common across tests."""
 2 | 
 3 | import numpy as np
 4 | import pytest
 5 | import seals  # noqa: F401
 6 | import torch
 7 | from stable_baselines3.common.vec_env import VecEnv
 8 | 
 9 | from imitation.data.wrappers import RolloutInfoWrapper
10 | from imitation.util import logger, util
11 | 
12 | CARTPOLE_ENV_NAME = "seals/CartPole-v0"
13 | 
14 | 
@pytest.fixture(params=[1, 4], ids=lambda n: f"vecenv({n})")
def cartpole_venv(request, rng) -> VecEnv:
    """Vectorized CartPole environment, parametrized over 1 and 4 sub-envs.

    Every sub-environment is wrapped in a `RolloutInfoWrapper`.
    """

    def _add_rollout_info(env, _):
        return RolloutInfoWrapper(env)

    n_envs = request.param
    venv = util.make_vec_env(
        CARTPOLE_ENV_NAME,
        n_envs=n_envs,
        post_wrappers=[_add_rollout_info],
        rng=rng,
    )
    return venv
24 | 
25 | 
@pytest.fixture(scope="session", autouse=True)
def torch_single_threaded():
    """Restrict PyTorch to a single thread for the whole test session.

    Running each test single-threaded lets the suite use more across-test
    parallelism, which is faster overall because:
        - Extra threads within one test give diminishing returns.
        - Many tests cannot be multi-threaded (e.g. most not using PyTorch
          training), and between-test parallelism has to be chosen based on
          the peak resource consumption of tests to avoid spurious failures.
    """
    # Intra-op threads first, then inter-op threads (same order as before).
    for set_thread_count in (torch.set_num_threads, torch.set_num_interop_threads):
        set_thread_count(1)
39 | 
40 | 
@pytest.fixture()
def custom_logger(tmpdir: str) -> logger.HierarchicalLogger:
    """Logger configured by `logger.configure` with pytest's `tmpdir`."""
    hier_logger = logger.configure(tmpdir)
    return hier_logger
44 | 
45 | 
@pytest.fixture()
def rng() -> np.random.Generator:
    """Fresh NumPy random generator with a fixed seed of 0."""
    fixed_seed = 0
    return np.random.default_rng(fixed_seed)
49 | 


--------------------------------------------------------------------------------
/docs/getting-started/first_steps.rst:
--------------------------------------------------------------------------------
 1 | .. _First Steps:
 2 | 
 3 | ===========
 4 | First Steps
 5 | ===========
 6 | 
 7 | Imitation can be used in two main ways: through its command-line interface (CLI) or Python API.
 8 | The CLI allows you to quickly train and test algorithms and policies directly from the command line.
 9 | The Python API provides greater flexibility and extensibility, and allows you to inter-operate with your existing Python environment.
10 | 
11 | CLI Quickstart
12 | ==============
13 | 
14 | We provide several CLI scripts as front-ends to the algorithms implemented in ``imitation``.
15 | These use `Sacred <https://github.com/idsia/sacred>`_ for configuration and replicability.
16 | 
17 | For information on how to configure Sacred CLI options, see the `Sacred docs <https://sacred.readthedocs.io/en/stable/>`_.
18 | 
19 | .. literalinclude :: ../../examples/quickstart.sh
20 |    :language: bash
21 | 
22 | .. note::
23 |   Remove the ``fast`` options from the commands above to allow training to run to completion.
24 | 
25 | .. tip::
26 |   ``python -m imitation.scripts.train_rl print_config`` will list Sacred script options.
27 |   These configuration options are also documented in each script's docstrings.
28 | 
29 | 
30 | Python Interface Quickstart
31 | ===========================
32 | 
33 | Here's an `example script`_ that loads CartPole demonstrations and trains BC, GAIL, and
34 | AIRL models on that data. You will need to ``pip install seals`` or ``pip install imitation[test]``
35 | to run this.
36 | 
37 | .. _example script: https://github.com/HumanCompatibleAI/imitation/blob/master/examples/quickstart.py
38 | 
39 | .. literalinclude :: ../../examples/quickstart.py
40 |    :language: python
41 | 


--------------------------------------------------------------------------------
/src/imitation/testing/reward_nets.py:
--------------------------------------------------------------------------------
 1 | """Utility functions for testing reward nets."""
 2 | 
 3 | import gymnasium as gym
 4 | import torch as th
 5 | 
 6 | from imitation.rewards import reward_nets
 7 | 
 8 | 
 9 | def make_ensemble(
10 |     obs_space: gym.Space,
11 |     action_space: gym.Space,
12 |     num_members: int = 2,
13 |     **kwargs,
14 | ):
15 |     """Create a simple reward ensemble."""
16 |     return reward_nets.RewardEnsemble(
17 |         obs_space,
18 |         action_space,
19 |         members=[
20 |             reward_nets.BasicRewardNet(obs_space, action_space, **kwargs)
21 |             for _ in range(num_members)
22 |         ],
23 |     )
24 | 
25 | 
class MockRewardNet(reward_nets.RewardNet):
    """A mock reward net for testing that always predicts one fixed value."""

    def __init__(
        self,
        observation_space: gym.Space,
        action_space: gym.Space,
        value: float = 0.0,
    ):
        """Create mock reward.

        Args:
            observation_space: observation space of the env
            action_space: action space of the env
            value: The reward to always return. Defaults to 0.0.
        """
        super().__init__(observation_space, action_space)
        self.value = value

    def forward(
        self,
        state: th.Tensor,
        action: th.Tensor,
        next_state: th.Tensor,
        done: th.Tensor,
    ) -> th.Tensor:
        """Return `self.value` once per element of the batch.

        Only the first dimension and the device of `state` are used; the
        other arguments are ignored.
        """
        output_shape = (state.shape[0],)
        constant_rewards = th.full(
            output_shape,
            fill_value=self.value,
            dtype=th.float32,
            device=state.device,
        )
        return constant_rewards
59 | 


--------------------------------------------------------------------------------
/tests/rewards/test_reward_fn.py:
--------------------------------------------------------------------------------
 1 | """Tests `imitation.rewards.reward_function` and `imitation.rewards.serialize`."""
 2 | 
 3 | import numpy as np
 4 | import pytest
 5 | 
 6 | from imitation.rewards import reward_function, serialize
 7 | 
 8 | OBS = np.random.randint(0, 10, (64, 100))
 9 | ACTS = NEXT_OBS = OBS
10 | DONES = np.zeros(64, dtype=np.bool_)
11 | 
12 | 
13 | def _funky_reward_fn(obs, act, next_obs, done):
14 |     """Returns consecutive reward from 1 to batch size `len(obs)`."""
15 |     # give each environment number from 1 to num_envs
16 |     return (np.arange(len(obs))).astype("float32")
17 | 
18 | 
19 | def _invalid_reward_fn(obs, act, next_obs, done):
20 |     """Returns rewards for lesser number of observations."""
21 |     return (np.arange(len(obs) - 1)).astype("float32")
22 | 
23 | 
def test_reward_fn_override():
    """A class inheriting from `RewardFn` can be instantiated and called."""

    class InheritedFunkyReward(reward_function.RewardFn):
        """A reward inherited from RewardFn."""

        def __init__(self):
            super().__init__()

        def __call__(self, obs, act, next_obs, steps=None):
            """Returns consecutive reward from 0 to batch size -1 (`len(obs)` - 1)."""
            batch_size = len(obs)
            return np.arange(batch_size).astype("float32")

    reward_fn = InheritedFunkyReward()
    reward_fn(OBS, ACTS, NEXT_OBS)
38 | 
39 | 
def test_validate_rewardfn_class():
    """`ValidateRewardFn` accepts a well-shaped reward and rejects a short one."""
    # A reward function returning one value per observation passes through.
    good_fn = serialize.ValidateRewardFn(_funky_reward_fn)
    good_fn(OBS, ACTS, NEXT_OBS, DONES)

    # A reward function returning too few values must trip an assertion.
    with pytest.raises(AssertionError):
        bad_fn = serialize.ValidateRewardFn(_invalid_reward_fn)
        bad_fn(OBS, ACTS, NEXT_OBS, DONES)
47 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [flake8]
 2 | # TODO(adam): eventually add more docstrings and remove the D10{2,3,5} ignore
 3 | extend-ignore=E203,D102,D103,D105
 4 | docstring-convention=google
 5 | max-line-length=88
 6 | per-file-ignores =
 7 | # F841 local variable unused [for Sacred config scopes]
 8 |   src/imitation/scripts/config/*.py:F841
 9 |   ../src/imitation/scripts/config/*.py:F841
10 |   src/imitation/envs/examples/airl_envs/*.py:D
11 | 
12 | [darglint]
13 | strictness=short
14 | 
15 | [isort]
16 | known_first_party=imitation
17 | # isort 5.0 is confused by imitation.utils.sacred
18 | # into thinking that `sacred` is first-party.
19 | known_third_party=sacred, wandb
20 | default_section=THIRDPARTY
21 | skip=.pytype
22 | # Below are needed for black compatibility
23 | multi_line_output=3
24 | include_trailing_comma=True
25 | force_grid_wrap=0
26 | use_parentheses=True
27 | line_length=88
28 | ensure_newline_before_comments=True
29 | 
30 | [tool:pytest]
31 | filterwarnings =
32 |     ignore:Using or importing the ABCs from 'collections':DeprecationWarning:(google|pkg_resources)
33 |     ignore:Parameters to load are deprecated:Warning:gym
34 |     ignore:The binary mode of fromstring is deprecated:DeprecationWarning:gym
35 | 
36 | markers =
37 |     expensive: mark a test as expensive (deselect with '-m "not expensive"')
38 | 
39 | # Terminate the test just before CircleCI's 10-minute timeout so we see the test failure
40 | # instead of a timeout.
41 | timeout = 590
42 | 
43 | [coverage:run]
44 | source = imitation
45 | include=
46 |     src/*
47 |     tests/*
48 | omit =
49 |     src/imitation/scripts/config/*
50 | 
51 | [coverage:report]
52 | exclude_lines =
53 |     if self.debug:
54 |     pragma: no cover
55 |     raise NotImplementedError
56 |     if __name__ == .__main__.:
57 | omit =
58 |   setup.py
59 | 
60 | [coverage:paths]
61 | source =
62 |     src/imitation
63 |     *venv/lib/python*/site-packages/imitation
64 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/gail_seals_ant_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 32,
 4 |     "gen_replay_buffer_capacity": 16384,
 5 |     "n_disc_updates_per_round": 8
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 16384,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 16,
35 |       "clip_range": 0.3,
36 |       "ent_coef": 0.008871887607426377,
37 |       "gae_lambda": 0.8,
38 |       "gamma": 0.995,
39 |       "learning_rate": 2.428297806883194e-5,
40 |       "max_grad_norm": 0.9,
41 |       "n_epochs": 10,
42 |       "vf_coef": 0.4351450387648799
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": {
48 |       "py/type": "imitation.policies.base.FeedForward32Policy"
49 |     },
50 |     "policy_kwargs": {
51 |       "features_extractor_class": {
52 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
53 |       },
54 |       "features_extractor_kwargs": {
55 |         "normalize_class": {
56 |           "py/type": "imitation.util.networks.RunningNorm"
57 |         }
58 |       }
59 |     }
60 |   },
61 |   "policy_evaluation": {
62 |     "n_episodes_eval": 50
63 |   },
64 |   "environment": {
65 |     "gym_id": "seals/Ant-v1"
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/airl_seals_ant_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 8192,
 4 |     "gen_replay_buffer_capacity": 8192,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 8192,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 16,
35 |       "clip_range": 0.3,
36 |       "ent_coef": 3.27750078482474e-6,
37 |       "gae_lambda": 0.8,
38 |       "gamma": 0.995,
39 |       "learning_rate": 3.249429831179079e-5,
40 |       "max_grad_norm": 0.9,
41 |       "n_epochs": 10,
42 |       "vf_coef": 0.4351450387648799
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": {
48 |       "py/type": "imitation.policies.base.FeedForward32Policy"
49 |     },
50 |     "policy_kwargs": {
51 |       "features_extractor_class": {
52 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
53 |       },
54 |       "features_extractor_kwargs": {
55 |         "normalize_class": {
56 |           "py/type": "imitation.util.networks.RunningNorm"
57 |         }
58 |       }
59 |     }
60 |   },
61 |   "policy_evaluation": {
62 |     "n_episodes_eval": 50
63 |   },
64 |   "environment": {
65 |     "gym_id": "seals/Ant-v1"
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/gail_seals_half_cheetah_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 8192,
 4 |     "gen_replay_buffer_capacity": 512,
 5 |     "n_disc_updates_per_round": 8
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 4096,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 64,
35 |       "clip_range": 0.1,
36 |       "ent_coef": 3.992371122209408e-6,
37 |       "gae_lambda": 0.95,
38 |       "gamma": 0.95,
39 |       "learning_rate": 0.00026250519057717037,
40 |       "max_grad_norm": 0.8,
41 |       "n_epochs": 5,
42 |       "vf_coef": 0.11483689492120866
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": {
48 |       "py/type": "imitation.policies.base.FeedForward32Policy"
49 |     },
50 |     "policy_kwargs": {
51 |       "features_extractor_class": {
52 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
53 |       },
54 |       "features_extractor_kwargs": {
55 |         "normalize_class": {
56 |           "py/type": "imitation.util.networks.RunningNorm"
57 |         }
58 |       }
59 |     }
60 |   },
61 |   "policy_evaluation": {
62 |     "n_episodes_eval": 50
63 |   },
64 |   "environment": {
65 |     "gym_id": "seals/HalfCheetah-v1"
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/airl_seals_half_cheetah_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 2048,
 4 |     "gen_replay_buffer_capacity": 512,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 8192,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 64,
35 |       "clip_range": 0.1,
36 |       "ent_coef": 0.0005544771755195421,
37 |       "gae_lambda": 0.95,
38 |       "gamma": 0.95,
39 |       "learning_rate": 0.00047248619386801587,
40 |       "max_grad_norm": 0.8,
41 |       "n_epochs": 5,
42 |       "vf_coef": 0.11483689492120866
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": {
48 |       "py/type": "imitation.policies.base.FeedForward32Policy"
49 |     },
50 |     "policy_kwargs": {
51 |       "features_extractor_class": {
52 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
53 |       },
54 |       "features_extractor_kwargs": {
55 |         "normalize_class": {
56 |           "py/type": "imitation.util.networks.RunningNorm"
57 |         }
58 |       }
59 |     }
60 |   },
61 |   "policy_evaluation": {
62 |     "n_episodes_eval": 50
63 |   },
64 |   "environment": {
65 |     "gym_id": "seals/HalfCheetah-v1"
66 |   }
67 | }
68 | 


--------------------------------------------------------------------------------
/tests/test_benchmarking.py:
--------------------------------------------------------------------------------
 1 | """Tests for config files in imitation/scripts/config/tuned_hps/ folder."""
 2 | 
 3 | import pytest
 4 | 
 5 | from imitation.scripts import train_adversarial, train_imitation, tuning
 6 | 
 7 | ALGORITHMS = ["bc", "dagger", "airl", "gail"]
 8 | ENVIRONMENTS = [
 9 |     "seals_walker",
10 |     "seals_ant",
11 |     "seals_half_cheetah",
12 |     "seals_hopper",
13 |     "seals_swimmer",
14 | ]
15 | 
16 | 
@pytest.mark.parametrize("environment", ENVIRONMENTS)
@pytest.mark.parametrize("algorithm", ALGORITHMS)
def test_benchmarks_print_config_succeeds(algorithm: str, environment: str):
    """`print_config` completes for each tuned `{algorithm}_{environment}` config."""
    # We test the configs using the print_config command,
    # because running the configs requires MuJoCo.
    # Requiring MuJoCo to run the tests adds too much complexity.

    experiment_by_algorithm = {
        "bc": train_imitation.train_imitation_ex,
        "dagger": train_imitation.train_imitation_ex,
        "airl": train_adversarial.train_adversarial_ex,
        "gail": train_adversarial.train_adversarial_ex,
    }
    if algorithm not in experiment_by_algorithm:
        raise ValueError(f"Unknown algorithm: {algorithm}")  # pragma: no cover
    experiment = experiment_by_algorithm[algorithm]

    named_config = f"{algorithm}_{environment}"
    run = experiment.run(command_name="print_config", named_configs=[named_config])
    assert run.status == "COMPLETED"
34 | 
35 | 
@pytest.mark.parametrize("environment", ENVIRONMENTS)
@pytest.mark.parametrize("algorithm", ALGORITHMS)
def test_tuning_print_config_succeeds(algorithm: str, environment: str):
    """`print_config` of the tuning experiment completes for each benchmark config."""
    # We test the configs using the print_config command,
    # because running the configs requires MuJoCo.
    # Requiring MuJoCo to run the tests adds too much complexity.
    base_config_name = f"{algorithm}_{environment}"
    updates = {
        "parallel_run_config": {
            "base_named_configs": [base_config_name],
        },
    }

    run = tuning.tuning_ex.run(
        command_name="print_config",
        named_configs=[algorithm],
        config_updates=updates,
    )
    assert run.status == "COMPLETED"
53 | 


--------------------------------------------------------------------------------
/benchmarking/run_all_benchmarks_on_slurm.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | sbatch --job-name=bc_seals_ant run_benchmark_on_slurm.sh train_imitation bc seals_ant
 3 | sbatch --job-name=dagger_seals_ant run_benchmark_on_slurm.sh train_imitation dagger seals_ant
 4 | sbatch --job-name=airl_seals_ant run_benchmark_on_slurm.sh train_adversarial airl seals_ant
 5 | sbatch --job-name=gail_seals_ant run_benchmark_on_slurm.sh train_adversarial gail seals_ant
 6 | sbatch --job-name=bc_seals_half_cheetah run_benchmark_on_slurm.sh train_imitation bc seals_half_cheetah
 7 | sbatch --job-name=dagger_seals_half_cheetah run_benchmark_on_slurm.sh train_imitation dagger seals_half_cheetah
 8 | sbatch --job-name=airl_seals_half_cheetah run_benchmark_on_slurm.sh train_adversarial airl seals_half_cheetah
 9 | sbatch --job-name=gail_seals_half_cheetah run_benchmark_on_slurm.sh train_adversarial gail seals_half_cheetah
10 | sbatch --job-name=bc_seals_hopper run_benchmark_on_slurm.sh train_imitation bc seals_hopper
11 | sbatch --job-name=dagger_seals_hopper run_benchmark_on_slurm.sh train_imitation dagger seals_hopper
12 | sbatch --job-name=airl_seals_hopper run_benchmark_on_slurm.sh train_adversarial airl seals_hopper
13 | sbatch --job-name=gail_seals_hopper run_benchmark_on_slurm.sh train_adversarial gail seals_hopper
14 | sbatch --job-name=bc_seals_swimmer run_benchmark_on_slurm.sh train_imitation bc seals_swimmer
15 | sbatch --job-name=dagger_seals_swimmer run_benchmark_on_slurm.sh train_imitation dagger seals_swimmer
16 | sbatch --job-name=airl_seals_swimmer run_benchmark_on_slurm.sh train_adversarial airl seals_swimmer
17 | sbatch --job-name=gail_seals_swimmer run_benchmark_on_slurm.sh train_adversarial gail seals_swimmer
18 | sbatch --job-name=bc_seals_walker run_benchmark_on_slurm.sh train_imitation bc seals_walker
19 | sbatch --job-name=dagger_seals_walker run_benchmark_on_slurm.sh train_imitation dagger seals_walker
20 | sbatch --job-name=airl_seals_walker run_benchmark_on_slurm.sh train_adversarial airl seals_walker
21 | sbatch --job-name=gail_seals_walker run_benchmark_on_slurm.sh train_adversarial gail seals_walker
22 | 


--------------------------------------------------------------------------------
/experiments/README.md:
--------------------------------------------------------------------------------
 1 | Experiment scripts are compatible with Linux and macOS.
 2 | 
 3 | ## (macOS only) macOS compatibility setup
 4 | 
 5 | macOS users need to install some GNU-compatible binaries before all experiment scripts will work.
 6 | 
 7 | ```
 8 | brew install coreutils gnu-getopt parallel
 9 | ```
10 | 
11 | ## Scripts
12 | 
13 | ### Phase 1: Generate expert demonstrations from models.
14 | 
15 | Run `experiments/rollouts_from_policies.sh`. (Rollouts saved in `output/train_experts/`).
16 | Demonstrations are used in Phase 2 for imitation learning.
17 | 
18 | ### Phase 2: Train imitation learning.
19 | 
20 | Run `experiments/imit_benchmark.sh --run_name RUN_NAME`. To choose between AIRL and GAIL, add the `--airl` or `--gail` flag (default is GAIL).
21 | 
22 | To analyze these results, run `python -m imitation.scripts.analyze with run_name=RUN_NAME`. Analysis can be run even while training is midway (will only show completed imitation learner's results). [Example output.](https://gist.github.com/shwang/4049cd4fb5cab72f2eeb7f3d15a7ab47)
23 | 
24 | ### Phase 3: Transfer learning.
25 | 
26 | Run `experiments/transfer_learn_benchmark.sh`. To choose between AIRL and GAIL, add the `--airl` or `--gail` flag (default is GAIL). Transfer rewards are loaded from `data/reward_models`.
27 | 
28 | ## Hyperparameter tuning
29 | 
30 | Add a named config containing the hyperparameter search space and other settings to `src/imitation/scripts/config/parallel.py`. (`def example_cartpole_rl():` is an example).
31 | 
32 | Run your hyperparameter tuning experiment using `python -m imitation.scripts.parallel with YOUR_NAMED_CONFIG inner_run_name=RUN_NAME`.
33 | 
34 | Analyze imitation learning experiments using `python -m imitation.scripts.analyze with run_name=RUN_NAME source_dir=~/ray_results`.
35 | 
36 | View Stable Baselines training stats on TensorBoard (available for regular RL, imitation learning, and transfer learning) using `tensorboard --logdir ~/ray_results`. To view only a subset of TensorBoard training progress, use `python -m imitation.scripts.analyze gather_tb_directories with source_dir=~/ray_results run_name=RUN_NAME`.
37 | 


--------------------------------------------------------------------------------
/experiments/rollouts_from_policies.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # The values of {env_config_name} are defined in the config file
 4 | # `experiments/rollouts_from_policies_config.csv` (override with $CONFIG_CSV).
 5 | #
 6 | # TODO(shwang): Nice to have -- first evaluate the mean return of each policy, then use
 7 | # this to choose the best seed rather than hardcoding seed 0. This will probably require
 8 | # a Python implementation. We (Steven) currently pick out the best policy seeds
 9 | # manually and rename that directory to `expert_models/policies/${env_name}_0/` to ensure
10 | # that downstream scripts get good expert rollouts and policies. If you are in the
11 | # process of picking good policies, then this "check rollout quality" script could
12 | # be useful: https://gist.github.com/1bea85e658a41b32c2693832fc216b8a.
13 | 
14 | set -e  # Exit on error.
15 | 
16 | source experiments/common.sh
17 | 
18 | CONFIG_CSV=${CONFIG_CSV:-experiments/rollouts_from_policies_config.csv}
19 | # To prevent race conditions, we use a different output dir for each process id.
20 | OUTPUT_DIR="output/train_experts/${TIMESTAMP}-${BASHPID}"
21 | 
22 | if ! TEMP=$($GNU_GETOPT -o f -l fast -- "$@"); then
23 |   exit 1
24 | fi
25 | eval set -- "$TEMP"
26 | 
27 | while true; do
28 |   case "$1" in
29 |     -f | --fast)
30 |       # Fast mode (debug)
31 |       CONFIG_CSV="tests/testdata/rollouts_from_policies_config.csv"
32 |       shift
33 |       ;;
34 |     --)
35 |       shift
36 |       break
37 |       ;;
38 |     *)
39 |       echo "Unrecognized flag $1" >&2
40 |       exit 1
41 |       ;;
42 |   esac
43 | done
44 | 
45 | echo "Writing logs in ${OUTPUT_DIR}, and saving rollouts in ${OUTPUT_DIR}/expert_models/*/rollouts/"
46 | 
47 | parallel -j 25% --header : --results "${OUTPUT_DIR}/parallel/" --colsep , \
48 |   python -m imitation.scripts.eval_policy \
49 |   --capture=sys \
50 |   with \
51 |   '{env_config_name}' \
52 |   logging.log_root="${OUTPUT_DIR}" \
53 |   rollout_save_path="${OUTPUT_DIR}/expert_models/{env_config_name}_0/rollouts/final.npz" \
54 |   eval_n_episodes='{n_demonstrations}' \
55 |   eval_n_timesteps=None \
56 |   :::: ${CONFIG_CSV}
57 | 


--------------------------------------------------------------------------------
/tests/rewards/test_reward_wrapper.py:
--------------------------------------------------------------------------------
 1 | """Tests `imitation.rewards.reward_wrapper`."""
 2 | 
 3 | import numpy as np
 4 | 
 5 | from imitation.data import rollout
 6 | from imitation.policies.base import RandomPolicy
 7 | from imitation.rewards import reward_wrapper
 8 | from imitation.util import util
 9 | 
10 | 
11 | class FunkyReward:
12 |     """A reward that ignores observation and depends only on batch index."""
13 | 
14 |     def __call__(self, obs, act, next_obs, steps=None):
15 |         """Returns consecutive reward from 1 to batch size `len(obs)`."""
16 |         # give each environment number from 1 to num_envs
17 |         return (np.arange(len(obs)) + 1).astype("float32")
18 | 
19 | 
20 | def test_reward_overwrite(rng):
21 |     """Test that reward wrapper actually overwrites base rewards."""
22 |     env_name = "Pendulum-v1"
23 |     num_envs = 3
24 |     env = util.make_vec_env(env_name, rng=rng, n_envs=num_envs)
25 |     reward_fn = FunkyReward()
26 |     wrapped_env = reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
27 |     policy = RandomPolicy(env.observation_space, env.action_space)
28 |     sample_until = rollout.make_min_episodes(10)
29 |     default_stats = rollout.rollout_stats(
30 |         rollout.generate_trajectories(policy, env, sample_until, rng),
31 |     )
32 |     wrapped_stats = rollout.rollout_stats(
33 |         rollout.generate_trajectories(policy, wrapped_env, sample_until, rng),
34 |     )
35 |     # Pendulum-v1 always has negative rewards
36 |     assert default_stats["return_max"] < 0
37 |     # ours gives between 1 * traj_len and num_envs * traj_len reward
38 |     # (trajectories are all constant length of 200 in Pendulum)
39 |     steps = wrapped_stats["len_mean"]
40 |     assert wrapped_stats["return_min"] == 1 * steps
41 |     assert wrapped_stats["return_max"] == num_envs * steps
42 | 
43 |     # check that the wrapped reward is non-negative and that the original env
44 |     # reward, kept in `infos`, is negative (all Pendulum rewards are negative)
45 |     rand_act, _ = policy.predict(wrapped_env.reset())
46 |     _, rew, _, infos = wrapped_env.step(rand_act)
47 |     assert np.all(rew >= 0)
48 |     assert np.all([info_dict["original_env_rew"] < 0 for info_dict in infos])
49 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/gail_seals_hopper_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 128,
 4 |     "gen_replay_buffer_capacity": 4096,
 5 |     "n_disc_updates_per_round": 8
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 4096,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 512,
35 |       "clip_range": 0.1,
36 |       "ent_coef": 0.001255299425412744,
37 |       "gae_lambda": 0.98,
38 |       "gamma": 0.995,
39 |       "learning_rate": 4.3984856156897565e-5,
40 |       "max_grad_norm": 0.9,
41 |       "n_epochs": 20,
42 |       "vf_coef": 0.20315938606555833
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": "MlpPolicy",
48 |     "policy_kwargs": {
49 |       "activation_fn": {
50 |         "py/type": "torch.nn.modules.activation.ReLU"
51 |       },
52 |       "features_extractor_class": {
53 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
54 |       },
55 |       "features_extractor_kwargs": {
56 |         "normalize_class": {
57 |           "py/type": "imitation.util.networks.RunningNorm"
58 |         }
59 |       },
60 |       "net_arch": [
61 |         {
62 |           "pi": [
63 |             64,
64 |             64
65 |           ],
66 |           "vf": [
67 |             64,
68 |             64
69 |           ]
70 |         }
71 |       ]
72 |     }
73 |   },
74 |   "policy_evaluation": {
75 |     "n_episodes_eval": 50
76 |   },
77 |   "environment": {
78 |     "gym_id": "seals/Hopper-v1"
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/airl_seals_hopper_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 2048,
 4 |     "gen_replay_buffer_capacity": 8192,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "reward": {
14 |     "add_std_alpha": null,
15 |     "ensemble_size": null,
16 |     "net_cls": {
17 |       "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet"
18 |     },
19 |     "net_kwargs": {
20 |       "normalize_input_layer": {
21 |         "py/type": "imitation.util.networks.RunningNorm"
22 |       }
23 |     },
24 |     "normalize_output_layer": {
25 |       "py/type": "imitation.util.networks.RunningNorm"
26 |     }
27 |   },
28 |   "rl": {
29 |     "batch_size": 8192,
30 |     "rl_cls": {
31 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
32 |     },
33 |     "rl_kwargs": {
34 |       "batch_size": 512,
35 |       "clip_range": 0.1,
36 |       "ent_coef": 0.009709494745755033,
37 |       "gae_lambda": 0.98,
38 |       "gamma": 0.995,
39 |       "learning_rate": 0.0005807211840258373,
40 |       "max_grad_norm": 0.9,
41 |       "n_epochs": 20,
42 |       "vf_coef": 0.20315938606555833
43 |     }
44 |   },
45 |   "total_timesteps": 10000000,
46 |   "policy": {
47 |     "policy_cls": "MlpPolicy",
48 |     "policy_kwargs": {
49 |       "activation_fn": {
50 |         "py/type": "torch.nn.modules.activation.ReLU"
51 |       },
52 |       "features_extractor_class": {
53 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
54 |       },
55 |       "features_extractor_kwargs": {
56 |         "normalize_class": {
57 |           "py/type": "imitation.util.networks.RunningNorm"
58 |         }
59 |       },
60 |       "net_arch": [
61 |         {
62 |           "pi": [
63 |             64,
64 |             64
65 |           ],
66 |           "vf": [
67 |             64,
68 |             64
69 |           ]
70 |         }
71 |       ]
72 |     }
73 |   },
74 |   "policy_evaluation": {
75 |     "n_episodes_eval": 50
76 |   },
77 |   "environment": {
78 |     "gym_id": "seals/Hopper-v1"
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------
/experiments/bc_benchmark.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | set -e
 3 | 
 4 | # This script trains BC experts.
 5 | #
 6 | # Use the --paper flag to produce paper benchmark results.
 7 | #
 8 | # When training is finished, it reports the mean episode reward of each
 9 | # expert.
10 | source experiments/common.sh
11 | 
12 | ENVS=(seals_cartpole)
13 | SEEDS=(0 1 2)
14 | # To prevent race conditions, we use a different output dir for each process id.
15 | OUTPUT_DIR="output/bc_benchmark/${TIMESTAMP}-${BASHPID}"
16 | extra_configs=()
17 | extra_options=()
18 | extra_parallel_options=()
19 | 
20 | if ! TEMP=$($GNU_GETOPT -o fTw -l fast,paper,tmux,run_name:,wandb -- "$@"); then
21 |   exit 1
22 | fi
23 | eval set -- "$TEMP"
24 | 
25 | while true; do
26 |   case "$1" in
27 |     -f | --fast)
28 |       # Fast mode (debug)
29 |       SEEDS=(0)
30 |       extra_configs=("${extra_configs[@]}" environment.fast demonstrations.fast policy_evaluation.fast fast)
31 |       shift
32 |       ;;
33 |     --paper)  # Table benchmark settings
34 |       ENVS=(seals_cartpole seals_mountain_car seals_half_cheetah)
35 |       shift
36 |       ;;
37 |     -w | --wandb)
38 |       # activate wandb logging by adding 'wandb' format string to logging.log_format_strs
39 |       extra_configs=("${extra_configs[@]}" "logging.wandb_logging")
40 |       shift
41 |       ;;
42 |     --run_name)
43 |       extra_options=("${extra_options[@]}" --name "$2")
44 |       shift 2
45 |       ;;
46 |     -T | --tmux)
47 |       extra_parallel_options=("${extra_parallel_options[@]}" --tmux)
48 |       shift
49 |     ;;
50 |     --)
51 |       shift
52 |       break
53 |       ;;
54 |     *)
55 |       echo "Unrecognized flag $1" >&2
56 |       exit 1
57 |       ;;
58 |   esac
59 | done
60 | 
61 | echo "Writing logs in ${OUTPUT_DIR}"
62 | 
63 | parallel -j 25% --header : --results "${OUTPUT_DIR}/parallel/" --colsep , --progress \
64 |   "${extra_parallel_options[@]}" \
65 |   python -m imitation.scripts.train_imitation \
66 |   --capture=sys \
67 |   "${extra_options[@]}" \
68 |   bc \
69 |   with \
70 |   '{env_config_name}' \
71 |   "${extra_configs[@]}" \
72 |   'seed={seed}' \
73 |   logging.log_root="${OUTPUT_DIR}" \
74 |   ::: env_config_name "${ENVS[@]}" \
75 |   ::: seed "${SEEDS[@]}"
76 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
 1 | # Adapted from https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
 2 | 
 3 | name: Publish imitation distributions 📦 to PyPI and TestPyPI
 4 | 
 5 | on:
 6 |   # This requires that some file is changed, to avoid running
 7 |   # on new branch creation, when it would fail.
 8 |   # Both 'branches' and 'paths' need to be specified here, per:
 9 |   # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#onpushbranchestagsbranches-ignoretags-ignore
10 |   push:
11 |     branches:
12 |       - '**'
13 |     tags:
14 |       - 'v**'
15 |     paths:
16 |       - '**'
17 | 
18 | jobs:
19 |   build-n-publish:
20 |     name: Build and publish imitation distributions 📦 to PyPI and TestPyPI
21 |     runs-on: ubuntu-latest
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v3
25 |       with:
26 |         # Fetch tags needed by setuptools_scm to infer version number
27 |         # See https://github.com/pypa/setuptools_scm/issues/414
28 |         fetch-depth: 0
29 |     - name: Set up Python 3.10
30 |       uses: actions/setup-python@v3
31 |       with:
32 |         python-version: "3.10"
33 | 
34 |     - name: Install pypa/build
35 |       run: >-
36 |         python -m
37 |         pip install
38 |         build
39 |         --user
40 |     - name: Build a binary wheel and a source tarball
41 |       run: >-
42 |         python -m
43 |         build
44 |         --sdist
45 |         --wheel
46 |         --outdir dist/
47 |         .
48 | 
49 |     # Publish new distribution to Test PyPI on every push.
50 |     # This ensures the workflow stays healthy, and will also serve
51 |     # as a source of alpha builds.
52 |     - name: Publish distribution 📦 to Test PyPI
53 |       uses: pypa/gh-action-pypi-publish@release/v1
54 |       with:
55 |         password: ${{ secrets.TEST_PYPI_API_TOKEN }}
56 |         repository_url: https://test.pypi.org/legacy/
57 | 
58 |     # Publish new distribution to production PyPI on releases.
59 |     - name: Publish distribution 📦 to PyPI
60 |       if: startsWith(github.ref, 'refs/tags/v')
61 |       uses: pypa/gh-action-pypi-publish@release/v1
62 |       with:
63 |         password: ${{ secrets.PYPI_API_TOKEN }}
64 | 


--------------------------------------------------------------------------------
/docs/algorithms/sqil.rst:
--------------------------------------------------------------------------------
 1 | .. _soft q imitation learning docs:
 2 | 
 3 | ================================
 4 | Soft Q Imitation Learning (SQIL)
 5 | ================================
 6 | 
 7 | Soft Q Imitation Learning (SQIL) learns to imitate a policy from demonstrations by
 8 | using the DQN algorithm with modified rewards. During each policy update, half
 9 | of the batch is sampled from the demonstrations and half is sampled from the
10 | environment. Expert demonstrations are assigned a reward of 1, and transitions
11 | sampled from the environment are assigned a reward of 0. This encourages the policy
12 | to imitate the demonstrations, and to simultaneously avoid states not seen in the
13 | demonstrations.
14 | 
15 | .. note::
16 | 
17 |     This implementation is based on the DQN implementation in Stable Baselines 3,
18 |     which does not implement soft Q-learning and therefore does not support
19 |     continuous actions. Consequently, this implementation only supports discrete
20 |     actions, and the name "soft" Q-learning could be misleading.
21 | 
22 | Example
23 | =======
24 | 
25 | Detailed example notebook: :doc:`../tutorials/8_train_sqil`
26 | 
27 | .. testcode::
28 |     :skipif: skip_doctests
29 | 
30 |     import datasets
31 |     import gymnasium as gym
32 |     from stable_baselines3.common.evaluation import evaluate_policy
33 |     from stable_baselines3.common.vec_env import DummyVecEnv
34 | 
35 |     from imitation.algorithms import sqil
36 |     from imitation.data import huggingface_utils
37 | 
38 |     # Download some expert trajectories from the HuggingFace Datasets Hub.
39 |     dataset = datasets.load_dataset("HumanCompatibleAI/ppo-CartPole-v1")
40 |     rollouts = huggingface_utils.TrajectoryDatasetSequence(dataset["train"])
41 | 
42 |     sqil_trainer = sqil.SQIL(
43 |         venv=DummyVecEnv([lambda: gym.make("CartPole-v1")]),
44 |         demonstrations=rollouts,
45 |         policy="MlpPolicy",
46 |     )
47 |     # Hint: set to 1_000_000 to match the expert performance.
48 |     sqil_trainer.train(total_timesteps=1_000)
49 |     reward, _ = evaluate_policy(sqil_trainer.policy, sqil_trainer.venv, 10)
50 |     print("Reward:", reward)
51 | 
52 | .. testoutput::
53 |     :hide:
54 | 
55 |     ...
56 | 
57 | API
58 | ===
59 | .. autoclass:: imitation.algorithms.sqil.SQIL
60 |     :members:
61 |     :noindex:
62 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/gail_seals_swimmer_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 32,
 4 |     "gen_replay_buffer_capacity": 4096,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "expert": {
14 |     "loader_kwargs": {
15 |       "gym_id": "seals/Swimmer-v1",
16 |       "organization": "HumanCompatibleAI"
17 |     }
18 |   },
19 |   "reward": {
20 |     "add_std_alpha": null,
21 |     "ensemble_size": null,
22 |     "net_cls": {
23 |       "py/type": "imitation.rewards.reward_nets.BasicRewardNet"
24 |     },
25 |     "net_kwargs": {
26 |       "normalize_input_layer": {
27 |         "py/type": "imitation.util.networks.RunningNorm"
28 |       }
29 |     },
30 |     "normalize_output_layer": {
31 |       "py/type": "imitation.util.networks.RunningNorm"
32 |     }
33 |   },
34 |   "rl": {
35 |     "batch_size": 4096,
36 |     "rl_cls": {
37 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
38 |     },
39 |     "rl_kwargs": {
40 |       "batch_size": 64,
41 |       "clip_range": 0.1,
42 |       "ent_coef": 2.257758693006348e-6,
43 |       "gae_lambda": 0.95,
44 |       "gamma": 0.999,
45 |       "learning_rate": 2.0190030388504567e-5,
46 |       "max_grad_norm": 2,
47 |       "n_epochs": 5,
48 |       "vf_coef": 0.6162112311062333
49 |     }
50 |   },
51 |   "total_timesteps": 10000000,
52 |   "policy": {
53 |     "policy_cls": "MlpPolicy",
54 |     "policy_kwargs": {
55 |       "activation_fn": {
56 |         "py/type": "torch.nn.modules.activation.ReLU"
57 |       },
58 |       "features_extractor_class": {
59 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
60 |       },
61 |       "features_extractor_kwargs": {
62 |         "normalize_class": {
63 |           "py/type": "imitation.util.networks.RunningNorm"
64 |         }
65 |       },
66 |       "net_arch": [
67 |         {
68 |           "pi": [
69 |             64,
70 |             64
71 |           ],
72 |           "vf": [
73 |             64,
74 |             64
75 |           ]
76 |         }
77 |       ]
78 |     }
79 |   },
80 |   "policy_evaluation": {
81 |     "n_episodes_eval": 50
82 |   },
83 |   "environment": {
84 |     "gym_id": "seals/Swimmer-v1"
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/gail_seals_walker_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 512,
 4 |     "gen_replay_buffer_capacity": 16384,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "expert": {
14 |     "loader_kwargs": {
15 |       "gym_id": "seals/Walker2d-v1",
16 |       "organization": "HumanCompatibleAI"
17 |     }
18 |   },
19 |   "reward": {
20 |     "add_std_alpha": null,
21 |     "ensemble_size": null,
22 |     "net_cls": {
23 |       "py/type": "imitation.rewards.reward_nets.BasicRewardNet"
24 |     },
25 |     "net_kwargs": {
26 |       "normalize_input_layer": {
27 |         "py/type": "imitation.util.networks.RunningNorm"
28 |       }
29 |     },
30 |     "normalize_output_layer": {
31 |       "py/type": "imitation.util.networks.RunningNorm"
32 |     }
33 |   },
34 |   "rl": {
35 |     "batch_size": 16384,
36 |     "rl_cls": {
37 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
38 |     },
39 |     "rl_kwargs": {
40 |       "batch_size": 128,
41 |       "clip_range": 0.4,
42 |       "ent_coef": 0.0007566389899529574,
43 |       "gae_lambda": 0.92,
44 |       "gamma": 0.98,
45 |       "learning_rate": 1.943992487657563e-5,
46 |       "max_grad_norm": 0.6,
47 |       "n_epochs": 20,
48 |       "vf_coef": 0.6167177795726859
49 |     }
50 |   },
51 |   "total_timesteps": 10000000,
52 |   "policy": {
53 |     "policy_cls": "MlpPolicy",
54 |     "policy_kwargs": {
55 |       "activation_fn": {
56 |         "py/type": "torch.nn.modules.activation.ReLU"
57 |       },
58 |       "features_extractor_class": {
59 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
60 |       },
61 |       "features_extractor_kwargs": {
62 |         "normalize_class": {
63 |           "py/type": "imitation.util.networks.RunningNorm"
64 |         }
65 |       },
66 |       "net_arch": [
67 |         {
68 |           "pi": [
69 |             64,
70 |             64
71 |           ],
72 |           "vf": [
73 |             64,
74 |             64
75 |           ]
76 |         }
77 |       ]
78 |     }
79 |   },
80 |   "policy_evaluation": {
81 |     "n_episodes_eval": 50
82 |   },
83 |   "environment": {
84 |     "gym_id": "seals/Walker2d-v1"
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/airl_seals_swimmer_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 128,
 4 |     "gen_replay_buffer_capacity": 16384,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "expert": {
14 |     "loader_kwargs": {
15 |       "gym_id": "seals/Swimmer-v1",
16 |       "organization": "HumanCompatibleAI"
17 |     }
18 |   },
19 |   "reward": {
20 |     "add_std_alpha": null,
21 |     "ensemble_size": null,
22 |     "net_cls": {
23 |       "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet"
24 |     },
25 |     "net_kwargs": {
26 |       "normalize_input_layer": {
27 |         "py/type": "imitation.util.networks.RunningNorm"
28 |       }
29 |     },
30 |     "normalize_output_layer": {
31 |       "py/type": "imitation.util.networks.RunningNorm"
32 |     }
33 |   },
34 |   "rl": {
35 |     "batch_size": 16384,
36 |     "rl_cls": {
37 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
38 |     },
39 |     "rl_kwargs": {
40 |       "batch_size": 64,
41 |       "clip_range": 0.1,
42 |       "ent_coef": 0.006137718463434523,
43 |       "gae_lambda": 0.95,
44 |       "gamma": 0.999,
45 |       "learning_rate": 0.0013390060486393868,
46 |       "max_grad_norm": 2,
47 |       "n_epochs": 5,
48 |       "vf_coef": 0.6162112311062333
49 |     }
50 |   },
51 |   "total_timesteps": 10000000,
52 |   "policy": {
53 |     "policy_cls": "MlpPolicy",
54 |     "policy_kwargs": {
55 |       "activation_fn": {
56 |         "py/type": "torch.nn.modules.activation.ReLU"
57 |       },
58 |       "features_extractor_class": {
59 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
60 |       },
61 |       "features_extractor_kwargs": {
62 |         "normalize_class": {
63 |           "py/type": "imitation.util.networks.RunningNorm"
64 |         }
65 |       },
66 |       "net_arch": [
67 |         {
68 |           "pi": [
69 |             64,
70 |             64
71 |           ],
72 |           "vf": [
73 |             64,
74 |             64
75 |           ]
76 |         }
77 |       ]
78 |     }
79 |   },
80 |   "policy_evaluation": {
81 |     "n_episodes_eval": 50
82 |   },
83 |   "environment": {
84 |     "gym_id": "seals/Swimmer-v1"
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/tuned_hps/airl_seals_walker_best_hp_eval.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "algorithm_kwargs": {
 3 |     "demo_batch_size": 512,
 4 |     "gen_replay_buffer_capacity": 16384,
 5 |     "n_disc_updates_per_round": 16
 6 |   },
 7 |   "checkpoint_interval": 0,
 8 |   "demonstrations": {
 9 |     "source": "huggingface",
10 |     "algo_name": "ppo",
11 |     "n_expert_demos": null
12 |   },
13 |   "expert": {
14 |     "loader_kwargs": {
15 |       "gym_id": "seals/Walker2d-v1",
16 |       "organization": "HumanCompatibleAI"
17 |     }
18 |   },
19 |   "reward": {
20 |     "add_std_alpha": null,
21 |     "ensemble_size": null,
22 |     "net_cls": {
23 |       "py/type": "imitation.rewards.reward_nets.BasicShapedRewardNet"
24 |     },
25 |     "net_kwargs": {
26 |       "normalize_input_layer": {
27 |         "py/type": "imitation.util.networks.RunningNorm"
28 |       }
29 |     },
30 |     "normalize_output_layer": {
31 |       "py/type": "imitation.util.networks.RunningNorm"
32 |     }
33 |   },
34 |   "rl": {
35 |     "batch_size": 16384,
36 |     "rl_cls": {
37 |       "py/type": "stable_baselines3.ppo.ppo.PPO"
38 |     },
39 |     "rl_kwargs": {
40 |       "batch_size": 128,
41 |       "clip_range": 0.4,
42 |       "ent_coef": 0.002003867232707145,
43 |       "gae_lambda": 0.92,
44 |       "gamma": 0.98,
45 |       "learning_rate": 3.052170958603811e-5,
46 |       "max_grad_norm": 0.6,
47 |       "n_epochs": 20,
48 |       "vf_coef": 0.6167177795726859
49 |     }
50 |   },
51 |   "total_timesteps": 10000000,
52 |   "policy": {
53 |     "policy_cls": "MlpPolicy",
54 |     "policy_kwargs": {
55 |       "activation_fn": {
56 |         "py/type": "torch.nn.modules.activation.ReLU"
57 |       },
58 |       "features_extractor_class": {
59 |         "py/type": "imitation.policies.base.NormalizeFeaturesExtractor"
60 |       },
61 |       "features_extractor_kwargs": {
62 |         "normalize_class": {
63 |           "py/type": "imitation.util.networks.RunningNorm"
64 |         }
65 |       },
66 |       "net_arch": [
67 |         {
68 |           "pi": [
69 |             64,
70 |             64
71 |           ],
72 |           "vf": [
73 |             64,
74 |             64
75 |           ]
76 |         }
77 |       ]
78 |     }
79 |   },
80 |   "policy_evaluation": {
81 |     "n_episodes_eval": 50
82 |   },
83 |   "environment": {
84 |     "gym_id": "seals/Walker2d-v1"
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/docs/algorithms/bc.rst:
--------------------------------------------------------------------------------
 1 | .. _behavioral cloning docs:
 2 | 
 3 | =======================
 4 | Behavioral Cloning (BC)
 5 | =======================
 6 | 
 7 | Behavioral cloning directly learns a policy by using supervised learning on
 8 | observation-action pairs from expert demonstrations. It is a simple approach to learning
 9 | a policy, but the policy often generalizes poorly and does not recover well from errors.
10 | 
11 | Alternatives to behavioral cloning include :ref:`DAgger <dagger docs>` (similar but gathers
12 | on-policy demonstrations) and :ref:`GAIL <gail docs>`/:ref:`AIRL <airl docs>` (more robust
13 | approaches to learning from demonstrations).
14 | 
15 | Example
16 | =======
17 | 
18 | Detailed example notebook: :doc:`../tutorials/1_train_bc`
19 | 
20 | .. testcode::
21 |     :skipif: skip_doctests
22 | 
23 |     import numpy as np
24 |     import gymnasium as gym
25 |     from stable_baselines3.common.evaluation import evaluate_policy
26 | 
27 |     from imitation.algorithms import bc
28 |     from imitation.data import rollout
29 |     from imitation.data.wrappers import RolloutInfoWrapper
30 |     from imitation.policies.serialize import load_policy
31 |     from imitation.util.util import make_vec_env
32 | 
33 |     rng = np.random.default_rng(0)
34 |     env = make_vec_env(
35 |         "seals:seals/CartPole-v0",
36 |         rng=rng,
37 |         n_envs=1,
38 |         post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # for computing rollouts
39 |     )
40 |     expert = load_policy(
41 |         "ppo-huggingface",
42 |         organization="HumanCompatibleAI",
43 |         env_name="seals-CartPole-v0",
44 |         venv=env,
45 |     )
46 |     rollouts = rollout.rollout(
47 |         expert,
48 |         env,
49 |         rollout.make_sample_until(min_timesteps=None, min_episodes=50),
50 |         rng=rng,
51 |     )
52 |     transitions = rollout.flatten_trajectories(rollouts)
53 | 
54 |     bc_trainer = bc.BC(
55 |         observation_space=env.observation_space,
56 |         action_space=env.action_space,
57 |         demonstrations=transitions,
58 |         rng=rng,
59 |     )
60 |     bc_trainer.train(n_epochs=1)
61 |     reward, _ = evaluate_policy(bc_trainer.policy, env, 10)
62 |     print("Reward:", reward)
63 | 
64 | .. testoutput::
65 |     :hide:
66 | 
67 |     ...
68 | 
69 | API
70 | ===
71 | .. autoclass:: imitation.algorithms.bc.BC
72 |     :members:
73 |     :noindex:
74 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/convert_trajs.py:
--------------------------------------------------------------------------------
 1 | """Converts old-style pickle or npz trajectories to new-style HuggingFace datasets.
 2 | 
 3 | See https://github.com/HumanCompatibleAI/imitation/pull/448 for a description
 4 | of the new trajectory format.
 5 | 
 6 | This script takes as command-line input multiple paths to saved trajectories,
 7 | in the old .pkl or .npz format. It then saves each sequence in the new HuggingFace
 8 | datasets format. The path is the same as the original but a directory without an
 9 | extension (i.e. "A.pkl" -> "A/", "A.npz" -> "A/", "A/" -> "A/", "A.foo" -> "A/").
10 | """
11 | 
12 | import pathlib
13 | import warnings
14 | 
15 | from imitation.data import huggingface_utils, serialize, types
16 | from imitation.util import util
17 | 
18 | 
def update_traj_file_in_place(path_str: types.AnyPath, /) -> pathlib.Path:
    """Re-saves old-style (pickle or npz) trajectories in the HuggingFace format.

    The converted data is written to the input path with its final suffix
    stripped (e.g. "A.pkl" -> "A").

    Args:
        path_str: Location of the saved trajectories that should be converted.

    Returns:
        The path of the converted dataset, or the unchanged input path when the
        loaded data already is in the new format.
    """
    source_path = util.parse_path(path_str)

    with warnings.catch_warnings():
        # Old-format files are exactly what this script expects, so the
        # deprecation notice emitted while loading them is silenced.
        warnings.filterwarnings(
            "ignore",
            message="Loading old .* version of Trajectories.*",
            category=DeprecationWarning,
        )
        loaded = serialize.load(source_path)

    if isinstance(loaded, huggingface_utils.TrajectoryDatasetSequence):
        # Nothing to convert: report it and hand back the original location.
        warnings.warn(f"File {source_path} is already in the new format. Skipping.")
        return source_path

    target_path = source_path.with_suffix("")
    serialize.save(target_path, loaded)
    return target_path
52 | 
53 | 
def main():  # pragma: no cover
    """Command-line entry point: converts every path given in ``sys.argv[1:]``."""
    import sys

    traj_paths = sys.argv[1:]
    if not traj_paths:
        print("Supply at least one path to pickled trajectories.")
        return

    for traj_path in traj_paths:
        update_traj_file_in_place(traj_path)
62 | 
63 | 
64 | if __name__ == "__main__":
65 |     main()
66 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/wb.py:
--------------------------------------------------------------------------------
 1 | """This ingredient provides Weights & Biases logging."""
 2 | 
 3 | import logging
 4 | from typing import Any, Mapping, Optional
 5 | 
 6 | import sacred
 7 | 
 8 | wandb_ingredient = sacred.Ingredient("logging.wandb")
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
@wandb_ingredient.config
def wandb_config():
    # Default configuration of the W&B ingredient; every local variable below
    # becomes a config entry.
    # Other users can overwrite this function to customize their wandb.init() call.
    wandb_tag = None  # User-specified tag for this run
    wandb_name_prefix = ""  # User-specified prefix for the run name
    wandb_kwargs = dict(
        project="imitation",
        monitor_gym=False,
        save_code=False,
    )  # Other kwargs to pass to wandb.init()
    wandb_additional_info = dict()  # Extra entries merged into the wandb run config

    locals()  # quieten flake8 unused variable warning
25 | 
26 | 
@wandb_ingredient.capture
def wandb_init(
    _run,
    wandb_name_prefix: str,
    wandb_tag: Optional[str],
    wandb_kwargs: Mapping[str, Any],
    wandb_additional_info: Mapping[str, Any],
    log_dir: str,
) -> None:
    """Calls ``wandb.init()`` with a name, tags and config derived from the run.

    Args:
        _run: Sacred run object. Its ``config`` supplies the environment id, the
            seed and the base of the W&B experiment ``config``.
        wandb_name_prefix: User-specified prefix for wandb run name.
        wandb_tag: User-specified tag for this run; omitted from the tags if falsy.
        wandb_kwargs: User-specified kwargs for wandb.init(). The ``name``,
            ``tags`` and ``dir`` entries are always overridden.
        wandb_additional_info: User-specific additional info to add to wandb
            experiment ``config``.
        log_dir: Passed to ``wandb.init()`` as ``dir``.

    Raises:
        ModuleNotFoundError: wandb is not installed.
    """
    gym_id = _run.config["environment"]["gym_id"]
    seed = _run.config["seed"]

    run_tags = [gym_id, f"seed{seed}"]
    if wandb_tag:
        run_tags = run_tags + [wandb_tag]

    # Start from the user-supplied kwargs, then force the derived entries.
    init_kwargs = dict(wandb_kwargs)
    init_kwargs.update(
        name=f"{wandb_name_prefix}-{gym_id}-seed{seed}",
        tags=run_tags,
        dir=log_dir,
    )

    try:
        import wandb
    except ModuleNotFoundError as missing:
        raise ModuleNotFoundError(
            "Trying to call `wandb.init()` but `wandb` not installed: "
            "try `pip install wandb`.",
        ) from missing

    experiment_config = dict(**_run.config)
    experiment_config.update(wandb_additional_info)
    wandb.init(config=experiment_config, **init_kwargs)
68 | 


--------------------------------------------------------------------------------
/docs/getting-started/what_is_imitation.rst:
--------------------------------------------------------------------------------
 1 | ======================
 2 | What is ``imitation``?
 3 | ======================
 4 | ``imitation`` is an open-source library providing high-quality, reliable and modular implementations of seven reward and imitation learning algorithms, built on modern backends like `PyTorch <https://pytorch.org/>`_ and `Stable Baselines3 <https://github.com/DLR-RM/stable-baselines3>`_. It includes implementations of :ref:`Behavioral Cloning (BC) <behavioral cloning docs>`, :ref:`DAgger <dagger docs>`, :ref:`Generative Adversarial Imitation Learning (GAIL) <gail docs>`, :ref:`Adversarial Inverse Reinforcement Learning (AIRL) <airl docs>`, :ref:`Reward Learning through Preference Comparisons <preference comparisons docs>`,  :ref:`Maximum Causal Entropy Inverse Reinforcement Learning (MCE IRL) <mce irl docs>`, and :ref:`Density-based reward modeling <density docs>`. The algorithms follow a consistent interface, making it simple to train and compare a range of algorithms.
 5 | 
 6 | A key use case of ``imitation`` is as an experimental baseline. Small implementation details in imitation learning algorithms can have significant impacts
 7 | on performance, which can lead to spurious positive results if a weak experimental baseline is used. To address this challenge, ``imitation``'s algorithms have been carefully benchmarked and compared to prior implementations. The codebase is statically type-checked and over 90% of it is covered by automated tests.
 8 | 
 9 | In addition to providing reliable baselines, ``imitation`` aims to simplify the process of developing novel reward and imitation learning algorithms. Its implementations are *modular*: users can freely change the reward or policy network architecture, RL algorithm and optimizer without touching the codebase itself. Algorithms can be extended by subclassing and overriding relevant methods. ``imitation`` also provides utility methods to handle common tasks to support the development of entirely novel algorithms.
10 | 
11 | Our goal for ``imitation`` is to make it easier to use, develop, and compare imitation and reward learning algorithms. The library is in active development, and we welcome contributions and feedback.
12 | 
13 | Check out our recommended
14 | :ref:`First Steps <First Steps>` for an overview of how to use the library. We also have tutorials, such as :doc:`../tutorials/1_train_bc`, that provide detailed examples of specific algorithms. If you are interested in helping develop ``imitation`` then we suggest you refer to the :ref:`Developer Guide <DevGuide>` as well as more specific guidelines for :ref:`Contributing <Contributing>`.
15 | 


--------------------------------------------------------------------------------
/src/imitation/testing/reward_improvement.py:
--------------------------------------------------------------------------------
 1 | """Utility functions used to check if rewards improved relative to previous rewards."""
 2 | from typing import Iterable
 3 | 
 4 | import numpy as np
 5 | from scipy import stats
 6 | 
 7 | 
 8 | def is_significant_reward_improvement(
 9 |     old_rewards: Iterable[float],
10 |     new_rewards: Iterable[float],
11 |     p_value: float = 0.05,
12 | ) -> bool:
13 |     """Checks if the new rewards are really better than the old rewards.
14 | 
15 |     Ensures that this is not just due to lucky sampling by a permutation test.
16 | 
17 |     Args:
18 |         old_rewards: Iterable of "old" trajectory rewards (e.g. before training).
19 |         new_rewards: Iterable of "new" trajectory rewards (e.g. after training).
20 |         p_value: The maximum probability, that the old rewards are just as good as the
21 |             new rewards, that we tolerate.
22 | 
23 |     Returns:
24 |         True, if the new rewards are most probably better than the old rewards.
25 |         For this, the probability, that the old rewards are just as good as the new
26 |         rewards must be below `p_value`.
27 | 
28 |     >>> is_significant_reward_improvement((5, 6, 7, 4, 4), (7, 5, 9, 9, 12))
29 |     True
30 | 
31 |     >>> is_significant_reward_improvement((5, 6, 7, 4, 4), (7, 5, 9, 7, 4))
32 |     False
33 | 
34 |     >>> is_significant_reward_improvement((5, 6, 7, 4, 4), (7, 5, 9, 7, 4), p_value=0.3)
35 |     True
36 |     """
37 |     permutation_test_result = stats.permutation_test(
38 |         (old_rewards, new_rewards),
39 |         statistic=lambda x, y, axis: np.mean(x, axis=axis) - np.mean(y, axis=axis),
40 |         vectorized=True,
41 |         alternative="less",
42 |     )
43 | 
44 |     return permutation_test_result.pvalue < p_value
45 | 
46 | 
def mean_reward_improved_by(
    old_rews: Iterable[float],
    new_rews: Iterable[float],
    min_improvement: float,
) -> bool:
    """Checks if mean rewards improved w.r.t. old rewards by a certain amount.

    Args:
        old_rews: Iterable of "old" trajectory rewards (e.g. before training).
            Any iterable is accepted, including generators.
        new_rews: Iterable of "new" trajectory rewards (e.g. after training).
            Any iterable is accepted, including generators.
        min_improvement: The minimum amount of improvement that we expect.

    Returns:
        True if the mean of the new rewards is larger than the mean of the old rewards
        by at least min_improvement. The result is a builtin `bool`.

    >>> mean_reward_improved_by([5, 8, 7], [8, 9, 10], 2)
    True

    >>> mean_reward_improved_by([5, 8, 7], [8, 9, 10], 5)
    False
    """
    # `np.mean` cannot consume iterators/generators directly, so materialize the
    # rewards into float arrays first.
    old_mean = np.mean(np.asarray(list(old_rews), dtype=float))
    new_mean = np.mean(np.asarray(list(new_rews), dtype=float))
    # Convert the numpy bool so callers (and doctests) see a plain `bool`.
    return bool(new_mean - old_mean >= min_improvement)
71 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/expert.py:
--------------------------------------------------------------------------------
 1 | """This ingredient provides an expert policy.
 2 | 
 3 | The expert policy is either loaded from disk or from the HuggingFace Model Hub or is
 4 | a test policy (e.g., random or zero).
 5 | The supported policy types are:
 6 | 
 7 | - :code:`ppo` and :code:`sac`: A policy trained with SB3.
 8 |     Needs a `path` in the `loader_kwargs`.
 9 | - :code:`<algo>-huggingface` (algo can be `ppo` or `sac`):
10 |     A policy trained with SB3 and uploaded to the HuggingFace Model Hub.
11 |     Will load the model from the repo :code:`<organization>/<algo>-<env_name>`.
12 |     You can set the organization with the `organization` key in :code:`loader_kwargs`.
13 |     The default is `HumanCompatibleAI`.
14 | - :code:`random`: A policy that takes random actions.
15 | - :code:`zero`: A policy that takes zero actions.
16 | """
17 | import sacred
18 | 
19 | from imitation.policies import serialize
20 | from imitation.scripts.ingredients import environment
21 | 
22 | expert_ingredient = sacred.Ingredient(
23 |     "expert",
24 |     ingredients=[environment.environment_ingredient],
25 | )
26 | 
27 | 
@expert_ingredient.config
def config():
    # Default configuration of the expert ingredient.
    # Kind of expert policy to provide:
    # [ppo, sac, random, zero, ppo-huggingface, sac-huggingface] or your own.
    policy_type = "ppo-huggingface"
    # Keyword arguments forwarded to the policy loader.
    # See imitation.policies.serialize.load_policy for options.
    loader_kwargs = dict()
    locals()  # quieten flake8
35 | 
36 | 
@expert_ingredient.config_hook
def config_hook(config, command_name, logger):
    """Fills in loader defaults that depend on the selected expert policy type.

    Args:
        config: The full experiment configuration; its "expert" and
            "environment" sections are read.
        command_name: Unused.
        logger: Unused.

    Returns:
        The (possibly updated) "expert" section of `config`.
    """
    expert_cfg = config["expert"]

    if "huggingface" in expert_cfg["policy_type"]:
        hub_kwargs = expert_cfg["loader_kwargs"]

        # Default organization for policies hosted on the HuggingFace Model Hub.
        if "organization" not in hub_kwargs:
            hub_kwargs["organization"] = "HumanCompatibleAI"

        # Note: unfortunately we need to pass the venv **and** its name to the
        # huggingface policy loader since there is no way to get the name from the
        # venv. The name is needed to deduce the repo id and load the correct
        # huggingface model.
        hub_kwargs["env_name"] = config["environment"]["gym_id"]

    # Note: this only serves the purpose of indicating that you need to specify
    #   the path for local policies. It makes the config more explicit.
    if expert_cfg["policy_type"] in ("ppo", "sac"):  # pragma: no cover
        local_kwargs = expert_cfg["loader_kwargs"]
        if "path" not in local_kwargs:
            local_kwargs["path"] = None

    return expert_cfg
59 | 
60 | 
@expert_ingredient.capture
def get_expert_policy(venv, policy_type, loader_kwargs):
    """Loads the expert policy of the configured type for `venv`.

    Args:
        venv: Environment handed to the policy loader.
        policy_type: Kind of policy to load (see the module docstring).
        loader_kwargs: Extra keyword arguments for the policy loader.

    Returns:
        The result of `serialize.load_policy`.
    """
    expert = serialize.load_policy(policy_type, venv, **loader_kwargs)
    return expert
64 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/policy.py:
--------------------------------------------------------------------------------
 1 | """This ingredient provides a newly constructed stable-baselines3 policy."""
 2 | 
 3 | import logging
 4 | from typing import Any, Mapping, Type
 5 | 
 6 | import sacred
 7 | from stable_baselines3.common import policies, utils, vec_env
 8 | 
 9 | import imitation.util.networks
10 | from imitation.policies import base
11 | from imitation.scripts.ingredients import logging as logging_ingredient
12 | 
13 | policy_ingredient = sacred.Ingredient(
14 |     "policy",
15 |     ingredients=[logging_ingredient.logging_ingredient],
16 | )
17 | logger = logging.getLogger(__name__)
18 | 
19 | 
@policy_ingredient.config
def config():
    # Default configuration of the policy ingredient.
    # Training
    policy_cls = base.FeedForward32Policy  # Class instantiated by `make_policy`
    policy_kwargs = {}  # Keyword arguments for the `policy_cls` constructor

    locals()  # quieten flake8
27 | 
28 | 
@policy_ingredient.named_config
def sac():
    # Named config overriding the default `policy_cls` with `base.SAC1024Policy`.
    policy_cls = base.SAC1024Policy  # noqa: F841
32 | 
33 | 
# Policy kwargs selecting `base.NormalizeFeaturesExtractor` as features extractor,
# configured with `RunningNorm` as its `normalize_class`.
# Used by the `normalize_running` named config.
NORMALIZE_RUNNING_POLICY_KWARGS = {
    "features_extractor_class": base.NormalizeFeaturesExtractor,
    "features_extractor_kwargs": {
        "normalize_class": imitation.util.networks.RunningNorm,
    },
}
40 | 
41 | 
@policy_ingredient.named_config
def normalize_running():
    # Named config replacing `policy_kwargs` with NORMALIZE_RUNNING_POLICY_KWARGS.
    policy_kwargs = NORMALIZE_RUNNING_POLICY_KWARGS  # noqa: F841
45 | 
46 | 
# Default config for CNN Policies
@policy_ingredient.named_config
def cnn_policy():
    # Named config overriding `policy_cls` with SB3's `ActorCriticCnnPolicy`.
    policy_cls = policies.ActorCriticCnnPolicy  # noqa: F841
51 | 
52 | 
@policy_ingredient.capture
def make_policy(
    venv: vec_env.VecEnv,
    policy_cls: Type[policies.BasePolicy],
    policy_kwargs: Mapping[str, Any],
) -> policies.BasePolicy:
    """Instantiates a new policy of the configured class.

    Args:
        venv: Vectorized environment we will be imitating demos from. Its
            observation and action spaces are passed to actor-critic policies.
        policy_cls: Type of a Stable Baselines3 policy architecture.
        policy_kwargs: Keyword arguments for policy constructor. The mapping is
            copied before use, so the passed object is not modified.

    Returns:
        A Stable Baselines3 policy.
    """
    ctor_kwargs = dict(policy_kwargs)

    if issubclass(policy_cls, policies.ActorCriticPolicy):
        ctor_kwargs["observation_space"] = venv.observation_space
        ctor_kwargs["action_space"] = venv.action_space
        # parameter mandatory for ActorCriticPolicy, but not used by BC
        ctor_kwargs["lr_schedule"] = utils.get_schedule_fn(1)

    new_policy: policies.BasePolicy = policy_cls(**ctor_kwargs)
    logger.info(f"Policy network summary:\n {new_policy}")
    return new_policy
85 | 


--------------------------------------------------------------------------------
/docs/algorithms/mce_irl.rst:
--------------------------------------------------------------------------------
 1 | .. _mce irl docs:
 2 | 
 3 | ===============================================================
 4 | Maximum Causal Entropy Inverse Reinforcement Learning (MCE IRL)
 5 | ===============================================================
 6 | 
 7 | Implements `Modeling Interaction via the Principle of Maximum Causal Entropy <https://www.cs.cmu.edu/~bziebart/publications/maximum-causal-entropy.pdf>`_.
 8 | 
 9 | Example
10 | =======
11 | 
12 | Detailed example notebook: :doc:`../tutorials/6_train_mce`
13 | 
14 | .. testcode::
15 |     :skipif: skip_doctests
16 | 
17 |     from functools import partial
18 | 
19 |     from seals import base_envs
20 |     from seals.diagnostics.cliff_world import CliffWorldEnv
21 |     import numpy as np
22 | 
23 |     from stable_baselines3.common.vec_env import DummyVecEnv
24 | 
25 |     from imitation.algorithms.mce_irl import (
26 |         MCEIRL,
27 |         mce_occupancy_measures,
28 |         mce_partition_fh,
29 |     )
30 |     from imitation.data import rollout
31 |     from imitation.rewards import reward_nets
32 | 
33 |     rng = np.random.default_rng(0)
34 | 
35 |     env_creator = partial(CliffWorldEnv, height=4, horizon=8, width=7, use_xy_obs=True)
36 |     env_single = env_creator()
37 | 
38 |     state_env_creator = lambda: base_envs.ExposePOMDPStateWrapper(env_creator())
39 | 
40 |     # This is just a vectorized environment because `generate_trajectories` expects one
41 |     state_venv = DummyVecEnv([state_env_creator] * 4)
42 | 
43 |     _, _, pi = mce_partition_fh(env_single)
44 | 
45 |     _, om = mce_occupancy_measures(env_single, pi=pi)
46 | 
47 |     reward_net = reward_nets.BasicRewardNet(
48 |         env_single.observation_space,
49 |         env_single.action_space,
50 |         hid_sizes=[256],
51 |         use_action=False,
52 |         use_done=False,
53 |         use_next_state=False,
54 |     )
55 | 
56 |     # training on analytically computed occupancy measures
57 |     mce_irl = MCEIRL(
58 |         om,
59 |         env_single,
60 |         reward_net,
61 |         log_interval=250,
62 |         optimizer_kwargs={"lr": 0.01},
63 |         rng=rng,
64 |     )
65 |     occ_measure = mce_irl.train()
66 | 
67 |     imitation_trajs = rollout.generate_trajectories(
68 |         policy=mce_irl.policy,
69 |         venv=state_venv,
70 |         sample_until=rollout.make_min_timesteps(5000),
71 |         rng=rng,
72 |     )
73 |     print("Imitation stats: ", rollout.rollout_stats(imitation_trajs))
74 | 
75 | .. testoutput::
76 |     :hide:
77 | 
78 |     ...
79 | 
80 | API
81 | ===
82 | .. autoclass:: imitation.algorithms.mce_irl.MCEIRL
83 |     :members:
84 |     :inherited-members:
85 |     :noindex:
86 | 
87 | .. autoclass:: imitation.algorithms.base.DemonstrationAlgorithm
88 |     :members:
89 |     :noindex:
90 | 


--------------------------------------------------------------------------------
/tests/scripts/ingredients/test_rewards.py:
--------------------------------------------------------------------------------
 1 | """Tests for imitation.scripts.ingredients.reward."""
 2 | from typing import Any, Mapping
 3 | 
 4 | import pytest
 5 | 
 6 | from imitation.rewards import reward_nets
 7 | from imitation.scripts.ingredients import reward
 8 | from imitation.util import networks
 9 | 
10 | 
@pytest.fixture
def member_config() -> Mapping[str, Any]:
    """Configuration used for each member of the reward ensembles under test."""
    return dict(
        net_cls=reward_nets.BasicRewardNet,
        net_kwargs={},
        normalize_output_layer=None,
    )
18 | 
19 | 
def test_make_reward_ensemble(member_config, cartpole_venv):
    """The ensemble is returned bare, or wrapped once `add_std_alpha` is given."""
    expected_type_by_alpha = (
        (None, reward_nets.RewardEnsemble),
        (0, reward_nets.AddSTDRewardWrapper),
    )
    for std_alpha, expected_type in expected_type_by_alpha:
        reward_net = reward.make_reward_net(
            venv=cartpole_venv,
            net_cls=reward_nets.RewardEnsemble,
            add_std_alpha=std_alpha,
            net_kwargs={},
            normalize_output_layer=None,
            ensemble_size=3,
            ensemble_member_config=member_config,
        )
        assert isinstance(reward_net, expected_type)
42 | 
43 | 
def test_make_reward_errors(member_config, cartpole_venv):
    """Invalid ensemble configurations are rejected with a ValueError."""
    # (expected error message, normalize_output_layer, ensemble_size, member config)
    invalid_cases = (
        (r"Must specify ensemble_size.", None, None, member_config),
        (r"Must specify ensemble_member_config.", None, 5, None),
        (
            r"Output normalization not supported on RewardEnsembles.",
            networks.RunningNorm,
            5,
            member_config,
        ),
    )

    for expected_message, output_norm, size, members in invalid_cases:
        with pytest.raises(ValueError, match=expected_message):
            reward.make_reward_net(
                venv=cartpole_venv,
                net_cls=reward_nets.RewardEnsemble,
                add_std_alpha=None,
                net_kwargs={},
                normalize_output_layer=output_norm,
                ensemble_size=size,
                ensemble_member_config=members,
            )
80 | 


--------------------------------------------------------------------------------
/src/imitation/util/sacred_file_parsing.py:
--------------------------------------------------------------------------------
 1 | """Utilities to parse sacred run directories."""
 2 | import json
 3 | import pathlib
 4 | import warnings
 5 | from collections import defaultdict
 6 | from typing import Any, Dict, Generator, List, Tuple
 7 | 
# Parsed content of a sacred `run.json` file.
SacredRun = Dict[str, Any]
# Pair of the parsed `config.json` and `run.json` contents of one run.
SacredConfAndRun = Tuple[Dict[str, Any], SacredRun]
# Runs indexed first by algorithm and then by environment.
GroupedRuns = Dict[str, Dict[str, List[SacredRun]]]
11 | 
12 | 
def find_sacred_runs(
    run_path: pathlib.Path,
    only_completed_runs: bool = False,
) -> Generator[SacredConfAndRun, None, None]:
    """Recursively iterates the sacred runs found below the given path.

    Assumes runs in the format of the sacred FileStorageObserver: each run consists
    of a folder that contains a config.json and a run.json file.

    Note: will work with nested directories and can therefore be applied to the
    `output/sacred` folder of the command line interface which creates sub-folders for
    each script.

    Args:
        run_path: The path to search for sacred run directories.
        only_completed_runs: If True, only yields runs that have a run.json file with
            status "COMPLETED". Runs whose run.json has no "status" entry are
            treated as not completed.

    Yields:
        Tuples of (config, run) dicts.
    """
    for config_path in run_path.rglob("config.json"):
        # Use a dedicated name here: reusing `run_path` would silently replace
        # the search root passed in by the caller.
        run_json_path = config_path.parent / "run.json"

        if not run_json_path.exists():  # pragma: no cover
            warnings.warn(f"Run {config_path.parent} has no run.json")
            continue

        # Read with an explicit encoding so that parsing does not depend on the
        # locale of the machine analyzing the runs.
        run = json.loads(run_json_path.read_text(encoding="utf-8"))
        if only_completed_runs and run.get("status") != "COMPLETED":
            continue
        conf = json.loads(config_path.read_text(encoding="utf-8"))
        yield conf, run
45 | 
46 | 
def group_runs_by_algo_and_env(
    path: pathlib.Path,
    only_completed_runs: bool = False,
) -> GroupedRuns:
    """Collects the runs below `path` into a nested algorithm/environment mapping.

    The runs of algorithm `algo` on environment `env` are available as
    `result[algo][env]`. The algorithm is taken from the run's "command" entry and
    the environment from the "gym_id" of the config's "environment" section.

    Args:
        path: The path to search for sacred run directories.
        only_completed_runs: If True, only runs that have a run.json file with
            status "COMPLETED" are included.

    Returns:
        A dictionary mapping algorithms to environments to lists of runs.
    """
    grouped: GroupedRuns = defaultdict(lambda: defaultdict(list))

    for run_config, run_info in find_sacred_runs(path, only_completed_runs):
        algo_name = run_info["command"]
        env_id = run_config["environment"]["gym_id"]
        runs_for_algo = grouped[algo_name]
        runs_for_algo[env_id].append(run_info)

    return grouped
71 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/bc.py:
--------------------------------------------------------------------------------
 1 | """This ingredient provides BC algorithm instance.
 2 | 
 3 | It is either loaded from disk or constructed from scratch.
 4 | """
 5 | import warnings
 6 | from typing import Optional, Sequence
 7 | 
 8 | import sacred
 9 | import torch as th
10 | from stable_baselines3.common import vec_env
11 | 
12 | from imitation.algorithms import bc
13 | from imitation.data import types
14 | from imitation.scripts.ingredients import policy
15 | 
16 | bc_ingredient = sacred.Ingredient("bc", ingredients=[policy.policy_ingredient])
17 | 
18 | 
@bc_ingredient.config
def config():
    # Default configuration of the BC ingredient. `make_bc` receives batch_size,
    # l2_weight, optimizer_cls and optimizer_kwargs from here.
    batch_size = 32
    l2_weight = 3e-5  # L2 regularization weight
    optimizer_cls = th.optim.Adam
    optimizer_kwargs = dict(
        lr=4e-4,
    )
    # NOTE(review): presumably forwarded to `BC.train()` by the calling script;
    # not used within this module - confirm against the caller.
    train_kwargs = dict(
        n_epochs=None,  # Number of BC epochs per DAgger training round
        n_batches=None,  # Number of BC batches per DAgger training round
        log_interval=500,  # Number of updates between Tensorboard/stdout logs
    )
    agent_path = None  # Path to serialized policy. If None, a new policy is created.

    locals()  # quieten flake8 unused variable warning
35 | 
36 | 
@bc_ingredient.capture
def make_bc(
    venv: vec_env.VecEnv,
    expert_trajs: Sequence[types.Trajectory],
    custom_logger,
    batch_size: int,
    l2_weight: float,
    optimizer_cls,
    optimizer_kwargs,
    _rnd,
) -> bc.BC:
    """Builds a BC trainer from the ingredient configuration.

    Args:
        venv: Vectorized environment providing the observation and action spaces.
        expert_trajs: Demonstrations to train on.
        custom_logger: Logger handed to the BC trainer.
        batch_size: Batch size handed to the BC trainer.
        l2_weight: L2 regularization weight.
        optimizer_cls: Optimizer class handed to the BC trainer.
        optimizer_kwargs: Keyword arguments for the optimizer.
        _rnd: Random state, passed to the BC trainer as `rng`.

    Returns:
        The BC trainer; its policy comes from `make_or_load_policy`.
    """
    trainer_kwargs = dict(
        observation_space=venv.observation_space,
        action_space=venv.action_space,
        policy=make_or_load_policy(venv),
        demonstrations=expert_trajs,
        custom_logger=custom_logger,
        rng=_rnd,
        batch_size=batch_size,
        l2_weight=l2_weight,
        optimizer_cls=optimizer_cls,
        optimizer_kwargs=optimizer_kwargs,
    )
    return bc.BC(**trainer_kwargs)
60 | 
61 | 
@bc_ingredient.capture
def make_or_load_policy(venv: vec_env.VecEnv, agent_path: Optional[str]):
    """Makes a policy or loads a policy from a path if provided.

    Args:
        venv: Vectorized environment we will be imitating demos from.
        agent_path: Path to serialized policy. If provided, then load the
            policy from this path. Otherwise, make a new policy.
            Specify only if policy_cls and policy_kwargs are not specified.

    Returns:
        A Stable Baselines3 policy.
    """
    if agent_path is None:
        # The freshly built policy must be returned; without the `return` this
        # function yields None and the configured policy is silently dropped.
        return policy.make_policy(venv)

    warnings.warn(
        "When agent_path is specified, policy.policy_cls and policy.policy_kwargs "
        "are ignored.",
        RuntimeWarning,
    )
    return bc.reconstruct_policy(agent_path)
84 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/ingredients/policy_evaluation.py:
--------------------------------------------------------------------------------
 1 | """This ingredient performs evaluation of learned policy.
 2 | 
 3 | It takes care of the right wrappers, does some rollouts
 4 | and computes statistics of the rollouts.
 5 | """
 6 | 
 7 | from typing import Mapping, Union
 8 | 
 9 | import numpy as np
10 | import sacred
11 | from stable_baselines3.common import base_class, policies, vec_env
12 | 
13 | from imitation.data import rollout
14 | 
15 | policy_evaluation_ingredient = sacred.Ingredient("policy_evaluation")
16 | 
17 | 
@policy_evaluation_ingredient.config
def config():
    # Default configuration of the policy evaluation ingredient.
    n_episodes_eval = 50  # Num of episodes for final mean ground truth return
    locals()  # quieten flake8
22 | 
23 | 
@policy_evaluation_ingredient.named_config
def fast():
    # Named config that evaluates on a single episode (the default is 50).
    n_episodes_eval = 1  # noqa: F841
27 | 
28 | 
@policy_evaluation_ingredient.capture
def eval_policy(
    rl_algo: Union[base_class.BaseAlgorithm, policies.BasePolicy],
    venv: vec_env.VecEnv,
    n_episodes_eval: int,
    _rnd: np.random.Generator,
) -> Mapping[str, float]:
    """Rolls out the learned policy and summarizes the resulting trajectories.

    Has the side effect of setting `rl_algo`'s environment to `venv`
    if it is a `BaseAlgorithm`.

    Args:
        rl_algo: Algorithm (or bare policy) to evaluate.
        venv: Environment to evaluate on.
        n_episodes_eval: Minimum number of episodes to roll out for the
            statistics.
        _rnd: Random number generator provided by Sacred.

    Returns:
        The return value of `rollout.rollout_stats()` on the evaluation rollouts.
    """
    stop_condition = rollout.make_min_episodes(n_episodes_eval)

    if not isinstance(rl_algo, base_class.BaseAlgorithm):
        # A bare policy is rolled out directly on the given environment.
        rollout_env = venv
    else:
        # Set RL algorithm's env to venv, removing any cruft wrappers that the RL
        # algorithm's environment may have accumulated.
        rl_algo.set_env(venv)
        # Generate trajectories with the RL algorithm's env - SB3 may apply wrappers
        # under the hood to get it to work with the RL algorithm (e.g. transposing
        # images, so they can be fed into CNNs).
        rollout_env = rl_algo.get_env()
        assert rollout_env is not None

    eval_trajs = rollout.generate_trajectories(
        rl_algo,
        rollout_env,
        sample_until=stop_condition,
        rng=_rnd,
    )
    return rollout.rollout_stats(eval_trajs)
74 | 


--------------------------------------------------------------------------------
/docs/algorithms/dagger.rst:
--------------------------------------------------------------------------------
 1 | .. _dagger docs:
 2 | 
 3 | =======================
 4 | DAgger
 5 | =======================
 6 | 
 7 | `DAgger <https://arxiv.org/abs/1011.0686>`_ (Dataset Aggregation) iteratively trains a
 8 | policy using supervised learning on a dataset of observation-action pairs from expert demonstrations
 9 | (like :ref:`behavioral cloning <behavioral cloning docs>`), runs the policy to gather
10 | observations, queries the expert for good actions on those observations, and adds the
11 | newly labeled observations to the dataset. DAgger improves on behavioral cloning by
12 | training on a dataset that better resembles the observations the trained policy is
13 | likely to encounter, but it requires querying the expert online.
14 | 
15 | .. note::
16 |     DAgger paper: `A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning <https://arxiv.org/abs/1011.0686>`_
17 | 
18 | Example
19 | =======
20 | 
21 | Detailed example notebook: :doc:`../tutorials/2_train_dagger`
22 | 
23 | .. testcode::
24 |     :skipif: skip_doctests
25 | 
26 |     import tempfile
27 | 
28 |     import numpy as np
29 |     import gymnasium as gym
30 |     from stable_baselines3.common.evaluation import evaluate_policy
31 | 
32 |     from imitation.algorithms import bc
33 |     from imitation.algorithms.dagger import SimpleDAggerTrainer
34 |     from imitation.policies.serialize import load_policy
35 |     from imitation.util.util import make_vec_env
36 | 
37 |     rng = np.random.default_rng(0)
38 |     env = make_vec_env(
39 |         "seals:seals/CartPole-v0",
40 |         rng=rng,
41 |     )
42 |     expert = load_policy(
43 |         "ppo-huggingface",
44 |         organization="HumanCompatibleAI",
45 |         env_name="seals-CartPole-v0",
46 |         venv=env,
47 |     )
48 | 
49 |     bc_trainer = bc.BC(
50 |         observation_space=env.observation_space,
51 |         action_space=env.action_space,
52 |         rng=rng,
53 |     )
54 |     with tempfile.TemporaryDirectory(prefix="dagger_example_") as tmpdir:
55 |         print(tmpdir)
56 |         dagger_trainer = SimpleDAggerTrainer(
57 |             venv=env,
58 |             scratch_dir=tmpdir,
59 |             expert_policy=expert,
60 |             bc_trainer=bc_trainer,
61 |             rng=rng,
62 |         )
63 |         dagger_trainer.train(8_000)
64 | 
65 |     reward, _ = evaluate_policy(dagger_trainer.policy, env, 10)
66 |     print("Reward:", reward)
67 | 
68 | .. testoutput::
69 |     :hide:
70 | 
71 |     ...
72 | 
73 | API
74 | ===
75 | .. autoclass:: imitation.algorithms.dagger.InteractiveTrajectoryCollector
76 |     :members:
77 |     :inherited-members:
78 |     :noindex:
79 | 
80 | .. autoclass:: imitation.algorithms.dagger.DAggerTrainer
81 |     :members:
82 |     :inherited-members:
83 |     :noindex:
84 | 
85 | .. autoclass:: imitation.algorithms.dagger.SimpleDAggerTrainer
86 |     :members:
87 |     :inherited-members:
88 |     :noindex:
89 | 


--------------------------------------------------------------------------------
/docs/development/contributing/code-of-conduct.rst:
--------------------------------------------------------------------------------
 1 | .. _Code of Conduct:
 2 | 
 3 | Code of Conduct
 4 | ===============
 5 | 
 6 | To ensure that the imitation community remains open and inclusive, we have a few ground rules that we ask contributors to adhere to. This isn't an exhaustive list of things that you can't do. Rather, take it in the spirit in which it's intended — a guide to make it easier to enrich all of us and the technical communities in which we participate.
 7 | 
 8 | 
 9 | * **Be friendly and patient**.
10 | * **Be welcoming**. We strive to be a community that welcomes and supports people of all backgrounds and identities. This includes, but is not limited to members of any race, ethnicity, culture, national origin, colour, immigration status, social and economic class, educational level, sex, sexual orientation, gender identity and expression, age, size, family status, political belief, religion, and mental and physical ability.
11 | * **Be considerate**. Your work will be used by other people, and you in turn will depend on the work of others. Any decision you take will affect users and colleagues, and you should take those consequences into account when making decisions. Remember that we're a world-wide community, so you might not be communicating in someone else's primary language.
12 | * **Be respectful**. Not all of us will agree all the time, but disagreement is no excuse for poor behavior and poor manners. We might all experience some frustration now and then, but we cannot allow that frustration to turn into a personal attack. Members of the imitation community should be respectful when dealing with other members as well as with people outside the imitation community.
13 | * **Be careful in the words that you choose**. We are a community of professionals, and we conduct ourselves professionally. Be kind to others. Do not insult or put down other participants. Harassment and other exclusionary behavior aren't acceptable. This includes, but is not limited to:
14 | 
15 |   * Violent threats or language directed against another person.
16 |   * Discriminatory jokes and language.
17 |   * Posting sexually explicit or violent material.
18 |   * Posting (or threatening to post) other people's personally identifying information without their consent ("doxing").
19 |   * Personal insults, especially those using racist or sexist terms.
20 |   * Unwelcome sexual attention.
21 |   * Advocating for, or encouraging, any of the above behavior.
22 |   * Repeated harassment of others. In general, if someone asks you to stop, then stop.
23 | 
24 | * **When we disagree, try to understand why**. It is important that we resolve disagreements and differing views constructively. Focus on helping to resolve issues and learning from mistakes.
25 | 
26 | Adapted from the original text courtesy of the `Django project <https://www.djangoproject.com/conduct/>`_, licensed under a `Creative Commons Attribution 3.0 License <https://creativecommons.org/licenses/by/3.0/>`_.
27 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Based on OpenAI's mujoco-py Dockerfile
 2 | 
 3 | # base stage contains just binary dependencies.
 4 | # This is used in the CI build.
 5 | FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04 AS base
 6 | ARG DEBIAN_FRONTEND=noninteractive
 7 | 
 8 | RUN apt-get update -q \
 9 |     && apt-get install -y --no-install-recommends \
10 |     build-essential \
11 |     curl \
12 |     wget \
13 |     ffmpeg \
14 |     git \
15 |     git-lfs \
16 |     ssh \
17 |     libgl1-mesa-dev \
18 |     libgl1-mesa-glx \
19 |     libglew-dev \
20 |     libosmesa6-dev \
21 |     net-tools \
22 |     parallel \
23 |     python3.8 \
24 |     python3.8-dev \
25 |     python3-pip \
26 |     rsync \
27 |     software-properties-common \
28 |     unzip \
29 |     vim \
30 |     virtualenv \
31 |     xpra \
32 |     xserver-xorg-dev \
33 |     patchelf  \
34 |     && apt-get clean \
35 |     && rm -rf /var/lib/apt/lists/*
36 | 
37 | RUN git lfs install
38 | 
39 | ENV LANG=C.UTF-8
40 | 
41 | # Set the PATH to the venv before we create the venv, so it's visible in base.
42 | # We do this because the venv may be created outside of Docker, e.g. in CI
43 | # or by binding it in for local development.
44 | ENV VIRTUAL_ENV=/venv
45 | ENV PATH="/venv/bin:$PATH"
46 | ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}"
47 | 
48 | # Run Xdummy mock X server by default so that rendering will work.
49 | COPY ci/xorg.conf /etc/dummy_xorg.conf
50 | COPY ci/Xdummy-entrypoint.py /usr/bin/Xdummy-entrypoint.py
51 | ENTRYPOINT ["/usr/bin/Xdummy-entrypoint.py"]
52 | 
53 | # python-req stage contains Python venv, but not code.
54 | # It is useful for development purposes: you can mount
55 | # code from outside the Docker container.
56 | FROM base AS python-req
57 | 
58 | WORKDIR /imitation
59 | # Copy over just setup.py and dependencies (__init__.py and README.md)
60 | # to avoid rebuilding venv when requirements have not changed.
61 | COPY ./setup.py ./setup.py
62 | COPY ./README.md ./README.md
63 | COPY ./src/imitation/__init__.py ./src/imitation/__init__.py
64 | COPY ci/build_and_activate_venv.sh ./ci/build_and_activate_venv.sh
65 | 
66 | # Pass mock value for version because .git is not present in the Docker container
67 | # at this stage, so setuptools-scm cannot determine version automatically.
68 | # setuptools-scm will compute it correctly when it comes to building and installing
69 | # imitation, as .git will then be present.
70 | RUN SETUPTOOLS_SCM_PRETEND_VERSION="dummy" ci/build_and_activate_venv.sh /venv \
71 |     && rm -rf "$HOME/.cache/pip"
72 | 
73 | # full stage contains everything.
74 | # Can be used for deployment and local testing.
75 | FROM python-req AS full
76 | 
77 | # Delay copying (and installing) the code until the very end
78 | COPY . /imitation
79 | # Build a wheel then install to avoid copying whole directory (pip issue #2195)
80 | RUN python3 setup.py sdist bdist_wheel
81 | RUN pip install --upgrade dist/imitation-*.whl
82 | 
83 | # Default entrypoints
84 | CMD ["pytest", "-n", "auto", "-vv", "tests/"]
85 | 


--------------------------------------------------------------------------------
/experiments/benchmark_and_table.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # PREREQUISITE: Run experiments/download_models.sh to get rollouts.
 4 | #
 5 | # You can find expected results for GAIL and AIRL here:
 6 | #   https://github.com/HumanCompatibleAI/imitation/pull/317
 7 | #
 8 | # This end-to-end script runs all the scripts necessary to generate the paper
 9 | # results, and then generates TeX and CSV tables for the experiment output
10 | # in a variety of table verbosities.
11 | #
12 | # To test this script out, run with the --fast flag.
13 | #
14 | # You can reduce stdout verbosity of the imitation runs and view imitation
15 | # progress more easily by using the --tmux or -T flag. However, this flag
16 | # may hide errors from your terminal, so it is not recommended during debugging.
17 | #
18 | # All imitation runs are given the same timestamped `--run_name` so that
19 | # they can be gathered by the table-generating analysis script.
20 | 
21 | set -e  # Exit on error
22 | source experiments/common.sh
23 | 
24 | RUN_NAME="paper-${TIMESTAMP}"
25 | echo "Training with run_name=${RUN_NAME}"
26 | 
27 | script_dir=experiments
28 | fast_flag=()
29 | paper_flag=("--paper")
30 | tmux_flag=()
31 | 
32 | if ! TEMP=$($GNU_GETOPT -o fT -l fast,tmux -- "$@"); then
33 |   exit 1
34 | fi
35 | eval set -- "$TEMP"
36 | 
37 | while true; do
38 |   case "$1" in
39 |     -f | --fast)
40 |       # Use this flag to quickly test a shortened benchmark and table
41 |       fast_flag=("--fast")
42 |       paper_flag=()
43 |       # To prevent race conditions, we use a different run name for each process id.
44 |       RUN_NAME="test-${TIMESTAMP}-$BASHPID"
45 |       shift
46 |       ;;
47 |     -T | --tmux)
48 |       tmux_flag=("--tmux")
49 |       shift
50 |       ;;
51 |     --)
52 |       shift
53 |       break
54 |       ;;
55 |     *)
56 |       echo "Unrecognized flag $1" >&2
57 |       exit 1
58 |       ;;
59 |   esac
60 | done
61 | 
62 | 
63 | set -ex  # Start echoing commands
64 | 
65 | echo "BC BENCHMARK"
66 | ${script_dir}/bc_benchmark.sh "${fast_flag[@]}" "${paper_flag[@]}" "${tmux_flag[@]}" --run_name "$RUN_NAME"
67 | 
68 | # Keep the command in an array so each flag stays one word (no re-splitting).
69 | IMIT_PLAIN=("${script_dir}/imit_benchmark.sh" "${fast_flag[@]}" "${tmux_flag[@]}" --run_name "$RUN_NAME")
70 | 
71 | echo "AIRL seals BENCHMARK"
72 | "${IMIT_PLAIN[@]}" --mvp_seals --airl
73 | echo "GAIL seals BENCHMARK"
74 | "${IMIT_PLAIN[@]}" --mvp_seals --gail
75 | 
76 | if [ ${#fast_flag[@]} -eq 0 ]; then
77 |   # Fast flag not specified.
78 |   echo "AIRL/GAIL HalfCheetah BENCHMARK"
79 |   "${IMIT_PLAIN[@]}" --cheetah
80 | fi
81 | 
82 | echo "DAGGER BENCHMARK"
83 | ${script_dir}/dagger_benchmark.sh "${fast_flag[@]}" "${paper_flag[@]}" "${tmux_flag[@]}" --run_name "$RUN_NAME"
84 | 
85 | result_dir=output/fast_table_result
86 | mkdir -p $result_dir
87 | for v in 0 1 2; do
88 |   base_out_path=$result_dir/fast_table_result_verbosity$v
89 | 
90 |   python -m imitation.scripts.analyze analyze_imitation with \
91 |     source_dir_str="output/sacred" table_verbosity=$v  \
92 |     csv_output_path=$base_out_path.csv \
93 |     tex_output_path=$base_out_path.tex \
94 |     run_name="$RUN_NAME"
95 | done
96 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Project-specific
  2 | .#*
  3 | *.html
  4 | output/
  5 | /data/
  6 | auto.pkl
  7 | nohup.out
  8 | 
  9 | # Byte-compiled / optimized / DLL files
 10 | __pycache__/
 11 | *.py[cod]
 12 | *$py.class
 13 | 
 14 | # C extensions
 15 | *.so
 16 | 
 17 | # Distribution / packaging
 18 | .Python
 19 | build/
 20 | develop-eggs/
 21 | dist/
 22 | downloads/
 23 | eggs/
 24 | .eggs/
 25 | lib/
 26 | lib64/
 27 | parts/
 28 | sdist/
 29 | var/
 30 | wheels/
 31 | pip-wheel-metadata/
 32 | share/python-wheels/
 33 | *.egg-info/
 34 | .installed.cfg
 35 | *.egg
 36 | MANIFEST
 37 | 
 38 | # PyInstaller
 39 | #  Usually these files are written by a python script from a template
 40 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 41 | *.manifest
 42 | *.spec
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | pip-delete-this-directory.txt
 47 | 
 48 | # Unit test / coverage reports
 49 | htmlcov/
 50 | .tox/
 51 | .nox/
 52 | .coverage
 53 | .coverage.*
 54 | .cache
 55 | nosetests.xml
 56 | coverage.xml
 57 | *.cover
 58 | *.py,cover
 59 | .hypothesis/
 60 | .pytest_cache/
 61 | 
 62 | # Translations
 63 | *.mo
 64 | *.pot
 65 | 
 66 | # Django stuff:
 67 | *.log
 68 | local_settings.py
 69 | db.sqlite3
 70 | db.sqlite3-journal
 71 | 
 72 | # Flask stuff:
 73 | instance/
 74 | .webassets-cache
 75 | 
 76 | # Scrapy stuff:
 77 | .scrapy
 78 | 
 79 | # Sphinx documentation
 80 | docs/_build/
 81 | 
 82 | # PyBuilder
 83 | target/
 84 | 
 85 | # Jupyter Notebook
 86 | .ipynb_checkpoints
 87 | 
 88 | # IPython
 89 | profile_default/
 90 | ipython_config.py
 91 | 
 92 | # pyenv
 93 | .python-version
 94 | 
 95 | # pipenv
 96 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 97 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 98 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 99 | #   install all needed dependencies.
100 | #Pipfile.lock
101 | 
102 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
103 | __pypackages__/
104 | 
105 | # Celery stuff
106 | celerybeat-schedule
107 | celerybeat.pid
108 | 
109 | # SageMath parsed files
110 | *.sage.py
111 | 
112 | # Environments
113 | .env
114 | .venv
115 | env/
116 | venv/
117 | ENV/
118 | env.bak/
119 | venv.bak/
120 | 
121 | # Spyder project settings
122 | .spyderproject
123 | .spyproject
124 | 
125 | # Rope project settings
126 | .ropeproject
127 | 
128 | # PyCharm project settings
129 | .idea
130 | 
131 | # mkdocs documentation
132 | /site
133 | 
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 | 
139 | # Pyre type checker
140 | .pyre/
141 | 
142 | # Pytype type checker
143 | .pytype/
144 | 
145 | # Mac OSX
146 | .DS_STORE
147 | 
148 | # Line profiler
149 | *.lprof
150 | 
151 | # yapf (code formatter)
152 | .style.yapf
153 | 
154 | 
155 | # Various testing / quickstart artifacts:
156 | 
157 | # Produced by transfer_learn_benchmark.sh
158 | /tests/testdata/reward_models
159 | # Produced by examples/quickstart.sh
160 | /quickstart/
161 | 
162 | .vscode/
163 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | repos:
 4 | # Linting
 5 | - repo: https://github.com/pre-commit/pre-commit-hooks
 6 |   rev: v4.4.0
 7 |   hooks:
 8 |   - id: check-ast
 9 |   - id: trailing-whitespace
10 |   - id: end-of-file-fixer
11 |     exclude_types: [jupyter]
12 |   - id: check-toml
13 |   - id: check-added-large-files
14 | - repo: https://github.com/psf/black
15 |   rev: 23.9.1
16 |   hooks:
17 |   - id: black
18 |   - id: black-jupyter
19 | - repo: https://github.com/PyCQA/isort
20 |   rev: 5.12.0
21 |   hooks:
22 |   - id: isort
23 | # Python static analysis
24 | - repo: https://github.com/pycqa/flake8
25 |   rev: '6.1.0'
26 |   hooks:
27 |   - id: flake8
28 |     additional_dependencies:
29 |       - darglint~=1.8.1
30 |       - flake8-blind-except==0.2.1
31 |       - flake8-builtins~=1.5.3
32 |       - flake8-commas~=2.1.0
33 |       - flake8-debugger~=4.1.2
34 |       - flake8-docstrings~=1.6.0
35 | # Shell static analysis
36 | - repo: https://github.com/koalaman/shellcheck-precommit
37 |   rev: v0.9.0
38 |   hooks:
39 |   - id: shellcheck
40 |   # precommit invokes shellcheck once per file. shellcheck complains if file
41 |   # includes another file not given on the command line. Ignore this, since
42 |   # they'll just get checked in a separate shellcheck invocation.
43 |     args: ["-e", "SC1091"]
44 | # Misc
45 | - repo: https://github.com/codespell-project/codespell
46 |   rev: v2.2.4
47 |   hooks:
48 |   - id: codespell
49 |     args: ["--skip=*.pyc,tests/testdata/*,*.ipynb,*.csv","--ignore-words-list=reacher,ith,iff"]
50 | - repo: https://github.com/syntaqx/git-hooks
51 |   rev: v0.0.18
52 |   hooks:
53 |   - id: circleci-config-validate
54 | # Hooks that run in local environment (not isolated venv) as they need
55 | # same dependencies as our package.
56 | - repo: local
57 |   hooks:
58 |   - id: check-notebooks
59 |     name: check-notebooks
60 |     entry: ./ci/clean_notebooks.py --check ./docs/tutorials/
61 |     language: script
62 |     types: [jupyter]
63 |     pass_filenames: false
64 |   # Run mypy directly from local repo rather than using mirror-mypy
65 |   # so that it uses installed dependencies. Adapted from:
66 |   # https://jaredkhan.com/blog/mypy-pre-commit
67 |   - id: mypy
68 |     name: mypy
69 |     language: system
70 |     types: [python]
71 |     entry: mypy --follow-imports=silent --show-error-codes
72 |     # use require_serial so that script
73 |     # is only called once per commit
74 |     require_serial: true
75 |     # Print the number of files as a sanity-check
76 |     verbose: true
77 |   - id: pytype
78 |     name: pytype
79 |     language: system
80 |     types: [python]
81 |     entry: "bash -c 'pytype --keep-going -j ${NUM_CPUS:-auto}'"
82 |     require_serial: true
83 |     verbose: true
84 |   - id: docs
85 |     name: docs
86 |     language: system
87 |     types_or: [python, rst]
88 |     entry: bash -c "cd docs/ && make clean && SKIP_DOCTEST=True NB_EXECUTION_MODE=off make html"
89 |     require_serial: true
90 |     verbose: true
91 |     pass_filenames: false
92 | 


--------------------------------------------------------------------------------
/ci/check_typeignore.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """Check for invalid "# type: ignore" comments.
 4 | 
 5 | This script checks that no files in our source code have a "# type: ignore" comment
 6 | without explicitly indicating the reason for the ignore. This is to ensure that we
 7 | don't accidentally ignore errors that we should be fixing.
 8 | """
 9 | import argparse
10 | import os
11 | import pathlib
12 | import re
13 | import sys
14 | from typing import List
15 | 
16 | # Regex to match a "# type: ignore" comment not followed by a reason.
17 | TYPE_IGNORE_COMMENT = re.compile(r"#\s*type:\s*ignore\s*(?![^\[]*\[)")
18 | 
19 | # Regex to match a "# type: ignore[<reason>]" comment.
20 | TYPE_IGNORE_REASON_COMMENT = re.compile(r"#\s*type:\s*ignore\[(?P<reason>.*)\]")
21 | 
22 | 
23 | class InvalidTypeIgnore(ValueError):
24 |     """Raised when a "# type: ignore" comment is bare or has an empty "[]" reason."""
25 | 
26 | 
27 | def check_file(file: pathlib.Path):
28 |     """Checks that the given file has no "# type: ignore" comments without a reason."""
29 |     with open(file, "r") as f:
30 |         for i, line in enumerate(f):
31 |             if TYPE_IGNORE_COMMENT.search(line):
32 |                 raise InvalidTypeIgnore(
33 |                     f"{file}:{i+1}: Found a '# type: ignore' comment without a reason.",
34 |                 )
35 | 
36 |             if search := TYPE_IGNORE_REASON_COMMENT.search(line):
37 |                 reason = search.group("reason")
38 |                 if reason == "":
39 |                     raise InvalidTypeIgnore(
40 |                         f"{file}:{i+1}: Found a '# type: ignore[]' "
41 |                         "comment without a reason.",
42 |                     )
43 | 
44 | 
45 | def check_files(files: List[pathlib.Path]):
46 |     """Checks that the given files have no type: ignore comments without a reason."""
47 |     for file in files:
48 |         if file == pathlib.Path(__file__):
49 |             continue
50 |         check_file(file)
51 | 
52 | 
53 | def get_files_to_check(root_dirs: List[pathlib.Path]) -> List[pathlib.Path]:
54 |     """Returns a list of files that should be checked for "# type: ignore" comments."""
55 |     # Get the list of files that should be checked.
56 |     files = []
57 |     for root_dir in root_dirs:
58 |         for root, _, filenames in os.walk(root_dir):
59 |             for filename in filenames:
60 |                 if filename.endswith(".py"):
61 |                     files.append(pathlib.Path(root) / filename)
62 | 
63 |     return files
64 | 
65 | 
66 | def parse_args():
67 |     """Parse command-line arguments."""
68 |     parser = argparse.ArgumentParser()
69 |     parser.add_argument(
70 |         "files",
71 |         nargs="+",
72 |         type=pathlib.Path,
73 |         help="List of files or paths to check for invalid '# type: ignore' comments.",
74 |     )
75 |     args = parser.parse_args()
76 |     return parser, args
77 | 
78 | 
79 | def main():
80 |     """Check for invalid "# type: ignore" comments."""
81 |     parser, args = parse_args()
82 |     file_list = get_files_to_check(args.files)
83 |     try:
84 |         check_files(file_list)
85 |     except InvalidTypeIgnore as e:
86 |         print(e)
87 |         sys.exit(1)
88 | 
89 | 
90 | if __name__ == "__main__":
91 |     main()
92 | 


--------------------------------------------------------------------------------
/examples/quickstart.py:
--------------------------------------------------------------------------------
  1 | """This is a simple example demonstrating how to clone the behavior of an expert.
  2 | 
  3 | Refer to the jupyter notebooks for more detailed examples of how to use the algorithms.
  4 | """
  5 | import numpy as np
  6 | from stable_baselines3 import PPO
  7 | from stable_baselines3.common.evaluation import evaluate_policy
  8 | from stable_baselines3.ppo import MlpPolicy
  9 | 
 10 | from imitation.algorithms import bc
 11 | from imitation.data import rollout
 12 | from imitation.data.wrappers import RolloutInfoWrapper
 13 | from imitation.policies.serialize import load_policy
 14 | from imitation.util.util import make_vec_env
 15 | 
 16 | rng = np.random.default_rng(0)
 17 | env = make_vec_env(
 18 |     "seals:seals/CartPole-v0",
 19 |     rng=rng,
 20 |     post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # for computing rollouts
 21 | )
 22 | 
 23 | 
24 | def train_expert():
25 |     """Trains a PPO expert on `env`; use `download_expert` for a competent one."""
26 |     print("Training a expert.")
27 |     expert = PPO(
28 |         policy=MlpPolicy,
29 |         env=env,
30 |         seed=0,
31 |         batch_size=64,
32 |         ent_coef=0.0,
33 |         learning_rate=0.0003,
34 |         n_epochs=10,
35 |         n_steps=64,
36 |     )
37 |     expert.learn(1_000)  # Note: change this to 100_000 to train a decent expert.
38 |     return expert
 39 | 
 40 | 
41 | def download_expert():
42 |     print("Downloading a pretrained expert.")
43 |     expert = load_policy(  # presumably fetched from the HuggingFace hub -- confirm
44 |         "ppo-huggingface",
45 |         organization="HumanCompatibleAI",
46 |         env_name="seals-CartPole-v0",
47 |         venv=env,  # the module-level environment created above
48 |     )
49 |     return expert
 50 | 
 51 | 
52 | def sample_expert_transitions():
53 |     """Rolls out the expert in `env` and returns the flattened trajectories."""
54 |     # expert = train_expert()  # uncomment to train your own expert
55 |     expert = download_expert()
56 |     print("Sampling expert transitions.")
57 |     rollouts = rollout.rollout(
58 |         expert,
59 |         env,
60 |         rollout.make_sample_until(min_timesteps=None, min_episodes=50),
61 |         rng=rng,
62 |     )
63 |     return rollout.flatten_trajectories(rollouts)
 64 | 
 65 | 
 66 | transitions = sample_expert_transitions()
 67 | bc_trainer = bc.BC(
 68 |     observation_space=env.observation_space,
 69 |     action_space=env.action_space,
 70 |     demonstrations=transitions,
 71 |     rng=rng,
 72 | )
 73 | 
 74 | evaluation_env = make_vec_env(
 75 |     "seals:seals/CartPole-v0",
 76 |     rng=rng,
 77 |     env_make_kwargs={"render_mode": "human"},  # for rendering
 78 | )
 79 | 
 80 | print("Evaluating the untrained policy.")
 81 | reward, _ = evaluate_policy(
 82 |     bc_trainer.policy,  # type: ignore[arg-type]
 83 |     evaluation_env,
 84 |     n_eval_episodes=3,
 85 |     render=True,  # comment out to speed up
 86 | )
 87 | print(f"Reward before training: {reward}")
 88 | 
 89 | print("Training a policy using Behavior Cloning")
 90 | bc_trainer.train(n_epochs=1)
 91 | 
 92 | print("Evaluating the trained policy.")
 93 | reward, _ = evaluate_policy(
 94 |     bc_trainer.policy,  # type: ignore[arg-type]
 95 |     evaluation_env,
 96 |     n_eval_episodes=3,
 97 |     render=True,  # comment out to speed up
 98 | )
 99 | print(f"Reward after training: {reward}")
100 | 


--------------------------------------------------------------------------------
/docs/algorithms/density.rst:
--------------------------------------------------------------------------------
 1 | .. _density docs:
 2 | 
 3 | =============================
 4 | Density-Based Reward Modeling
 5 | =============================
 6 | 
 7 | Density-based reward modeling is an inverse reinforcement learning (IRL) technique that assigns higher rewards
 8 | to states or state-action pairs that occur more frequently in an expert's demonstrations.
 9 | This variant utilizes `kernel density estimation <https://en.wikipedia.org/wiki/Kernel_density_estimation>`_
10 | to model the underlying distribution of expert demonstrations.
11 | It assigns rewards to states or state-action pairs based on their estimated log-likelihood
12 | under the distribution of expert demonstrations.
13 | 
14 | The key intuition behind this method is to incentivize the agent to take actions
15 | that resemble the expert's actions in similar states.
16 | 
17 | While this approach is relatively simple, it does have several drawbacks:
18 | 
19 | - It assumes that the expert demonstrations are representative of the expert's behavior, which may not always be true.
20 | - It does not provide an interpretable reward function.
21 | - The kernel density estimation is not well-suited for high-dimensional state-action spaces.
22 | 
23 | Example
24 | =======
25 | 
26 | Detailed example notebook: :doc:`../tutorials/7_train_density`
27 | 
28 | .. testcode::
29 |     :skipif: skip_doctests
30 | 
31 |     import pprint
32 |     import numpy as np
33 | 
34 |     from stable_baselines3 import PPO
35 |     from stable_baselines3.common.policies import ActorCriticPolicy
36 | 
37 |     from imitation.algorithms import density as db
38 |     from imitation.data import serialize
39 |     from imitation.util import util
40 | 
41 |     rng = np.random.default_rng(0)
42 | 
43 |     env = util.make_vec_env("Pendulum-v1", rng=rng, n_envs=2)
44 |     rollouts = serialize.load("../tests/testdata/expert_models/pendulum_0/rollouts/final.npz")
45 | 
46 |     imitation_trainer = PPO(
47 |         ActorCriticPolicy, env, learning_rate=3e-4, gamma=0.95, ent_coef=1e-4, n_steps=2048
48 |     )
49 |     density_trainer = db.DensityAlgorithm(
50 |         venv=env,
51 |         rng=rng,
52 |         demonstrations=rollouts,
53 |         rl_algo=imitation_trainer,
54 |         density_type=db.DensityType.STATE_ACTION_DENSITY,
55 |         is_stationary=True,
56 |         kernel="gaussian",
57 |         kernel_bandwidth=0.4,
58 |         standardise_inputs=True,
59 |     )
60 |     density_trainer.train()
61 | 
62 |     def print_stats(density_trainer, n_trajectories):
63 |         stats = density_trainer.test_policy(n_trajectories=n_trajectories)
64 |         print("True reward function stats:")
65 |         pprint.pprint(stats)
66 |         stats_im = density_trainer.test_policy(true_reward=False, n_trajectories=n_trajectories)
67 |         print("Imitation reward function stats:")
68 |         pprint.pprint(stats_im)
69 | 
70 |     print("Stats before training:")
71 |     print_stats(density_trainer, 1)
72 | 
73 |     density_trainer.train_policy(100)  # Train for 1_000_000 steps to approach expert performance.
74 | 
75 |     print("Stats after training:")
76 |     print_stats(density_trainer, 1)
77 | 
78 | .. testoutput::
79 |     :hide:
80 | 
81 |     ...
82 | 
83 | API
84 | ===
85 | .. autoclass:: imitation.algorithms.density.DensityAlgorithm
86 |     :members:
87 |     :inherited-members:
88 |     :noindex:
89 | 


--------------------------------------------------------------------------------
/tests/util/test_sacred_file_parsing.py:
--------------------------------------------------------------------------------
 1 | """Tests for imitation.util.sacred_file_parsing."""
 2 | import json
 3 | import pathlib
 4 | 
 5 | import imitation.util.sacred_file_parsing as sfp
 6 | 
 7 | 
 8 | def _make_sacred_run_dir(
 9 |     path: pathlib.Path,
10 |     algo: str,
11 |     env: str,
12 |     status: str = "COMPLETED",
13 | ):
14 |     path.mkdir(parents=True, exist_ok=True)
15 |     cfg_file = path / "config.json"
16 |     cfg_file.write_text(json.dumps(dict(environment=dict(gym_id=env))))
17 | 
18 |     run_file = path / "run.json"
19 |     run_file.write_text(json.dumps(dict(status=status, command=algo)))
20 | 
21 | 
22 | def test_load_single_run(tmp_path):
23 |     # GIVEN a single run directory directly under tmp_path
24 |     _make_sacred_run_dir(tmp_path / "run1", "ppo", "CartPole-v1")
25 | 
26 |     # WHEN searching tmp_path for sacred runs
27 |     runs = list(sfp.find_sacred_runs(tmp_path))
28 | 
29 |     # THEN exactly one (config, run) pair with the written contents is found
30 |     assert len(runs) == 1
31 |     assert runs[0][0]["environment"]["gym_id"] == "CartPole-v1"
32 |     assert runs[0][1]["command"] == "ppo"
33 | 
34 | 
35 | def test_load_multiple_runs_in_sub_folders(tmp_path):
36 |     # GIVEN one run directly under tmp_path and three more nested in subfolders
37 |     _make_sacred_run_dir(tmp_path / "run1", "ppo", "CartPole-v1")
38 |     _make_sacred_run_dir(tmp_path / "subfolder1" / "run2", "ppo", "CartPole-v1")
39 |     _make_sacred_run_dir(tmp_path / "subfolder1" / "run3", "ppo", "CartPole-v1")
40 |     _make_sacred_run_dir(tmp_path / "subfolder2" / "run4", "ppo", "CartPole-v1")
41 | 
42 |     # WHEN searching tmp_path for sacred runs
43 |     runs = list(sfp.find_sacred_runs(tmp_path))
44 | 
45 |     # THEN all four runs are found, each with the written config and command
46 |     assert len(runs) == 4
47 |     for conf, run in runs:
48 |         assert conf["environment"]["gym_id"] == "CartPole-v1"
49 |         assert run["command"] == "ppo"
50 | 
51 | 
52 | def test_loading_only_completed_runs(tmp_path):
53 |     # GIVEN two COMPLETED "ppo" runs plus one FAILED and one RUNNING run
54 |     _make_sacred_run_dir(tmp_path / "run1", "ppo", "CartPole-v1")
55 |     _make_sacred_run_dir(tmp_path / "run2", "airl", "CartPole-v1", status="FAILED")
56 |     _make_sacred_run_dir(tmp_path / "run3", "ppo", "CartPole-v1", status="COMPLETED")
57 |     _make_sacred_run_dir(tmp_path / "run4", "gail", "CartPole-v1", status="RUNNING")
58 | 
59 |     # WHEN searching with only_completed_runs=True
60 |     runs = list(sfp.find_sacred_runs(tmp_path, only_completed_runs=True))
61 | 
62 |     # THEN only the two COMPLETED runs (both "ppo") are returned
63 |     assert len(runs) == 2
64 |     for conf, run in runs:
65 |         assert conf["environment"]["gym_id"] == "CartPole-v1"
66 |         assert run["command"] == "ppo"
67 | 
68 | 
69 | def test_grouping_runs_by_algo_and_env(tmp_path):
70 |     # GIVEN seven runs spread over three algorithms and two environments
71 |     _make_sacred_run_dir(tmp_path / "run1", "ppo", "CartPole-v1")
72 |     _make_sacred_run_dir(tmp_path / "run2", "airl", "CartPole-v1")
73 |     _make_sacred_run_dir(tmp_path / "run3", "ppo", "CartPole-v1")
74 |     _make_sacred_run_dir(tmp_path / "run4", "gail", "CartPole-v1")
75 |     _make_sacred_run_dir(tmp_path / "run5", "ppo", "LunarLander-v2")
76 |     _make_sacred_run_dir(tmp_path / "run6", "airl", "LunarLander-v2")
77 |     _make_sacred_run_dir(tmp_path / "run7", "ppo", "LunarLander-v2")
78 | 
79 |     # WHEN grouping the runs found under tmp_path
80 |     runs_by_algo_and_env = sfp.group_runs_by_algo_and_env(tmp_path)
81 | 
82 |     # THEN the result is keyed by algorithm first, then by environment
83 |     assert set(runs_by_algo_and_env.keys()) == {"ppo", "airl", "gail"}
84 |     assert set(runs_by_algo_and_env["ppo"].keys()) == {"CartPole-v1", "LunarLander-v2"}
85 |     assert set(runs_by_algo_and_env["airl"].keys()) == {"CartPole-v1", "LunarLander-v2"}
86 |     assert set(runs_by_algo_and_env["gail"].keys()) == {"CartPole-v1"}
87 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/parallel.py:
--------------------------------------------------------------------------------
 1 | """Config files for parallel experiments.
 2 | 
 3 | Parallel experiments are intended to be defined in Python rather than
 4 | via CLI. For example, a user should add a new
 5 | `@parallel_ex.named_config` to define a new parallel experiment.
 6 | 
 7 | Adding custom named configs is necessary because the CLI interface can't add
 8 | search spaces to the config like `"seed": tune.choice([0, 1, 2, 3])`.
 9 | 
10 | For tuning hyperparameters of an algorithm on a given environment,
11 | check out the imitation/scripts/tuning.py script.
12 | """
13 | 
14 | import numpy as np
15 | import ray.tune as tune
16 | import sacred
17 | 
18 | from imitation.util.util import make_unique_timestamp
19 | 
20 | parallel_ex = sacred.Experiment("parallel")
21 | 
22 | 
@parallel_ex.config
def config():
    """Default configuration of the `parallel` experiment."""
    sacred_ex_name = "train_rl"  # The experiment to parallelize
    init_kwargs = dict()  # Keyword arguments to pass to ray.init()
    _uuid = make_unique_timestamp()
    run_name = "DEFAULT_{}".format(_uuid)  # CLI --name option. For analysis grouping.
    resources_per_trial = dict()  # Argument to `tune.run`
    base_named_configs = list()  # Background settings before search_space is applied
    base_config_updates = dict()  # Background settings before search_space is applied
    search_space = dict(
        named_configs=[],
        config_updates={},
    )  # `config` argument to `ray.tune.run(trainable, config)`

    num_samples = 1  # Number of samples per grid search configuration
    repeat = 1  # Number of times to repeat a sampled configuration
    experiment_checkpoint_path = ""  # Path to checkpoint of experiment
    tune_run_kwargs = dict()  # Additional kwargs to pass to `tune.run`
41 | 
42 | 
43 | # Debug named configs
44 | 
45 | 
@parallel_ex.named_config
def generate_test_data():
    """Used by tests/generate_test_data.sh to generate tests/testdata/gather_tb/.

    "tests/testdata/gather_tb/" should contain 2 Tensorboard run directories,
    one for each of the trials in the search space below.
    """
    sacred_ex_name = "train_rl"
    run_name = "TEST"
    repeat = 1
    search_space = dict(
        config_updates=dict(
            rl=dict(
                rl_kwargs=dict(
                    # Two candidate learning rates: 3e-4 scaled by 1/3 and by 1/2.
                    learning_rate=tune.choice([3e-4 * (1 / 3), 3e-4 * (1 / 2)]),
                ),
            ),
        ),
    )
    base_named_configs = [
        "cartpole",
        "environment.fast",
        "policy_evaluation.fast",
        "rl.fast",
        "fast",
    ]
    base_config_updates = dict(rollout_save_final=False)
77 | 
78 | 
@parallel_ex.named_config
def example_cartpole_rl():
    """Example parallel experiment: tune `train_rl` hyperparameters on CartPole.

    Samples the learning rate from three log-spaced values between 3e-6 and
    1e-1 and `nminibatches` from {16, 32, 64}; each sampled configuration is
    repeated twice with 4 CPUs per trial.

    NOTE(review): `nminibatches` may not be a keyword accepted by the RL
    algorithm that `train_rl` instantiates -- confirm before relying on it.
    """
    sacred_ex_name = "train_rl"
    run_name = "example-cartpole"
    repeat = 2
    search_space = {
        "config_updates": {
            "rl": {
                "rl_kwargs": {
                    # `np.geomspace` takes the endpoints themselves. The former
                    # `np.logspace(3e-6, 1e-1)` treated them as base-10 exponents
                    # and therefore produced rates between ~1.0 and ~1.26.
                    "learning_rate": tune.choice(np.geomspace(3e-6, 1e-1, num=3)),
                    "nminibatches": tune.choice([16, 32, 64]),
                },
            },
        },
    }
    base_named_configs = ["cartpole"]
    resources_per_trial = dict(cpu=4)
96 | 


--------------------------------------------------------------------------------
/experiments/transfer_learn_benchmark.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | # Train PPO experts using reward models from experiments/imit_benchmark.sh
  3 | 
  4 | set -e
  5 | 
  6 | source experiments/common.sh
  7 | 
  8 | SEEDS=(0 1 2)
  9 | CONFIG_CSV="experiments/imit_benchmark_config.csv"
 10 | REWARD_MODELS_DIR="data/reward_models"
 11 | # To prevent race conditions, we use a different log root for each process id.
 12 | LOG_ROOT="output/train_experts/${TIMESTAMP}-${BASHPID}"
 13 | RESULTS_FILE="results.txt"
 14 | ALGORITHM="gail"
 15 | NEED_TEST_FILES="false"
 16 | extra_configs=()
 17 | 
 18 | 
 19 | if ! TEMP=$($GNU_GETOPT -o fw -l fast,gail,airl,run_name:,log_root:,wandb -- "$@"); then
 20 |   exit 1
 21 | fi
 22 | eval set -- "$TEMP"
 23 | 
 24 | while true; do
 25 |   case "$1" in
 26 |     # Fast mode (debug)
 27 |     -f | --fast)
 28 |       CONFIG_CSV="tests/testdata/imit_benchmark_config.csv"
 29 |       REWARD_MODELS_DIR="tests/testdata/reward_models"
 30 |       NEED_TEST_FILES="true"
 31 |       SEEDS=(0)
 32 |       extra_configs=("${extra_configs[@]}" environment.fast rl.fast policy_evaluation.fast fast)
 33 |       shift
 34 |       ;;
 35 |     -w | --wandb)
 36 |       # activate wandb logging by adding 'wandb' format string to logging.log_format_strs
 37 |       extra_configs=("${extra_configs[@]}" "logging.wandb_logging")
 38 |       shift
 39 |       ;;
 40 |     --gail)
 41 |       ALGORITHM="gail"
 42 |       shift
 43 |       ;;
 44 |     --airl)
 45 |       ALGORITHM="airl"
 46 |       shift
 47 |       ;;
 48 |     --run_name)
 49 |       # Used by analysis scripts to filter runs later.
 50 |       extra_options=("${extra_options[@]}" --name "$2")
 51 |       shift 2
 52 |       ;;
 53 |     --log_root)
 54 |       LOG_ROOT="$2"
 55 |       shift 2
 56 |       ;;
 57 |     --)
 58 |       shift
 59 |       break
 60 |       ;;
 61 |     *)
 62 |       echo "Unrecognized argument $1"
 63 |       exit 1
 64 |       ;;
 65 |   esac
 66 | done
 67 | 
 68 | 
 69 | if [[ $NEED_TEST_FILES == "true" ]]; then
 70 |   # Generate quick reward models for test.
 71 |   # To prevent race conditions, we use a different save_dir for each process id.
 72 |   save_dir=tests/testdata/reward_models/${ALGORITHM}/${TIMESTAMP}-${BASHPID}
 73 | 
 74 |   # Wipe directories for writing later.
 75 |   if [[ -d ${save_dir} ]]; then
 76 |     rm -r "${save_dir}"
 77 |   fi
 78 |   mkdir -p "${save_dir}"
 79 | 
 80 |   experiments/imit_benchmark.sh -f --${ALGORITHM} --log_root "${save_dir}"
 81 | fi
 82 | 
 83 | 
 84 | echo "Writing logs in ${LOG_ROOT}"
 85 | parallel -j 25% --header : --results "${LOG_ROOT}/parallel/" --colsep , --progress \
 86 |   python -m imitation.scripts.train_rl \
 87 |   --capture=sys \
 88 |   "${extra_options[@]}" \
 89 |   with \
 90 |   '{env_config_name}' seed='{seed}' \
 91 |   logging.log_dir="${LOG_ROOT}/${ALGORITHM}/{env_config_name}_{seed}/n_expert_demos_{n_expert_demos}" \
 92 |   reward_type="RewardNet_unshaped" \
 93 |   reward_path="${REWARD_MODELS_DIR}/${ALGORITHM}/${TIMESTAMP}-${BASHPID}/{env_config_name}_0/n_expert_demos_{n_expert_demos}/checkpoints/final/reward_test.pt" \
 94 |   "${extra_configs[@]}" \
 95 |   :::: ${CONFIG_CSV} \
 96 |   ::: seed "${SEEDS[@]}"
 97 | 
 98 | pushd "$LOG_ROOT"
 99 | 
100 | # Display and save mean episode reward to ${RESULTS_FILE}.
101 | find . -name stdout -print0 | sort -z | xargs -0 tail -n 15 | grep -E '(==|Result)' | tee ${RESULTS_FILE}
102 | 
103 | popd
104 | 


--------------------------------------------------------------------------------
/experiments/dagger_benchmark.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | set -e
  3 | source experiments/common.sh
  4 | 
  5 | ENVS=(seals_cartpole)
  6 | SEEDS=(0 1 2 3 4)
  7 | # To prevent race conditions, we use a different log root for each process id.
  8 | LOG_ROOT="output/dagger_benchmark/${TIMESTAMP}-${BASHPID}"
  9 | extra_configs=()
 10 | extra_options=()
 11 | extra_parallel_options=()
 12 | 
 13 | if ! TEMP=$($GNU_GETOPT -o fTw -l fast,wandb,paper,tmux,pdb,echo,run_name:,log_root:,file_storage: -- "$@"); then
 14 |   exit 1
 15 | fi
 16 | eval set -- "$TEMP"
 17 | 
 18 | while true; do
 19 |   case "$1" in
 20 |     # Fast mode (debug)
 21 |     -f | --fast)
 22 |       SEEDS=(0)
 23 |       extra_configs=("${extra_configs[@]}" environment.fast demonstrations.fast policy_evaluation.fast fast)
 24 |       shift
 25 |       ;;
 26 |     --paper)  # Table benchmark settings
 27 |       ENVS=(seals_cartpole seals_mountain_car seals_half_cheetah)
 28 |       shift
 29 |       ;;
 30 |     -w | --wandb)
 31 |       # activate wandb logging by adding 'wandb' format string to logging.log_format_strs
 32 |       extra_configs=("${extra_configs[@]}" "logging.wandb_logging")
 33 |       shift
 34 |       ;;
 35 |     -T | --tmux)
 36 |       extra_parallel_options=("${extra_parallel_options[@]}" --tmux)
 37 |       shift
 38 |       ;;
 39 |     --run_name)
 40 |       extra_options=("${extra_options[@]}" --name "$2")
 41 |       shift 2
 42 |       ;;
 43 |     --log_root)
 44 |       LOG_ROOT="$2"
 45 |       shift 2
 46 |       ;;
 47 |     --file_storage)
 48 |       # Used by `tests/generate_test_data.sh` to save Sacred logs in tests/testdata.
 49 |       extra_options=("${extra_options[@]}" --file_storage "$2")
 50 |       shift 2
 51 |       ;;
 52 |     --pdb)
 53 |       # shellcheck disable=SC2016
 54 |       echo 'NOTE: Interact with PDB session via tmux. If an error occurs, `parallel` '
 55 |       echo 'will hang and wait for user input in tmux session.'
 56 |       # Needed for terminal output.
 57 |       extra_parallel_options=("${extra_parallel_options[@]}" --tmux)
 58 |       extra_options=("${extra_options[@]}" --pdb)
 59 |       shift
 60 |       ;;
 61 |     --echo)
 62 |       extra_parallel_options=("${extra_parallel_options[@]}" echo)
 63 |       shift
 64 |       ;;
 65 |     --)
 66 |       shift
 67 |       break
 68 |       ;;
 69 |     *)
 70 |       echo "Unrecognized flag $1" >&2
 71 |       exit 1
 72 |       ;;
 73 |   esac
 74 | done
 75 | 
 76 | mkdir -p "${LOG_ROOT}"
 77 | echo "Logging to: ${LOG_ROOT}"
 78 | 
 79 | parallel -j 25% --header : --results "${LOG_ROOT}/parallel/" --colsep , --progress \
 80 |   "${extra_parallel_options[@]}" \
 81 |   python -m imitation.scripts.train_imitation \
 82 |   --capture=sys \
 83 |   "${extra_options[@]}" \
 84 |   dagger \
 85 |   with \
 86 |   '{env_config_name}' \
 87 |   logging.log_dir="${LOG_ROOT}/{env_config_name}_{seed}" \
 88 |   dagger.expert_policy_type='ppo' \
 89 |   seed='{seed}' \
 90 |   "${extra_configs[@]}" \
 91 |   ::: env_config_name "${ENVS[@]}" \
 92 |   ::: seed "${SEEDS[@]}"
 93 | 
 94 | # Directory path is really long. Enter the directory to shorten results output,
 95 | # which includes directory of each stdout file.
 96 | pushd "${LOG_ROOT}/parallel"
 97 | find . -name stderr -print0 | sort -z | xargs -0 tail -n 15 | grep -E '==|Result'
 98 | popd
 99 | 
100 | # shellcheck disable=SC2016
101 | echo 'Generate results table using `python -m imitation.scripts.analyze`'
102 | 


--------------------------------------------------------------------------------
/src/imitation/util/video_wrapper.py:
--------------------------------------------------------------------------------
 1 | """Wrapper to record rendered video frames from an environment."""
 2 | 
 3 | import pathlib
 4 | from typing import Any, Dict, Optional, SupportsFloat, Tuple
 5 | 
 6 | import gymnasium as gym
 7 | from gymnasium.core import WrapperActType, WrapperObsType
 8 | from gymnasium.wrappers.monitoring import video_recorder
 9 | 
10 | 
class VideoWrapper(gym.Wrapper):
    """Environment wrapper that records a video frame after every `step` call."""

    episode_id: int
    video_recorder: Optional[video_recorder.VideoRecorder]
    single_video: bool
    directory: pathlib.Path

    def __init__(
        self,
        env: gym.Env,
        directory: pathlib.Path,
        single_video: bool = True,
    ):
        """Builds a VideoWrapper.

        Args:
            env: the wrapped environment.
            directory: the output directory; created (with parents) if missing.
            single_video: if True, all episodes are concatenated into a single
                video file. If False, every episode gets its own file, which
                helps when searching for one particular episode (for example
                via its metadata).
        """
        super().__init__(env)
        self.single_video = single_video
        self.video_recorder = None
        self.episode_id = 0

        self.directory = directory
        self.directory.mkdir(parents=True, exist_ok=True)

    def _reset_video_recorder(self) -> None:
        """Ensures a video recorder exists for the upcoming episode.

        Called by `reset`. An existing recorder is kept when
        `self.single_video` is True; otherwise it is closed and replaced so that
        each episode is written to its own file.
        """
        wants_fresh_file = not self.single_video
        if self.video_recorder is not None and wants_fresh_file:
            # One video per episode: finish the current file first.
            self.video_recorder.close()
            self.video_recorder = None

        if self.video_recorder is None:
            # The file name embeds the zero-padded id of the episode being started.
            video_name = "video.{:06}".format(self.episode_id)
            self.video_recorder = video_recorder.VideoRecorder(
                env=self.env,
                base_path=str(self.directory / video_name),
                metadata={"episode_id": self.episode_id},
            )

    def reset(
        self,
        *,
        seed: Optional[int] = None,
        options: Optional[Dict[str, Any]] = None,
    ) -> Tuple[WrapperObsType, Dict[str, Any]]:
        """Prepares the recorder, bumps the episode counter and resets the env."""
        self._reset_video_recorder()
        self.episode_id += 1
        return super().reset(seed=seed, options=options)

    def step(
        self,
        action: WrapperActType,
    ) -> Tuple[WrapperObsType, SupportsFloat, bool, bool, Dict[str, Any]]:
        """Steps the wrapped environment, then captures one video frame."""
        step_result = super().step(action)
        # A recorder only exists once `reset` has been called.
        assert self.video_recorder is not None
        self.video_recorder.capture_frame()
        return step_result

    def close(self) -> None:
        """Closes the active recorder (if any) and then the wrapped environment."""
        if self.video_recorder is not None:
            self.video_recorder.close()
            self.video_recorder = None
        super().close()
89 | 


--------------------------------------------------------------------------------
/tests/algorithms/conftest.py:
--------------------------------------------------------------------------------
  1 | """Fixtures common across algorithm tests."""
  2 | from typing import Sequence
  3 | 
  4 | import pytest
  5 | from stable_baselines3.common import envs
  6 | from stable_baselines3.common.policies import BasePolicy
  7 | from stable_baselines3.common.vec_env import DummyVecEnv, VecEnv
  8 | 
  9 | from imitation.algorithms import bc
 10 | from imitation.data.types import TrajectoryWithRew
 11 | from imitation.data.wrappers import RolloutInfoWrapper
 12 | from imitation.policies import serialize
 13 | from imitation.testing.expert_trajectories import (
 14 |     lazy_generate_expert_trajectories,
 15 |     make_expert_transition_loader,
 16 | )
 17 | from imitation.util import util
 18 | 
 19 | CARTPOLE_ENV_NAME = "seals/CartPole-v0"
 20 | 
 21 | 
 22 | @pytest.fixture
 23 | def cartpole_expert_policy(cartpole_venv: VecEnv) -> BasePolicy:
 24 |     return serialize.load_policy(
 25 |         "ppo-huggingface",
 26 |         cartpole_venv,
 27 |         env_name=CARTPOLE_ENV_NAME,
 28 |     )
 29 | 
 30 | 
 31 | @pytest.fixture
 32 | def cartpole_expert_trajectories(
 33 |     cartpole_expert_policy,
 34 |     cartpole_venv,
 35 |     pytestconfig,
 36 |     rng,
 37 | ) -> Sequence[TrajectoryWithRew]:
 38 |     return lazy_generate_expert_trajectories(
 39 |         pytestconfig.cache.makedir("experts"),
 40 |         CARTPOLE_ENV_NAME,
 41 |         60,
 42 |         rng,
 43 |     )
 44 | 
 45 | 
 46 | PENDULUM_ENV_NAME = "Pendulum-v1"
 47 | 
 48 | 
 49 | @pytest.fixture
 50 | def cartpole_bc_trainer(
 51 |     pytestconfig,
 52 |     cartpole_venv,
 53 |     cartpole_expert_trajectories,
 54 |     rng,
 55 | ):
 56 |     return bc.BC(
 57 |         observation_space=cartpole_venv.observation_space,
 58 |         action_space=cartpole_venv.action_space,
 59 |         batch_size=50,
 60 |         demonstrations=make_expert_transition_loader(
 61 |             cache_dir=pytestconfig.cache.makedir("experts"),
 62 |             batch_size=50,
 63 |             expert_data_type="transitions",
 64 |             env_name="seals/CartPole-v0",
 65 |             rng=rng,
 66 |             num_trajectories=60,
 67 |         ),
 68 |         custom_logger=None,
 69 |         rng=rng,
 70 |     )
 71 | 
 72 | 
 73 | @pytest.fixture
 74 | def pendulum_expert_trajectories(
 75 |     pytestconfig,
 76 |     rng,
 77 | ) -> Sequence[TrajectoryWithRew]:
 78 |     return lazy_generate_expert_trajectories(
 79 |         pytestconfig.cache.makedir("experts"),
 80 |         PENDULUM_ENV_NAME,
 81 |         60,
 82 |         rng,
 83 |     )
 84 | 
 85 | 
 86 | @pytest.fixture
 87 | def pendulum_expert_policy(pendulum_venv) -> BasePolicy:
 88 |     return serialize.load_policy(
 89 |         "ppo-huggingface",
 90 |         pendulum_venv,
 91 |         env_name=PENDULUM_ENV_NAME,
 92 |     )
 93 | 
 94 | 
 95 | @pytest.fixture
 96 | def pendulum_venv(rng) -> VecEnv:
 97 |     return util.make_vec_env(
 98 |         PENDULUM_ENV_NAME,
 99 |         n_envs=8,
100 |         post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],
101 |         rng=rng,
102 |     )
103 | 
104 | 
@pytest.fixture
def pendulum_single_venv(rng) -> VecEnv:
    """Pendulum VecEnv with 1 environment and a RolloutInfoWrapper post-wrapper."""

    def add_rollout_info(env, _):
        return RolloutInfoWrapper(env)

    return util.make_vec_env(
        PENDULUM_ENV_NAME,
        n_envs=1,
        post_wrappers=[add_rollout_info],
        rng=rng,
    )
113 | 
114 | 
@pytest.fixture
def multi_obs_venv() -> VecEnv:
    """DummyVecEnv holding two `SimpleMultiObsEnv` instances with rollout info."""

    def make_env():
        return RolloutInfoWrapper(envs.SimpleMultiObsEnv(channel_last=False))

    env_fns = [make_env] * 2
    return DummyVecEnv(env_fns)
122 | 


--------------------------------------------------------------------------------
/src/imitation/util/registry.py:
--------------------------------------------------------------------------------
  1 | """Registry mapping IDs to objects, such as environments or policy loaders."""
  2 | 
  3 | import functools
  4 | import importlib
  5 | from typing import Callable, Generic, Iterable, Optional, TypeVar
  6 | 
  7 | import gymnasium as gym
  8 | from stable_baselines3.common.vec_env import VecEnv
  9 | 
 10 | T = TypeVar("T")
 11 | LoaderFn = Callable[..., T]
 12 | """The type stored in Registry is commonly an instance of LoaderFn."""
 13 | 
 14 | 
 15 | def load_attr(name):
 16 |     """Load an attribute in format path.to.module:attribute."""
 17 |     module_name, attr_name = name.split(":")
 18 |     module = importlib.import_module(module_name)
 19 |     attr = getattr(module, attr_name)
 20 |     return attr
 21 | 
 22 | 
 23 | class Registry(Generic[T]):
 24 |     """A registry mapping IDs to type T objects, with support for lazy loading.
 25 | 
 26 |     The registry allows for insertion and retrieval. Modification of existing
 27 |     elements is not allowed.
 28 | 
 29 |     If the registered item is a string, it is assumed to be a path to an attribute
 30 |     in the form path.to.module:attribute. In this case, the module is loaded
 31 |     only if and when the registered item is retrieved.
 32 | 
 33 |     This is helpful both to reduce overhead from importing unused modules,
 34 |     and when some modules may have additional dependencies that are not installed
 35 |     in all deployments.
 36 | 
 37 |     Note: This is a similar idea to gym.EnvRegistry.
 38 |     """
 39 | 
 40 |     def __init__(self):
 41 |         """Builds empty Registry."""
 42 |         self._values = {}
 43 |         self._indirect = {}
 44 | 
 45 |     def get(self, key: str) -> T:
 46 |         if key not in self._values and key not in self._indirect:
 47 |             raise KeyError(f"Key '{key}' is not registered.")
 48 | 
 49 |         if key not in self._values:
 50 |             self._values[key] = load_attr(self._indirect[key])
 51 |         return self._values[key]
 52 | 
 53 |     def keys(self) -> Iterable[str]:
 54 |         return set(self._values.keys()).union(self._indirect.keys())
 55 | 
 56 |     def register(
 57 |         self,
 58 |         key: str,
 59 |         *,
 60 |         value: Optional[T] = None,
 61 |         indirect: Optional[str] = None,
 62 |     ):
 63 |         if key in self._values or key in self._indirect:
 64 |             raise KeyError(f"Duplicate registration for '{key}'")
 65 | 
 66 |         provided_args = sum([value is not None, indirect is not None])
 67 |         if provided_args != 1:
 68 |             raise ValueError(
 69 |                 "Must provide exactly one of 'value' and 'indirect',"
 70 |                 f"{provided_args} have been provided.",
 71 |             )
 72 | 
 73 |         if value is not None:
 74 |             self._values[key] = value
 75 |         else:
 76 |             self._indirect[key] = indirect
 77 | 
 78 | 
 79 | def build_loader_fn_require_space(
 80 |     fn: Callable[[gym.Space, gym.Space], T],
 81 |     **kwargs,
 82 | ) -> LoaderFn:
 83 |     """Converts a factory taking observation and action space into a LoaderFn."""
 84 | 
 85 |     @functools.wraps(fn)
 86 |     def wrapper(venv: VecEnv) -> T:
 87 |         return fn(venv.observation_space, venv.action_space, **kwargs)
 88 | 
 89 |     return wrapper
 90 | 
 91 | 
 92 | def build_loader_fn_require_env(fn: Callable[[VecEnv], T], **kwargs) -> LoaderFn:
 93 |     """Converts a factory taking an environment into a LoaderFn."""
 94 | 
 95 |     @functools.wraps(fn)
 96 |     def wrapper(venv: VecEnv) -> T:
 97 |         return fn(venv, **kwargs)
 98 | 
 99 |     return wrapper
100 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/train_imitation.py:
--------------------------------------------------------------------------------
  1 | """Configuration settings for train_imitation, covering BC, DAgger and SQIL."""
  2 | 
  3 | import pathlib
  4 | 
  5 | import sacred
  6 | 
  7 | from imitation.scripts.ingredients import bc
  8 | from imitation.scripts.ingredients import demonstrations as demos_common
  9 | from imitation.scripts.ingredients import environment, expert
 10 | from imitation.scripts.ingredients import logging as logging_ingredient
 11 | from imitation.scripts.ingredients import policy_evaluation, sqil
 12 | 
# Sacred experiment for imitation training; the config scopes and named configs
# defined below in this module are attached to it.
train_imitation_ex = sacred.Experiment(
    "train_imitation",
    ingredients=[
        logging_ingredient.logging_ingredient,
        demos_common.demonstrations_ingredient,
        expert.expert_ingredient,
        environment.environment_ingredient,
        policy_evaluation.policy_evaluation_ingredient,
        bc.bc_ingredient,
        sqil.sqil_ingredient,
    ],
)
 25 | 
 26 | 
 27 | @train_imitation_ex.config
 28 | def config():
 29 |     dagger = dict(
 30 |         use_offline_rollouts=False,  # warm-start policy with BC from offline demos
 31 |         total_timesteps=1e5,
 32 |         beta_schedule=None,
 33 |     )
 34 | 
 35 | 
 36 | @train_imitation_ex.named_config
 37 | def mountain_car():
 38 |     environment = dict(gym_id="MountainCar-v0")
 39 |     bc = dict(l2_weight=0.0)
 40 |     dagger = dict(total_timesteps=20000)
 41 | 
 42 | 
 43 | @train_imitation_ex.named_config
 44 | def seals_mountain_car():
 45 |     environment = dict(gym_id="seals/MountainCar-v0")
 46 |     bc = dict(l2_weight=0.0)
 47 |     dagger = dict(total_timesteps=20000)
 48 | 
 49 | 
 50 | @train_imitation_ex.named_config
 51 | def cartpole():
 52 |     environment = dict(gym_id="CartPole-v1")
 53 |     dagger = dict(total_timesteps=20000)
 54 | 
 55 | 
 56 | @train_imitation_ex.named_config
 57 | def seals_cartpole():
 58 |     environment = dict(gym_id="seals/CartPole-v0")
 59 |     dagger = dict(total_timesteps=20000)
 60 | 
 61 | 
 62 | @train_imitation_ex.named_config
 63 | def pendulum():
 64 |     environment = dict(gym_id="Pendulum-v1")
 65 | 
 66 | 
 67 | @train_imitation_ex.named_config
 68 | def ant():
 69 |     environment = dict(gym_id="Ant-v2")
 70 | 
 71 | 
 72 | @train_imitation_ex.named_config
 73 | def half_cheetah():
 74 |     environment = dict(gym_id="HalfCheetah-v2")
 75 |     bc = dict(l2_weight=0.0)
 76 |     dagger = dict(total_timesteps=60000)
 77 | 
 78 | 
 79 | @train_imitation_ex.named_config
 80 | def humanoid():
 81 |     environment = dict(gym_id="Humanoid-v2")
 82 | 
 83 | 
 84 | @train_imitation_ex.named_config
 85 | def seals_humanoid():
 86 |     environment = dict(gym_id="seals/Humanoid-v0")
 87 | 
 88 | 
 89 | @train_imitation_ex.named_config
 90 | def fast():
 91 |     dagger = dict(total_timesteps=50)
 92 |     bc = dict(train_kwargs=dict(n_batches=50))
 93 |     sqil = dict(total_timesteps=50)
 94 | 
 95 | 
 96 | hyperparam_dir = pathlib.Path(__file__).absolute().parent / "tuned_hps"
 97 | tuned_alg_envs = [
 98 |     "bc_seals_ant",
 99 |     "bc_seals_half_cheetah",
100 |     "bc_seals_hopper",
101 |     "bc_seals_swimmer",
102 |     "bc_seals_walker",
103 |     "dagger_seals_ant",
104 |     "dagger_seals_half_cheetah",
105 |     "dagger_seals_hopper",
106 |     "dagger_seals_swimmer",
107 |     "dagger_seals_walker",
108 | ]
109 | 
110 | for tuned_alg_env in tuned_alg_envs:
111 |     config_file = hyperparam_dir / f"{tuned_alg_env}_best_hp_eval.json"
112 |     assert config_file.is_file(), f"{config_file} does not exist"
113 |     train_imitation_ex.add_named_config(tuned_alg_env, str(config_file))
114 | 


--------------------------------------------------------------------------------
/src/imitation/scripts/config/eval_policy.py:
--------------------------------------------------------------------------------
  1 | """Configuration settings for eval_policy, evaluating pre-trained policies."""
  2 | 
  3 | import sacred
  4 | 
  5 | from imitation.scripts.ingredients import environment, expert
  6 | from imitation.scripts.ingredients import logging as logging_ingredient
  7 | 
# Sacred experiment for policy evaluation; the config scopes and named configs
# defined below in this module are attached to it.
eval_policy_ex = sacred.Experiment(
    "eval_policy",
    ingredients=[
        logging_ingredient.logging_ingredient,
        environment.environment_ingredient,
        expert.expert_ingredient,
    ],
)
 16 | 
 17 | 
 18 | @eval_policy_ex.config
 19 | def replay_defaults():
 20 |     eval_n_timesteps = int(1e4)  # Min timesteps to evaluate, optional.
 21 |     eval_n_episodes = None  # Num episodes to evaluate, optional.
 22 | 
 23 |     videos = False  # save video files
 24 |     video_kwargs = {}  # arguments to VideoWrapper
 25 |     render = False  # render to screen
 26 |     render_fps = 60  # -1 to render at full speed
 27 | 
 28 |     reward_type = None  # Optional: override with reward of this type
 29 |     reward_path = None  # Path of serialized reward to load
 30 | 
 31 |     rollout_save_path = None  # where to save rollouts to -- if None, do not save
 32 | 
 33 |     explore_kwargs = (
 34 |         None  # kwargs to feed to ExplorationWrapper -- if None, do not wrap
 35 |     )
 36 | 
 37 | 
 38 | @eval_policy_ex.named_config
 39 | def explore_eps_greedy():
 40 |     explore_kwargs = dict(switch_prob=1.0, random_prob=0.1)
 41 | 
 42 | 
 43 | @eval_policy_ex.named_config
 44 | def render():
 45 |     environment = dict(num_vec=1, parallel=False)
 46 |     render = True
 47 | 
 48 | 
 49 | @eval_policy_ex.named_config
 50 | def acrobot():
 51 |     environment = dict(gym_id="Acrobot-v1")
 52 | 
 53 | 
 54 | @eval_policy_ex.named_config
 55 | def ant():
 56 |     environment = dict(gym_id="Ant-v2")
 57 | 
 58 | 
 59 | @eval_policy_ex.named_config
 60 | def cartpole():
 61 |     environment = dict(gym_id="CartPole-v1")
 62 | 
 63 | 
 64 | @eval_policy_ex.named_config
 65 | def seals_cartpole():
 66 |     environment = dict(gym_id="seals/CartPole-v0")
 67 | 
 68 | 
 69 | @eval_policy_ex.named_config
 70 | def half_cheetah():
 71 |     environment = dict(gym_id="HalfCheetah-v2")
 72 | 
 73 | 
 74 | @eval_policy_ex.named_config
 75 | def seals_half_cheetah():
 76 |     environment = dict(gym_id="seals/HalfCheetah-v0")
 77 | 
 78 | 
 79 | @eval_policy_ex.named_config
 80 | def seals_hopper():
 81 |     environment = dict(gym_id="seals/Hopper-v0")
 82 | 
 83 | 
 84 | @eval_policy_ex.named_config
 85 | def seals_humanoid():
 86 |     environment = dict(gym_id="seals/Humanoid-v0")
 87 | 
 88 | 
 89 | @eval_policy_ex.named_config
 90 | def mountain_car():
 91 |     environment = dict(gym_id="MountainCar-v0")
 92 | 
 93 | 
 94 | @eval_policy_ex.named_config
 95 | def seals_mountain_car():
 96 |     environment = dict(gym_id="seals/MountainCar-v0")
 97 | 
 98 | 
 99 | @eval_policy_ex.named_config
100 | def pendulum():
101 |     environment = dict(gym_id="Pendulum-v1")
102 | 
103 | 
@eval_policy_ex.named_config
def reacher():
    """Selects the Reacher-v2 environment."""
    environment = {"gym_id": "Reacher-v2"}
107 | 
108 | 
@eval_policy_ex.named_config
def seals_ant():
    """Selects the seals/Ant-v0 environment."""
    environment = {"gym_id": "seals/Ant-v0"}
112 | 
113 | 
@eval_policy_ex.named_config
def seals_swimmer():
    """Selects the seals/Swimmer-v0 environment."""
    environment = {"gym_id": "seals/Swimmer-v0"}
117 | 
118 | 
@eval_policy_ex.named_config
def seals_walker():
    """Selects the seals/Walker2d-v0 environment."""
    environment = {"gym_id": "seals/Walker2d-v0"}
122 | 
123 | 
@eval_policy_ex.named_config
def fast():
    """Smallest evaluation: one rendered timestep on a single seals/CartPole-v0."""
    environment = {"gym_id": "seals/CartPole-v0", "num_vec": 1, "parallel": False}
    render = True
    eval_n_timesteps = 1
    eval_n_episodes = None
130 | 


--------------------------------------------------------------------------------
/docs/algorithms/gail.rst:
--------------------------------------------------------------------------------
  1 | .. _gail docs:
  2 | 
  3 | ================================================
  4 | Generative Adversarial Imitation Learning (GAIL)
  5 | ================================================
  6 | 
  7 | `GAIL <https://arxiv.org/abs/1606.03476>`_ learns a policy by simultaneously training it
  8 | with a discriminator that aims to distinguish expert trajectories from
  9 | trajectories generated by the learned policy.
 10 | 
 11 | .. note::
 12 |     GAIL paper: `Generative Adversarial Imitation Learning <https://arxiv.org/abs/1606.03476>`_
 13 | 
 14 | Example
 15 | =======
 16 | 
 17 | Detailed example notebook: :doc:`../tutorials/3_train_gail`
 18 | 
 19 | .. testcode::
 20 |     :skipif: skip_doctests
 21 | 
 22 |     import numpy as np
 23 |     import gymnasium as gym
 24 |     from stable_baselines3 import PPO
 25 |     from stable_baselines3.common.evaluation import evaluate_policy
 26 |     from stable_baselines3.ppo import MlpPolicy
 27 | 
 28 |     from imitation.algorithms.adversarial.gail import GAIL
 29 |     from imitation.data import rollout
 30 |     from imitation.data.wrappers import RolloutInfoWrapper
 31 |     from imitation.policies.serialize import load_policy
 32 |     from imitation.rewards.reward_nets import BasicRewardNet
 33 |     from imitation.util.networks import RunningNorm
 34 |     from imitation.util.util import make_vec_env
 35 | 
 36 |     SEED = 42
 37 | 
 38 |     env = make_vec_env(
 39 |         "seals:seals/CartPole-v0",
 40 |         rng=np.random.default_rng(SEED),
 41 |         n_envs=8,
 42 |         post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # to compute rollouts
 43 |     )
 44 |     expert = load_policy(
 45 |         "ppo-huggingface",
 46 |         organization="HumanCompatibleAI",
 47 |         env_name="seals-CartPole-v0",
 48 |         venv=env,
 49 |     )
 50 | 
 51 |     rollouts = rollout.rollout(
 52 |         expert,
 53 |         env,
 54 |         rollout.make_sample_until(min_timesteps=None, min_episodes=60),
 55 |         rng=np.random.default_rng(SEED),
 56 |     )
 57 | 
 58 |     learner = PPO(
 59 |         env=env,
 60 |         policy=MlpPolicy,
 61 |         batch_size=64,
 62 |         ent_coef=0.0,
 63 |         learning_rate=0.0004,
 64 |         gamma=0.95,
 65 |         n_epochs=5,
 66 |         seed=SEED,
 67 |     )
 68 |     reward_net = BasicRewardNet(
 69 |         observation_space=env.observation_space,
 70 |         action_space=env.action_space,
 71 |         normalize_input_layer=RunningNorm,
 72 |     )
 73 |     gail_trainer = GAIL(
 74 |         demonstrations=rollouts,
 75 |         demo_batch_size=1024,
 76 |         gen_replay_buffer_capacity=512,
 77 |         n_disc_updates_per_round=8,
 78 |         venv=env,
 79 |         gen_algo=learner,
 80 |         reward_net=reward_net,
 81 |     )
 82 | 
 83 |     # evaluate the learner before training
 84 |     env.seed(SEED)
 85 |     learner_rewards_before_training, _ = evaluate_policy(
 86 |         learner, env, 100, return_episode_rewards=True,
 87 |     )
 88 | 
 89 |     # train the learner and evaluate again
 90 |     gail_trainer.train(20000)  # Train for 800_000 steps to match expert.
 91 |     env.seed(SEED)
 92 |     learner_rewards_after_training, _ = evaluate_policy(
 93 |         learner, env, 100, return_episode_rewards=True,
 94 |     )
 95 | 
 96 |     print("mean reward after training:", np.mean(learner_rewards_after_training))
 97 |     print("mean reward before training:", np.mean(learner_rewards_before_training))
 98 | 
 99 | .. testoutput::
100 |     :hide:
101 | 
102 |     ...
103 | 
104 | API
105 | ===
106 | .. autoclass:: imitation.algorithms.adversarial.gail.GAIL
107 |     :members:
108 |     :inherited-members:
109 |     :noindex:
110 | 
111 | .. autoclass:: imitation.algorithms.adversarial.common.AdversarialTrainer
112 |     :members:
113 |     :inherited-members:
114 |     :noindex:
115 | 


--------------------------------------------------------------------------------
/src/imitation/data/serialize.py:
--------------------------------------------------------------------------------
 1 | """Serialization utilities for trajectories."""
 2 | import logging
 3 | import os
 4 | import warnings
 5 | from typing import Mapping, Sequence, cast
 6 | 
 7 | import datasets
 8 | import numpy as np
 9 | 
10 | from imitation.data import huggingface_utils
11 | from imitation.data.types import AnyPath, Trajectory, TrajectoryWithRew
12 | from imitation.util import util
13 | 
14 | 
15 | def save(path: AnyPath, trajectories: Sequence[Trajectory]) -> None:
16 |     """Save a sequence of Trajectories to disk using HuggingFace's datasets library.
17 | 
18 |     Args:
19 |         path: Trajectories are saved to this path.
20 |         trajectories: The trajectories to save.
21 |     """
22 |     p = util.parse_path(path)
23 |     huggingface_utils.trajectories_to_dataset(trajectories).save_to_disk(str(p))
24 |     logging.info(f"Dumped demonstrations to {p}.")
25 | 
26 | 
27 | def load(path: AnyPath) -> Sequence[Trajectory]:
28 |     """Loads a sequence of trajectories saved by `save()` from `path`."""
29 |     # Three on-disk formats are supported. A directory is a HuggingFace dataset and
30 |     # is handled first. Otherwise we fall back to np.load, which silently loads a
31 |     # plain pickle file when `allow_pickle=True`, so the call succeeds for both the
32 |     # legacy compressed .npz format and the legacy pickle format. To tell these apart,
33 |     # we look at the type of the loaded object: a Mapping is .npz data that we need
34 |     # to decode, whereas a Sequence is the pickled trajectories, returned directly.
35 | 
36 |     if os.path.isdir(path):  # huggingface datasets format
37 |         dataset = datasets.load_from_disk(str(path))
38 |         if not isinstance(dataset, datasets.Dataset):  # pragma: no cover
39 |             raise ValueError(
40 |                 f"Expected to load a `datasets.Dataset` but got {type(dataset)}",
41 |             )
42 | 
43 |         return huggingface_utils.TrajectoryDatasetSequence(dataset)
44 | 
45 |     data = np.load(path, allow_pickle=True)  # works for both .npz and .pkl
46 | 
47 |     if isinstance(data, Sequence):  # pickle format
48 |         warnings.warn("Loading old pickle version of Trajectories", DeprecationWarning)
49 |         return data
50 |     if isinstance(data, Mapping):  # .npz format
51 |         warnings.warn("Loading old npz version of Trajectories", DeprecationWarning)
52 |         num_trajs = len(data["indices"])
53 |         fields = [
54 |             # Account for the extra obs in each trajectory
55 |             np.split(data["obs"], data["indices"] + np.arange(num_trajs) + 1),
56 |             np.split(data["acts"], data["indices"]),
57 |             np.split(data["infos"], data["indices"]),
58 |             data["terminal"],
59 |         ]
60 |         if "rews" in data:
61 |             fields = [
62 |                 *fields,
63 |                 np.split(data["rews"], data["indices"]),
64 |             ]
65 |             return [TrajectoryWithRew(*args) for args in zip(*fields)]
66 |         else:
67 |             return [Trajectory(*args) for args in zip(*fields)]  # pragma: no cover
68 |     else:  # pragma: no cover
69 |         raise ValueError(
70 |             f"Expected either an .npz file or a pickled sequence of trajectories; "
71 |             f"got a pickled object of type {type(data).__name__}",
72 |         )
73 | 
74 | 
75 | def load_with_rewards(path: AnyPath) -> Sequence[TrajectoryWithRew]:
76 |     """Loads a sequence of trajectories with rewards from a file."""
77 |     data = load(path)
78 | 
79 |     mismatched_types = [
80 |         type(traj) for traj in data if not isinstance(traj, TrajectoryWithRew)
81 |     ]
82 |     if mismatched_types:
83 |         raise ValueError(
84 |             f"Expected all trajectories to be of type `TrajectoryWithRew`, "
85 |             f"but found {mismatched_types[0].__name__}",
86 |         )
87 | 
88 |     return cast(Sequence[TrajectoryWithRew], data)
89 | 


--------------------------------------------------------------------------------
/docs/algorithms/airl.rst:
--------------------------------------------------------------------------------
  1 | .. _airl docs:
  2 | 
  3 | =================================================
  4 | Adversarial Inverse Reinforcement Learning (AIRL)
  5 | =================================================
  6 | `AIRL <https://arxiv.org/abs/1710.11248>`_, similar to :ref:`GAIL <gail docs>`,
  7 | adversarially trains a policy against a discriminator that aims to distinguish the expert
  8 | demonstrations from the learned policy. Unlike GAIL, AIRL recovers a reward function
  9 | that is more generalizable to changes in environment dynamics.
 10 | 
 11 | The expert policy must be stochastic.
 12 | 
 13 | 
 14 | .. note::
 15 |     AIRL paper: `Learning Robust Rewards with Adversarial Inverse Reinforcement Learning <https://arxiv.org/abs/1710.11248>`_
 16 | 
 17 | Example
 18 | =======
 19 | 
 20 | Detailed example notebook: :doc:`../tutorials/4_train_airl`
 21 | 
 22 | .. testcode::
 23 |     :skipif: skip_doctests
 24 | 
 25 |     import numpy as np
 26 |     import gymnasium as gym
 27 |     from stable_baselines3 import PPO
 28 |     from stable_baselines3.common.evaluation import evaluate_policy
 29 |     from stable_baselines3.ppo import MlpPolicy
 30 | 
 31 |     from imitation.algorithms.adversarial.airl import AIRL
 32 |     from imitation.data import rollout
 33 |     from imitation.data.wrappers import RolloutInfoWrapper
 34 |     from imitation.policies.serialize import load_policy
 35 |     from imitation.rewards.reward_nets import BasicShapedRewardNet
 36 |     from imitation.util.networks import RunningNorm
 37 |     from imitation.util.util import make_vec_env
 38 | 
 39 |     SEED = 42
 40 | 
 41 |     env = make_vec_env(
 42 |         "seals:seals/CartPole-v0",
 43 |         rng=np.random.default_rng(SEED),
 44 |         n_envs=8,
 45 |         post_wrappers=[lambda env, _: RolloutInfoWrapper(env)],  # to compute rollouts
 46 |     )
 47 |     expert = load_policy(
 48 |         "ppo-huggingface",
 49 |         organization="HumanCompatibleAI",
 50 |         env_name="seals-CartPole-v0",
 51 |         venv=env,
 52 |     )
 53 |     rollouts = rollout.rollout(
 54 |         expert,
 55 |         env,
 56 |         rollout.make_sample_until(min_episodes=60),
 57 |         rng=np.random.default_rng(SEED),
 58 |     )
 59 | 
 60 |     learner = PPO(
 61 |         env=env,
 62 |         policy=MlpPolicy,
 63 |         batch_size=64,
 64 |         ent_coef=0.0,
 65 |         learning_rate=0.0005,
 66 |         gamma=0.95,
 67 |         clip_range=0.1,
 68 |         vf_coef=0.1,
 69 |         n_epochs=5,
 70 |         seed=SEED,
 71 |     )
 72 |     reward_net = BasicShapedRewardNet(
 73 |         observation_space=env.observation_space,
 74 |         action_space=env.action_space,
 75 |         normalize_input_layer=RunningNorm,
 76 |     )
 77 |     airl_trainer = AIRL(
 78 |         demonstrations=rollouts,
 79 |         demo_batch_size=2048,
 80 |         gen_replay_buffer_capacity=512,
 81 |         n_disc_updates_per_round=16,
 82 |         venv=env,
 83 |         gen_algo=learner,
 84 |         reward_net=reward_net,
 85 |     )
 86 | 
 87 |     env.seed(SEED)
 88 |     learner_rewards_before_training, _ = evaluate_policy(
 89 |         learner, env, 100, return_episode_rewards=True,
 90 |     )
 91 |     airl_trainer.train(20000)  # Train for 2_000_000 steps to match expert.
 92 |     env.seed(SEED)
 93 |     learner_rewards_after_training, _ = evaluate_policy(
 94 |         learner, env, 100, return_episode_rewards=True,
 95 |     )
 96 | 
 97 |     print("mean reward after training:", np.mean(learner_rewards_after_training))
 98 |     print("mean reward before training:", np.mean(learner_rewards_before_training))
 99 | 
100 | .. testoutput::
101 |     :hide:
102 | 
103 |     ...
104 | 
105 | API
106 | ===
107 | .. autoclass:: imitation.algorithms.adversarial.airl.AIRL
108 |     :members:
109 |     :inherited-members:
110 |     :noindex:
111 | 
112 | .. autoclass:: imitation.algorithms.adversarial.common.AdversarialTrainer
113 |     :members:
114 |     :noindex:
115 | 


--------------------------------------------------------------------------------
/src/imitation/policies/exploration_wrapper.py:
--------------------------------------------------------------------------------
 1 | """Wrapper to turn a policy into a more exploratory version."""
 2 | 
 3 | from typing import Dict, Optional, Tuple, Union
 4 | 
 5 | import numpy as np
 6 | from stable_baselines3.common import vec_env
 7 | 
 8 | from imitation.data import rollout
 9 | from imitation.util import util
10 | 
11 | 
12 | class ExplorationWrapper:
13 |     """Wraps a PolicyCallable to create a partially randomized version.
14 | 
15 |     This wrapper randomly switches between two policies: the wrapped policy,
16 |     and a random one. After each action, the current policy is kept
17 |     with a certain probability. Otherwise, one of these two policies is chosen
18 |     at random (without any dependence on what the current policy is).
19 | 
20 |     The random policy uses the `action_space.sample()` method.
21 |     """
22 | 
23 |     def __init__(
24 |         self,
25 |         policy: rollout.AnyPolicy,
26 |         venv: vec_env.VecEnv,
27 |         random_prob: float,
28 |         switch_prob: float,
29 |         rng: np.random.Generator,
30 |         deterministic_policy: bool = False,
31 |     ):
32 |         """Initializes the ExplorationWrapper.
33 | 
34 |         Args:
35 |             policy: The policy to randomize.
36 |             venv: The environment to use (needed for sampling random actions).
37 |             random_prob: The probability of picking the random policy when switching.
38 |             switch_prob: The probability of re-picking the policy after each action.
39 |             rng: The random state to use for seeding the environment's action
40 |                 space and for switching policies.
41 |             deterministic_policy: Whether to make the policy deterministic when not
42 |                 exploring. This must be False when ``policy`` is a ``PolicyCallable``.
43 |         """
44 |         policy_callable = rollout.policy_to_callable(policy, venv, deterministic_policy)
45 |         self.wrapped_policy = policy_callable
46 |         self.random_prob = random_prob
47 |         self.switch_prob = switch_prob
48 |         self.venv = venv
49 | 
50 |         self.rng = rng
51 |         seed = util.make_seeds(self.rng)
52 |         self.venv.action_space.seed(seed)
53 | 
54 |         self.current_policy = policy_callable
55 |         # Choose the initial policy at random
56 |         self._switch()
57 | 
58 |     def _random_policy(
59 |         self,
60 |         obs: Union[np.ndarray, Dict[str, np.ndarray]],
61 |         state: Optional[Tuple[np.ndarray, ...]],
62 |         episode_start: Optional[np.ndarray],
63 |     ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]:
64 |         del state, episode_start  # Unused
65 |         acts = [self.venv.action_space.sample() for _ in range(len(obs))]
66 |         return np.stack(acts, axis=0), None
67 | 
68 |     def _switch(self) -> None:
69 |         """Pick a new policy at random."""
70 |         if self.rng.random() < self.random_prob:
71 |             self.current_policy = self._random_policy
72 |         else:
73 |             self.current_policy = self.wrapped_policy
74 | 
75 |     def __call__(
76 |         self,
77 |         observation: Union[np.ndarray, Dict[str, np.ndarray]],
78 |         input_state: Optional[Tuple[np.ndarray, ...]],
79 |         episode_start: Optional[np.ndarray],
80 |     ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]:
81 |         del episode_start  # Unused
82 | 
83 |         if input_state is not None:
84 |             # This checks that we aren't passed a state
85 |             raise ValueError("Exploration wrapper does not support stateful policies.")
86 | 
87 |         acts, output_state = self.current_policy(observation, None, None)
88 | 
89 |         if output_state is not None:
90 |             # This checks that the policy doesn't return a state
91 |             raise ValueError("Exploration wrapper does not support stateful policies.")
92 | 
93 |         if self.rng.random() < self.switch_prob:
94 |             self._switch()
95 |         return acts, None
96 | 


--------------------------------------------------------------------------------
/tests/test_examples.py:
--------------------------------------------------------------------------------
 1 | """Tests examples/*: quickstart code and Jupyter notebook."""
 2 | 
 3 | import os
 4 | import pathlib
 5 | import subprocess
 6 | import sys
 7 | from typing import Iterable, Sequence
 8 | 
 9 | import pytest
10 | import pytest_notebook as ptnb
11 | 
12 | 
13 | def _paths_to_strs(x: Iterable[pathlib.Path]) -> Sequence[str]:
14 |     """Convert Path to str for nice Pytest `parametrize` logs.
15 | 
16 |     For example, if we use Path, we get something inscrutable like
17 |     test_run_example_sh_scripts[sh_path0] rather than seeing the actual path name.
18 | 
19 |     Args:
20 |         x: The paths to convert.
21 | 
22 |     Returns:
23 |         A sequence of the same length as `x`, with each element the string
24 |         representation of the corresponding path in `x`.
25 |     """
26 |     return [str(path) for path in x]
27 | 
28 | 
29 | THIS_DIR = pathlib.Path(__file__).absolute().parent
30 | EXAMPLES_DIR = THIS_DIR / ".." / "examples"
31 | TUTORIALS_DIR = THIS_DIR / ".." / "docs" / "tutorials"
32 | 
33 | EXCLUDED_EXAMPLE_FILES = ["train_dagger_atari_interactive_policy.py"]
34 | EXCLUDED_EXAMPLE_PATHS = [EXAMPLES_DIR / f for f in EXCLUDED_EXAMPLE_FILES]
35 | 
36 | SH_PATHS = _paths_to_strs(set(EXAMPLES_DIR.glob("*.sh")) - set(EXCLUDED_EXAMPLE_PATHS))
37 | TUTORIAL_PATHS = _paths_to_strs(TUTORIALS_DIR.glob("*.ipynb"))
38 | PY_PATHS = _paths_to_strs(set(EXAMPLES_DIR.glob("*.py")) - set(EXCLUDED_EXAMPLE_PATHS))
39 | 
40 | 
41 | # Note: This is excluded from coverage since coverage is computed on Linux. However,
42 | #   it is still exercised by the macOS and Windows runners.
43 | @pytest.mark.skipif(sys.platform == "linux", reason="Linux is covered by readthedocs.")
44 | @pytest.mark.parametrize("nb_path", TUTORIAL_PATHS)
45 | def test_run_tutorial_notebooks(nb_path) -> None:  # pragma: no cover
46 |     """Smoke test ensuring that tutorial notebooks run without error.
47 | 
48 |     The `pytest_notebook` package also includes regression test functionality against
49 |     saved notebook outputs, if we want to check that later.
50 | 
51 |     Args:
52 |         nb_path: Path to the notebook to test.
53 |     """
54 |     nb = ptnb.notebook.load_notebook(nb_path)
55 |     # TODO(GH#793): Shorten timeout and ensure the notebook can still show desired
56 |     # improvement.
57 |     result = ptnb.execution.execute_notebook(nb, cwd=TUTORIALS_DIR, timeout=540)
58 |     assert result.exec_error is None
59 | 
60 | 
61 | @pytest.mark.parametrize("py_path", PY_PATHS)
62 | def test_run_example_py_scripts(py_path):
63 |     """Smoke test ensuring that python example scripts run without error."""
64 |     # We need to use sys.executable, not just "python", on Windows as
65 |     # subprocess.call ignores PATH (unless shell=True) so runs a
66 |     # system-wide Python interpreter outside of our venv. See:
67 |     # https://stackoverflow.com/questions/5658622/
68 |     exit_code = subprocess.call([sys.executable, py_path])
69 |     assert exit_code == 0
70 | 
71 | 
72 | @pytest.mark.parametrize("sh_path", SH_PATHS)
73 | def test_run_example_sh_scripts(sh_path):
74 |     """Smoke test ensuring that shell example scripts run without error."""
75 |     if os.name == "nt":  # pragma: no cover
76 |         pytest.skip("bash shell scripts not ported to Windows.")
77 |     for _ in range(2):  # Repeat because historically these have failed on second run.
78 |         exit_code = subprocess.call(["env", "bash", "-e", sh_path])
79 |         assert exit_code == 0
80 | 
81 | 
82 | README_SNIPPET_PATHS = _paths_to_strs([EXAMPLES_DIR / "quickstart.sh"])
83 | 
84 | 
85 | @pytest.mark.parametrize("snippet_path", README_SNIPPET_PATHS)
86 | def test_example_snippets_are_in_readme(snippet_path):
87 |     """Check that README.md examples haven't diverged from snippets."""
88 |     with open(snippet_path, "r") as f:
89 |         x = "".join(f.readlines()[2:])  # strip the shebang and the following line
90 |     with open("README.md", "r", encoding="utf-8") as f:
91 |         y = f.read()
92 |     assert x in y, f"{snippet_path} has diverged from README.md"
93 | 


--------------------------------------------------------------------------------
/src/imitation/policies/replay_buffer_wrapper.py:
--------------------------------------------------------------------------------
  1 | """Wrapper for reward labeling for transitions sampled from a replay buffer."""
  2 | 
  3 | from typing import Mapping, Type
  4 | 
  5 | import numpy as np
  6 | from gymnasium import spaces
  7 | from stable_baselines3.common.buffers import ReplayBuffer
  8 | from stable_baselines3.common.type_aliases import ReplayBufferSamples
  9 | 
 10 | from imitation.rewards.reward_function import RewardFn
 11 | from imitation.util import util
 12 | 
 13 | 
 14 | def _samples_to_reward_fn_input(
 15 |     samples: ReplayBufferSamples,
 16 | ) -> Mapping[str, np.ndarray]:
 17 |     """Convert a sample from a replay buffer to a mapping of numpy arrays."""
 18 |     return dict(
 19 |         state=samples.observations.cpu().numpy(),
 20 |         action=samples.actions.cpu().numpy(),
 21 |         next_state=samples.next_observations.cpu().numpy(),
 22 |         done=samples.dones.cpu().numpy(),
 23 |     )
 24 | 
 25 | 
 26 | class ReplayBufferRewardWrapper(ReplayBuffer):
 27 |     """Relabel the rewards in transitions sampled from a ReplayBuffer."""
 28 | 
 29 |     def __init__(
 30 |         self,
 31 |         buffer_size: int,
 32 |         observation_space: spaces.Space,
 33 |         action_space: spaces.Space,
 34 |         *,
 35 |         replay_buffer_class: Type[ReplayBuffer],
 36 |         reward_fn: RewardFn,
 37 |         **kwargs,
 38 |     ):
 39 |         """Builds ReplayBufferRewardWrapper.
 40 | 
 41 |         Args:
 42 |             buffer_size: Max number of elements in the buffer
 43 |             observation_space: Observation space
 44 |             action_space: Action space
 45 |             replay_buffer_class: Class of the replay buffer.
 46 |             reward_fn: Reward function for reward relabeling.
 47 |             **kwargs: keyword arguments for ReplayBuffer.
 48 |         """
 49 |         # Note(yawen-d): we directly inherit ReplayBuffer and leave out the case of
 50 |         # DictReplayBuffer because the current RewardFn only takes in NumPy array-based
 51 |         # inputs, and SAC is the only use case for ReplayBuffer relabeling. See:
 52 |         # https://github.com/HumanCompatibleAI/imitation/pull/459#issuecomment-1201997194
 53 |         assert replay_buffer_class is ReplayBuffer, "only ReplayBuffer is supported"
 54 |         assert not isinstance(observation_space, spaces.Dict)
 55 |         self.replay_buffer = replay_buffer_class(
 56 |             buffer_size,
 57 |             observation_space,
 58 |             action_space,
 59 |             **kwargs,
 60 |         )
 61 |         self.reward_fn = reward_fn
 62 |         _base_kwargs = {k: v for k, v in kwargs.items() if k in ["device", "n_envs"]}
 63 |         super().__init__(buffer_size, observation_space, action_space, **_base_kwargs)
 64 | 
 65 |     @property
 66 |     def pos(self) -> int:
 67 |         return self.replay_buffer.pos
 68 | 
 69 |     @pos.setter
 70 |     def pos(self, pos: int):
 71 |         self.replay_buffer.pos = pos
 72 | 
 73 |     @property
 74 |     def full(self) -> bool:
 75 |         return self.replay_buffer.full
 76 | 
 77 |     @full.setter
 78 |     def full(self, full: bool):
 79 |         self.replay_buffer.full = full
 80 | 
 81 |     def sample(self, *args, **kwargs):
 82 |         samples = self.replay_buffer.sample(*args, **kwargs)
 83 |         rewards = self.reward_fn(**_samples_to_reward_fn_input(samples))
 84 |         shape = samples.rewards.shape
 85 |         device = samples.rewards.device
 86 |         rewards_th = util.safe_to_tensor(rewards).reshape(shape).to(device)
 87 | 
 88 |         return ReplayBufferSamples(
 89 |             samples.observations,
 90 |             samples.actions,
 91 |             samples.next_observations,
 92 |             samples.dones,
 93 |             rewards_th,
 94 |         )
 95 | 
 96 |     def add(self, *args, **kwargs):
 97 |         self.replay_buffer.add(*args, **kwargs)
 98 | 
 99 |     def _get_samples(self):
100 |         raise NotImplementedError(
101 |             "_get_samples() is intentionally not implemented. "
102 |             "This method should not be called.",
103 |         )
104 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
  1 | =========
  2 | Imitation
  3 | =========
  4 | 
  5 | **Imitation provides clean implementations of imitation and reward learning algorithms**, under a unified and user-friendly API.
  6 | Currently, we have implementations of Behavioral Cloning, `DAgger <https://arxiv.org/pdf/1011.0686.pdf>`_
  7 | (with synthetic examples), density-based reward modeling, `Maximum Causal Entropy Inverse Reinforcement Learning <https://www.cs.cmu.edu/~bziebart/publications/maximum-causal-entropy.pdf>`_,
  8 | `Adversarial Inverse Reinforcement Learning <https://arxiv.org/abs/1710.11248>`_,
  9 | `Generative Adversarial Imitation Learning <https://arxiv.org/abs/1606.03476>`_, and
 10 | `Deep RL from Human Preferences <https://arxiv.org/abs/1706.03741>`_.
 11 | 
 12 | You can find us on GitHub at http://github.com/HumanCompatibleAI/imitation.
 13 | 
 14 | 
 15 | Main Features
 16 | ~~~~~~~~~~~~~
 17 | 
 18 | - Built on and compatible with Stable Baselines 3 (SB3).
 19 | - Modular PyTorch implementations of Behavioral Cloning, DAgger, GAIL, and AIRL that can
 20 |   train arbitrary SB3 policies.
 21 | - GAIL and AIRL have customizable reward and discriminator networks.
 22 | - Scripts to train policies using SB3 and save rollouts from these policies as synthetic "expert" demonstrations.
 23 | - Data structures and scripts for loading and storing expert demonstrations.
 24 | 
 25 | Citing imitation
 26 | ~~~~~~~~~~~~~~~~
 27 | 
 28 | If you use ``imitation`` in your research project, please cite our paper to help us track our impact and enable readers to more easily replicate your results. You may use the following BibTeX::
 29 | 
 30 |     @misc{gleave2022imitation,
 31 |       author = {Gleave, Adam and Taufeeque, Mohammad and Rocamonde, Juan and Jenner, Erik and Wang, Steven H. and Toyer, Sam and Ernestus, Maximilian and Belrose, Nora and Emmons, Scott and Russell, Stuart},
 32 |       title = {imitation: Clean Imitation Learning Implementations},
 33 |       year = {2022},
 34 |       howPublished = {arXiv:2211.11972v1 [cs.LG]},
 35 |       archivePrefix = {arXiv},
 36 |       eprint = {2211.11972},
 37 |       primaryClass = {cs.LG},
 38 |       url = {https://arxiv.org/abs/2211.11972},
 39 |     }
 40 | 
 41 | .. toctree::
 42 |    :maxdepth: 2
 43 |    :caption: Getting Started
 44 |    :hidden:
 45 | 
 46 |    getting-started/what_is_imitation
 47 |    getting-started/installation
 48 |    getting-started/first_steps
 49 |    getting-started/cli
 50 | 
 51 | .. toctree::
 52 |     :maxdepth: 2
 53 |     :caption: Main Concepts
 54 |     :hidden:
 55 | 
 56 |     main-concepts/experts
 57 |     main-concepts/trajectories
 58 |     main-concepts/reward_networks
 59 |     main-concepts/variable_horizon
 60 |     main-concepts/benchmarks
 61 |     main-concepts/benchmark_summary
 62 | 
 63 | 
 64 | .. toctree::
 65 |    :maxdepth: 2
 66 |    :caption: Algorithms
 67 |    :hidden:
 68 | 
 69 |    algorithms/bc
 70 |    algorithms/gail
 71 |    algorithms/airl
 72 |    algorithms/dagger
 73 |    algorithms/density
 74 |    algorithms/mce_irl
 75 |    algorithms/preference_comparisons
 76 |    algorithms/sqil
 77 | 
 78 | .. toctree::
 79 |    :maxdepth: 2
 80 |    :caption: Tutorials
 81 |    :hidden:
 82 | 
 83 |    tutorials/1_train_bc
 84 |    tutorials/2_train_dagger
 85 |    tutorials/3_train_gail
 86 |    tutorials/4_train_airl
 87 |    tutorials/5_train_preference_comparisons
 88 |    tutorials/5a_train_preference_comparisons_with_cnn
 89 |    tutorials/6_train_mce
 90 |    tutorials/7_train_density
 91 |    tutorials/8_train_sqil
 92 |    tutorials/8a_train_sqil_sac
 93 |    tutorials/9_compare_baselines
 94 |    tutorials/10_train_custom_env
 95 | 
 96 | API Reference
 97 | ~~~~~~~~~~~~~
 98 | 
 99 | .. autosummary::
100 |    :toctree: _api
101 |    :caption: API Reference
102 |    :recursive:
103 |    :template: autosummary/module.rst
104 | 
105 |    imitation
106 | 
107 | 
108 | .. toctree::
109 |    :maxdepth: 1
110 |    :caption: Development
111 |    :hidden:
112 | 
113 |    development/developer
114 |    development/contributing/index
115 |    development/release-notes
116 |    development/license
117 | 
118 | 
119 | 
120 | 
121 | Index
122 | ==================
123 | 
124 | * :ref:`genindex`
125 | * :ref:`modindex`
126 | 


--------------------------------------------------------------------------------