├── .github ├── Issue_Template │ └── issue_template.md ├── Pull_Request_Template │ └── pull_request_template.md └── workflows │ ├── docs_test.yaml │ ├── publish-release.yaml │ └── test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── HPs ├── DQN.yaml ├── PPO.yaml └── SAC.yaml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── api_generator.py ├── basic_usage.md ├── hooks │ ├── cleanup_log_output.py │ ├── debug_which_page_is_being_rendered.py │ └── disable_markdown_exec.py ├── img │ ├── favicon.ico │ ├── logo.png │ └── logo_no_font.png ├── index.md ├── installation.md ├── methods │ ├── algorithms.md │ ├── architectures.md │ ├── inner_loops.md │ └── outer_loops.md ├── package_structure.md └── usecases │ ├── Contextual_RL.md │ ├── DAC.md │ └── Standard_RL.md ├── examples ├── README.md ├── __init__.py ├── custom_exploration_scheduler.py ├── custom_policy.py ├── hypersweeper_smac_example_config.yaml ├── multiple_runs │ ├── mighty_experiment_0 │ │ ├── 0 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── multirun.yaml │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_1 │ │ ├── 1 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_2 │ │ ├── 2 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_3 │ │ ├── 3 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ └── mighty_experiment_4 │ │ ├── 4 │ │ └── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz ├── optuna_example_config.yaml └── plot_examples.ipynb ├── mighty ├── __init__.py ├── configs │ ├── algorithm │ │ ├── atari_dqn.yaml │ │ ├── ddqn.yaml │ │ ├── dqn.yaml │ │ ├── minigrid_dqn.yaml │ │ ├── ppo.yaml │ │ ├── ppo_mountaincar.yaml │ │ ├── procgen_dqn.yaml │ │ ├── sac.yaml │ │ └── sac_mujoco.yaml │ ├── base.yaml │ ├── cluster │ │ ├── local.yaml │ │ ├── luis.yaml │ │ ├── noctua.yaml │ │ └── tnt.yaml │ ├── cmaes_hpo.yaml │ ├── environment │ │ ├── carl_walkers │ │ │ └── ant_goals.yaml │ │ ├── dacbench │ │ │ ├── function_approximation.yaml │ │ │ └── function_approximation_benchmark.yaml │ │ ├── gymnasium │ │ │ ├── atari_pong.yaml │ │ │ ├── cartpole.yaml │ │ │ ├── mountaincar.yaml │ │ │ ├── mountaincarcontinuous.yaml │ │ │ └── pendulum.yaml │ │ ├── procgen_bigfish.yaml │ │ ├── pufferlib_minigrid │ │ │ └── minigrid_env.yaml │ │ ├── pufferlib_ocean │ │ │ ├── bandit.yaml │ │ │ ├── memory.yaml │ │ │ ├── password.yaml │ │ │ ├── squared.yaml │ │ │ └── stochastic.yaml │ │ └── pufferlib_procgen │ │ │ └── bigfish.yaml │ ├── exploration │ │ ├── epsilon_decay.yaml │ │ ├── ez_greedy.yaml │ │ ├── noveld.yaml │ │ └── rnd.yaml │ ├── hydra │ │ └── help │ │ │ └── mighty_help.yaml │ ├── nes.yaml │ ├── ppo_smac.yaml │ ├── sac_smac.yaml │ ├── search_space │ │ ├── dqn_gym_classic.yaml │ │ ├── dqn_rs.yaml │ │ ├── dqn_template.yaml │ │ ├── mighty_template.yaml │ │ ├── ppo_rs.yaml │ │ └── sac_rs.yaml │ ├── 
sweep_ppo_pbt.yaml │ ├── sweep_rs.yaml │ └── target_function.yaml ├── mighty_agents │ ├── .gitkeep │ ├── __init__.py │ ├── base_agent.py │ ├── dqn.py │ ├── factory.py │ ├── ppo.py │ └── sac.py ├── mighty_exploration │ ├── __init__.py │ ├── decaying_epsilon_greedy.py │ ├── epsilon_greedy.py │ ├── ez_greedy.py │ ├── mighty_exploration_policy.py │ └── stochastic_policy.py ├── mighty_meta │ ├── __init__.py │ ├── cosine_lr_schedule.py │ ├── mighty_component.py │ ├── plr.py │ ├── rnd.py │ └── space.py ├── mighty_models │ ├── __init__.py │ ├── dqn.py │ ├── networks.py │ ├── ppo.py │ └── sac.py ├── mighty_replay │ ├── __init__.py │ ├── buffer.py │ ├── mighty_prioritized_replay.py │ ├── mighty_replay_buffer.py │ └── mighty_rollout_buffer.py ├── mighty_runners │ ├── __init__.py │ ├── factory.py │ ├── mighty_es_runner.py │ ├── mighty_maml_runner.py │ ├── mighty_online_runner.py │ └── mighty_runner.py ├── mighty_update │ ├── __init__.py │ ├── ppo_update.py │ ├── q_learning.py │ └── sac_update.py ├── mighty_utils │ ├── __init__ .py │ ├── __init__.py │ ├── envs.py │ ├── migthy_types.py │ ├── plotting.py │ ├── test_helpers.py │ ├── update_utils.py │ └── wrappers.py └── run_mighty.py ├── mkdocs.yml ├── pyproject.toml └── test ├── __init__.py ├── agents ├── test_agent_factory.py ├── test_base_agent.py └── test_dqn_agent.py ├── exploration ├── test_epsilon_greedy.py ├── test_exploration.py └── test_ez_greedy.py ├── meta_components ├── test_cosine_schedule.py ├── test_noveld.py ├── test_plr.py ├── test_rnd.py └── test_space.py ├── models ├── test_networks.py └── test_q_networks.py ├── replay └── test_buffer.py ├── runners ├── test_es_runner.py ├── test_runner.py └── test_runner_factory.py ├── test_cli.py ├── test_env_creation.py └── update └── test_q_update.py /.github/Issue_Template/issue_template.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | name: General issue template 4 | about: General template issues 5 | labels: 6 | 7 | --- 8 | 9 | * Mighty version: 10 | * Python version: 11 | * Operating System: 12 | 13 | 14 | 17 | 18 | #### Description 19 | 20 | 21 | #### Steps/Code to Reproduce 22 | 25 | 26 | #### Expected Results 27 | 28 | 29 | #### Actual Results 30 | 31 | 32 | #### Additional Info 33 | 34 | - Did you try upgrading to the most current version? yes/no 35 | - Are you using a supported operating system (version)? yes/no 36 | - How did you install this package (e.g. GitHub, pip, etc.)? 37 | 38 | -------------------------------------------------------------------------------- /.github/Pull_Request_Template/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | #### Reference Issues/PRs 7 | 11 | 12 | #### What does this implement/fix? Explain your changes. 13 | 14 | 24 | 25 | #### Checklist 26 | 27 | - Are the tests passing locally? yes/no 28 | - Is the pre-commit passing locally? yes/no 29 | - Are all new features documented in code and docs? yes/no 30 | - Are all examples still running? yes/no 31 | - Are the requirements up to date? yes/no 32 | - Did you add yourself to the contributors in the authors file? yes/no 33 | 34 | #### Any other comments? 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/docs_test.yaml: -------------------------------------------------------------------------------- 1 | # This workflow is to test that the docs build successfully. 
2 | name: test-docs 3 | env: 4 | package-name: "mighty" 5 | UV_SYSTEM_PYTHON: 1 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | on: 11 | workflow_dispatch: 12 | push: 13 | branches: 14 | - main 15 | pull_request: 16 | branches: 17 | - main 18 | permissions: 19 | contents: write 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Install uv 28 | uses: astral-sh/setup-uv@v5 29 | with: 30 | # Install a specific version of uv. 31 | version: "0.6.14" 32 | 33 | - name: "Set up Python" 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version-file: "pyproject.toml" 37 | 38 | - name: Install Mighty 39 | run: make install-dev 40 | 41 | - name: "Build Docs" 42 | run: mkdocs build --clean --strict -------------------------------------------------------------------------------- /.github/workflows/publish-release.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | permissions: 3 | id-token: write 4 | 5 | env: 6 | package-name: "mighty" 7 | UV_SYSTEM_PYTHON: 1 8 | 9 | on: 10 | # Manually triggerable in github 11 | workflow_dispatch: 12 | release: 13 | types: [created] 14 | 15 | jobs: 16 | test: 17 | name: publish-release 18 | runs-on: "ubuntu-latest" 19 | 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v5 26 | with: 27 | # Install a specific version of uv. 28 | version: "0.6.14" 29 | 30 | - name: "Set up Python" 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version-file: "pyproject.toml" 34 | 35 | - name: Install ${{ env.package-name }} 36 | run: make install-dev 37 | 38 | - name: Store git status 39 | id: status-before 40 | shell: bash 41 | run: | 42 | echo "::set-output name=BEFORE::$(git status --porcelain -b)" 43 | 44 | - name: Tests 45 | run: make test 46 | 47 | pypi-publish: 48 | name: Upload release to PyPI 49 | runs-on: ubuntu-latest 50 | environment: 51 | name: pypi 52 | url: https://pypi.org/p/mighty 53 | steps: 54 | - name: Checkout 55 | uses: actions/checkout@v4 56 | 57 | - name: Install uv 58 | uses: astral-sh/setup-uv@v5 59 | with: 60 | # Install a specific version of uv. 
61 | version: "0.6.14" 62 | 63 | - name: "Set up Python" 64 | uses: actions/setup-python@v5 65 | with: 66 | python-version-file: "pyproject.toml" 67 | 68 | - name: Install ${{ env.package-name }} 69 | run: make install-dev 70 | - name: Build package 71 | run: uv build 72 | 73 | - name: Publish package distributions to PyPI 74 | run: uv publish -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Tests 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | workflow_dispatch: # Manually trigger the workflow 9 | # Triggers with push to main 10 | push: 11 | branches: 12 | - main 13 | - development 14 | 15 | # Triggers with push to a PR aimed at main 16 | pull_request: 17 | branches: 18 | - main 19 | - development 20 | 21 | env: 22 | package-name: "mighty" 23 | test-dir: test 24 | UV_SYSTEM_PYTHON: 1 25 | 26 | jobs: 27 | # General unit tests 28 | source-test: 29 | name: test 30 | runs-on: "ubuntu-latest" 31 | defaults: 32 | run: 33 | shell: bash # Default to using bash on all 34 | 35 | steps: 36 | - name: Checkout 37 | uses: actions/checkout@v4 38 | 39 | - name: Install uv 40 | uses: astral-sh/setup-uv@v5 41 | with: 42 | # Install a specific version of uv. 43 | version: "0.6.14" 44 | 45 | - name: "Set up Python" 46 | uses: actions/setup-python@v5 47 | with: 48 | python-version-file: "pyproject.toml" 49 | 50 | - name: Install ${{ env.package-name }} 51 | run: make install-dev 52 | 53 | - name: Store git status 54 | id: status-before 55 | shell: bash 56 | run: | 57 | echo "::set-output name=BEFORE::$(git status --porcelain -b)" 58 | 59 | - name: Tests 60 | run: make test 61 | 62 | - name: Check for files left behind by test 63 | run: | 64 | before="${{ steps.status-before.outputs.BEFORE }}" 65 | after="$(git status --porcelain -b)" 66 | if [[ "$before" != "$after" ]]; then 67 | echo "git status from before: $before" 68 | echo "git status from after: $after" 69 | echo "Not all generated files have been deleted!" 70 | exit 1 71 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *experiments/ 3 | .DS* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | uv.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | runs/* 137 | docs/build/* 138 | docs/api/* 139 | docs/examples/* 140 | site/* -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: formatting 5 | name: formatting 6 | entry: make 7 | args: ["format"] 8 | language: system 9 | pass_filenames: false -------------------------------------------------------------------------------- /HPs/DQN.yaml: -------------------------------------------------------------------------------- 1 | # CartPole-v1 DQN configuration: num_steps: 5e4 num_envs: 64 2 | algorithm_kwargs: 3 | # Core architecture / model 4 | n_units: 256 5 | q_kwargs: 6 | dueling: False 7 | feature_extractor_kwargs: 8 | architecture: mlp 9 | n_layers: 1 10 | hidden_sizes: [256] 11 | head_kwargs: 12 | hidden_sizes: [256] 13 | 14 | # Exploration (decaying ε‐greedy) 15 | policy_class: 16 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 17 | policy_kwargs: 18 | epsilon_start: 1.0 19 | epsilon_final: 0.04 20 | epsilon_decay_steps: 8000 21 | 22 | # Replay‐buffer settings 23 | replay_buffer_class: 24 | _target_: mighty.mighty_replay.MightyReplay 25 | replay_buffer_kwargs: 26 | capacity: 100000 27 | 28 | # Training hyperparameters 29 | learning_rate: 2.3e-3 30 | batch_size: 128 31 | gamma: 0.99 32 | learning_starts: 1000 # wait 1k transitions before training 33 | 34 | # Target‐network / updating (hard update every 1k ∇‐steps) 35 | use_target: True 36 | soft_update_weight: 0.005 37 | target_update_freq: null 38 | 39 | # Double DQN update 40 | td_update_class: mighty.mighty_update.QLearning 41 | 42 
| td_update_kwargs: 43 | gamma: 0.99 44 | optimizer_class: 45 | _target_: torch.optim.Adam 46 | optimizer_kwargs: 47 | lr: 2.3e-3 48 | weight_decay: 1e-5 49 | eps: 1e-6 50 | max_grad_norm: 10.0 51 | 52 | # Checkpointing 53 | save_replay: False 54 | n_gradient_steps: 128 55 | 56 | # Misc/Evaluation/Logging 57 | log_wandb: False 58 | wandb_kwargs: 59 | project: "my_dqn_experiment" 60 | name: "DQN-cartpole" 61 | -------------------------------------------------------------------------------- /HPs/PPO.yaml: -------------------------------------------------------------------------------- 1 | # CartPole-v1 PPO configuration: num_steps: 1e5 num_envs: 8 2 | algorithm_kwargs: 3 | # Hyperparameters 4 | n_policy_units: 64 5 | n_critic_units: 64 6 | soft_update_weight: 0.01 7 | 8 | rollout_buffer_class: 9 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 10 | rollout_buffer_kwargs: 11 | buffer_size: 256 # Size of the rollout buffer. 12 | gamma: 0.98 # Discount factor for future rewards. 13 | gae_lambda: 0.8 # GAE lambda. 14 | obs_shape: ??? # Placeholder for observation shape 15 | act_dim: ??? # Placeholder for action dimension 16 | n_envs: ??? 17 | discrete_action: ??? # Placeholder for discrete action flag 18 | 19 | 20 | # Training 21 | learning_rate: 3e-4 22 | batch_size: 32 # Batch size for training. 23 | gamma: 0.99 # The amount by which to discount future rewards. 24 | ppo_clip: 0.2 # Clipping parameter for PPO. 25 | value_loss_coef: 0.5 # Coefficient for value loss. 26 | entropy_coef: 0.0 # Coefficient for entropy loss. 27 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 28 | 29 | 30 | hidden_sizes: [64] 31 | activation: 'tanh' 32 | 33 | n_epochs: 20 34 | minibatch_size: 256 35 | kl_target: 0.01 36 | use_value_clip: True 37 | value_clip_eps: 0.2 38 | 39 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 40 | policy_kwargs: 41 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. -------------------------------------------------------------------------------- /HPs/SAC.yaml: -------------------------------------------------------------------------------- 1 | # Pendulum-SAC hyperparameters num_envs=4 num_steps=5e4 2 | algorithm_kwargs: 3 | # network sizes (PPO-style) 4 | n_policy_units: 256 # will become hidden_sizes=[8,8] 5 | n_critic_units: 256 # same for both Q-nets 6 | soft_update_weight: 0.01 # maps to tau 7 | 8 | # Replay buffer 9 | replay_buffer_class: 10 | _target_: mighty.mighty_replay.MightyReplay 11 | replay_buffer_kwargs: 12 | capacity: 1e6 13 | 14 | # Scheduling & batch-updates 15 | batch_size: 256 16 | learning_starts: 5000 17 | update_every: 1 18 | n_gradient_steps: 1 19 | 20 | # Learning rates 21 | policy_lr: 1e-3 22 | q_lr: 1e-3 23 | 24 | # SAC hyperparameters 25 | gamma: 0.99 26 | alpha: 0.2 27 | auto_alpha: True 28 | target_entropy: -1 29 | alpha_lr: 3e-4 30 | 31 | # Exploration wrapper 32 | policy_class: mighty.mighty_exploration.StochasticPolicy 33 | policy_kwargs: 34 | entropy_coefficient: 0.2 35 | discrete: False 36 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # NOTE: Used on linux, limited support outside of Linux 3 | # 4 | # A simple makefile to help with small tasks related to development of Mighty 5 | # These have been configured to only really run short tasks. Longer form tasks 6 | # are usually completed in github actions. 
7 | 8 | .PHONY: help install-dev install check format pre-commit clean build clean-doc clean-build test doc publish 9 | 10 | help: 11 | @echo "Makefile Mighty" 12 | @echo "* install-dev to install all dev requirements and install pre-commit" 13 | @echo "* check to check the source code for issues" 14 | @echo "* format to format the code with ruff" 15 | @echo "* typing to type check the code with mypy" 16 | @echo "* pre-commit to run the pre-commit check" 17 | @echo "* clean to clean the dist and doc build files" 18 | @echo "* build to build a dist" 19 | @echo "* test to run the tests" 20 | @echo "* docs to serve and view the docs" 21 | @echo "* docs-build-only to generate and view the html files" 22 | @echo "* docs-deploy to push the latest doc version to gh-pages" 23 | @echo "* publish to help publish the current branch to pypi" 24 | 25 | PYTHON ?= python 26 | CYTHON ?= cython 27 | PYTEST ?= uv run pytest 28 | CTAGS ?= ctags 29 | PIP ?= uv pip 30 | MAKE ?= make 31 | PRECOMMIT ?= uv run pre-commit 32 | RUFF ?= uv run ruff 33 | MYPY ?= uv run mypy 34 | ISORT ?= uv run isort 35 | 36 | DIR := ${CURDIR} 37 | DIST := ${CURDIR}/dist 38 | DOCDIR := ${CURDIR}/docs 39 | INDEX_HTML := file://${DOCDIR}/html/build/index.html 40 | 41 | install-dev: 42 | $(PIP) install -e ".[dev,carl,docs,pufferlib,dacbench]" 43 | 44 | install: 45 | $(PIP) install -e ".[examples]" 46 | 47 | 48 | # pydocstyle does not have easy ignore rules, instead, we include as they are covered 49 | check: 50 | ruff format --check mighty test 51 | ruff check mighty test 52 | 53 | pre-commit: 54 | $(PRECOMMIT) run --all-files 55 | 56 | format: 57 | $(ISORT) isort mighty test 58 | $(RUFF) format --silent mighty test 59 | $(RUFF) check --fix --silent mighty test --exit-zero 60 | $(RUFF) check --fix mighty test --exit-zero 61 | 62 | typing: 63 | $(MYPY) mighty 64 | 65 | test: 66 | $(PYTEST) -v --cov=mighty test --durations=20 --cov-report html 67 | 68 | clean-doc: 69 | rm -rf site 70 | 71 | clean-build: 72 | rm -rf ${DIST} 73 | 74 | # Clean up any builds in ./dist as well as doc 75 | clean: clean-doc clean-build 76 | 77 | # Build a distribution in ./dist 78 | build: 79 | uv build 80 | 81 | docs: 82 | mkdocs serve 83 | 84 | docs-build-only: 85 | mkdocs build --clean --strict 86 | 87 | docs-deploy: 88 | mkdocs gh-deploy --force 89 | 90 | # Publish to testpypi 91 | # Will echo the commands to actually publish to be run to publish to actual PyPi 92 | # This is done to prevent accidental publishing but provide the same conveniences 93 | publish: clean-build build 94 | uv publish --index testpypi 95 | @echo 96 | @echo "Test by installing from testpypi:" 97 | @echo "pip install --index-url https://test.pypi.org/simple/ mighty-rl" 98 | @echo 99 | @echo "Once you have decided it works, publish to actual pypi with" 100 | @echo "uv publish" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Mighty Logo 4 | 5 |

6 | 7 |
8 | 9 | [![PyPI Version](https://img.shields.io/pypi/v/mighty-rl.svg)](https://pypi.org/project/Mighty-RL/) 10 | ![Python](https://img.shields.io/badge/Python-3.10-3776AB) 11 | ![License](https://img.shields.io/badge/License-BSD3-orange) 12 | [![Test](https://github.com/automl/Mighty/actions/workflows/test.yaml/badge.svg)](https://github.com/automl/Mighty/actions/workflows/test.yaml) 13 | [![Doc Status](https://github.com/automl/Mighty/actions/workflows/docs_test.yaml/badge.svg)](https://github.com/automl/Mighty/actions/workflows/docs_test.yaml) 14 | 15 |
16 | 17 |
18 |

19 | Installation | 20 | Documentation | 21 | Run a Mighty Agent | 22 | Cite Us 23 |

24 |
25 | 26 | --- 27 | 28 | # Mighty 29 | 30 | **Warning: Mighty is still in development without an official release! Use at your own peril and check back frequently for updates!** 31 | 32 | Welcome to Mighty, hopefully your future one-stop shop for everything cRL. 33 | Currently Mighty is still in its early stages with support for normal gym envs, DACBench and CARL. 34 | The interface is controlled through hydra and we provide DQN, PPO and SAC algorithms. 35 | We log training and regular evaluations to file and optionally also to tensorboard or wandb. 36 | If you have any questions or feedback, please tell us, ideally via the GitHub issues! 37 | 38 | Mighty features: 39 | - Modular structure for easy (Meta-)RL tinkering 40 | - PPO, SAC and DQN as base algorithms 41 | - Environment integrations via Gymnasium, Pufferlib, CARL & DACBench 42 | - Implementations of some important baselines: MAML, PLR, Cosine LR Schedule and more! 43 | 44 | ## Installation 45 | We recommend to using uv to install and run Mighty in a virtual environment. 46 | The code has been tested with python 3.10. 47 | 48 | First create a clean python environment: 49 | 50 | ```bash 51 | uv venv --python=3.10 52 | source .venv/bin/activate 53 | ``` 54 | 55 | Then install Mighty: 56 | 57 | ```bash 58 | make install 59 | ``` 60 | 61 | Optionally you can install the dev requirements directly: 62 | ```bash 63 | make install-dev 64 | ``` 65 | 66 | Alternatively, you can install Mighty from PyPI: 67 | ```bash 68 | pip install mighty-rl 69 | ``` 70 | 71 | ## Run a Mighty Agent 72 | In order to run a Mighty Agent, use the run_mighty.py script and provide any training options as keywords. 73 | If you want to know more about the configuration options, call: 74 | ```bash 75 | python mighty/run_mighty.py --help 76 | ``` 77 | 78 | An example for running the PPO agent on the Pendulum gym environment looks like this: 79 | ```bash 80 | python mighty/run_mighty.py 'algorithm=ppo' 'environment=gymnasium/pendulum' 81 | ``` 82 | 83 | ## Learning a Configuration Policy via DAC 84 | 85 | In order to use Mighty with DACBench, you need to install DACBench first. 86 | We recommend following the instructions in the [DACBench repo](https://github.com/automl/DACBench). 87 | 88 | Afterwards, select the benchmark you want to run, for example the SigmoidBenchmark, and providing it as the "env" keyword: 89 | ```bash 90 | python mighty/run_mighty.py 'algorithm=dqn' 'env=SigmoidBenchmark' 'env_wrappers=[dacbench.wrappers.MultiDiscreteActionWrapper]' 91 | ``` 92 | 93 | ## Train your Agent on a CARL Environment 94 | Mighty is designed with contextual RL in mind and therefore fully compatible with CARL. 95 | Before you start training, however, please follow the installation instructions in the [CARL repo](https://github.com/automl/CARL). 96 | 97 | Then use the same command as before, but provide the CARL environment, in this example CARLCartPoleEnv, 98 | and information about the context distribution as keywords: 99 | ```bash 100 | python mighty/run_mighty.py 'algorithm=dqn' 'env=CARLCartPoleEnv' '+env_kwargs.num_contexts=10' '+env_kwargs.context_feature_args=[gravity]' 101 | ``` 102 | 103 | ## Optimize Hyperparameters 104 | You can optimize the hyperparameters of your algorithm with the [Hypersweeper](https://github.com/automl/hypersweeper) package, e.g. using [SMAC3](https://github.com/automl/SMAC3). Mighty is directly compatible with Hypersweeper and thus smart and distributed HPO! 
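A minimal sketch of what such a sweep can look like is below. Treat it as an untested illustration: it assumes the example config shipped in `examples/hypersweeper_smac_example_config.yaml` is a complete primary config that selects the Hypersweeper SMAC sweeper and a search space, and the exact keys depend on your Hypersweeper version.

```bash
# Illustrative only: the config name/location are assumptions, adjust them to your setup.
python mighty/run_mighty.py --config-dir examples --config-name hypersweeper_smac_example_config --multirun
```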
105 | 106 | ## Further Examples 107 | We provide further examples, such as how to plot the logged evaluation data, in the [examples](examples) folder. 108 | 109 | ## Cite Us 110 | 111 | If you use Mighty in your work, please cite us: 112 | 113 | ```bibtex 114 | @misc{mohaneimer24, 115 | author = {A. Mohan and T. Eimer and C. Benjamins and F. Hutter and M. Lindauer and A. Biedenkapp}, 116 | title = {Mighty}, 117 | year = {2024}, 118 | url = {https://github.com/automl/mighty} 119 | } 120 | ``` 121 | -------------------------------------------------------------------------------- /docs/api_generator.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages and navigation. 2 | 3 | # https://mkdocstrings.github.io/recipes/ 4 | """ 5 | from __future__ import annotations 6 | 7 | import logging 8 | from pathlib import Path 9 | 10 | import mkdocs_gen_files 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | source_path = "mighty" 15 | 16 | # Modules whose members should not include inherited attributes or methods 17 | # NOTE: Given the current setup, we can only operate at a module level. 18 | # Ideally we specify options (at least at a module level) and we render 19 | # them into strings using a yaml parser. For now this is fine though 20 | NO_INHERITS = ("torch.nn",) 21 | TAB = " " 22 | 23 | for path in sorted(Path(source_path).rglob("*.py")): 24 | module_path = path.relative_to(source_path).with_suffix("") 25 | doc_path = path.relative_to(source_path).with_suffix(".md") 26 | full_doc_path = Path("api/mighty", doc_path) 27 | 28 | parts = tuple(module_path.parts) 29 | 30 | if parts[-1] in ("__main__", "__version__", "__init__"): 31 | continue 32 | 33 | if any(part.startswith("_") for part in parts): 34 | continue 35 | 36 | if any(part.startswith("mighty_utils") for part in parts): 37 | continue 38 | 39 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 40 | if parts[0] != source_path: 41 | parts = (source_path,) + parts 42 | ident = ".".join(parts) 43 | fd.write(f"::: {ident}") 44 | 45 | if ident.endswith(NO_INHERITS): 46 | fd.write(f"\n{TAB}options:") 47 | fd.write(f"\n{TAB}{TAB}inherited_members: false") 48 | 49 | mkdocs_gen_files.set_edit_path(full_doc_path, path) -------------------------------------------------------------------------------- /docs/basic_usage.md: -------------------------------------------------------------------------------- 1 | There are a few different ways you can use Mighty: 2 | 3 | ### Running Meta-Methods 4 | This is the easiest part. We have several algorithms and meta-methods implemented in Mighty and you should be able to run them directly on any environment of your choosing. The most difficult part will likely be the configuration of each method since they might require specific keywords or are only compatible with a given base algorithm. So you will likely want to read up on whatever method you choose. Then you also need to know if your method is of the runner or meta module type. Each have their own configuration keyword. An example for using a specific runner is: 5 | 6 | ```bash 7 | python mighty/run_mighty runner=es popsize=5 iterations=100 es=evosax.CMA_ES search_targets=["learning_rate", "_batch_size"] rl_train_agent=true 8 | ``` 9 | This will use the evosax CMA-ES implementation with population size 5 to optimize the learning rate and batch size in 100 iterations. 
Meta modules, on the other hand, use a different keyword: 10 | ```bash 11 | python mighty/run_mighty.py +algorithm_kwargs.meta_methods=[mighty.mighty_meta.PrioritizedLevelReplay] 12 | ``` 13 | This meta methods list collects all meta modules in the order they should be used. So while you can't use multiple runners, you can use layers of meta modules. 14 | 15 | ### Implementing New Components 16 | Of course Mighty currently only supports a limited amount of methods. This is where you come in! It should be fairly easy for you to add your own. We recommend following these steps: 17 | 18 | 1. What are you adding? A runner, meta module, exploration policy, buffer, update variation or model? Make sure you choose the best level to implement your idea in. 19 | 2. Implement your method using the abstract class and existing methods as templates. 20 | 3. Plug your class into your Mighty config file. This works by replacing the default value with the import path of your custom class. 21 | 4. Run the algorithm. 22 | 23 | Since you are passing the place from which to import your new class, you do not need to work within the Mighty codebase directly, but keep your changes separate. This way you can add several new methods to Mighty without copying the code. 24 | 25 | ### Combining Different Ideas 26 | You can combine different approaches with Mighty by varying the runner, exploration, buffer, update class and network architecture and combining them with an arbitrary number of meta modules. 27 | At this point, configuration might become very difficult. We recommend that you take a close look at how to use different hydra configuration files to separately configure each of your methods so that you can keep track of everything. 28 | Depending on what exactly you want to do, it can make sense to keep separate configuration files for each variation you make. This can be confusing, especially if you haven't worked with hydra before, so we recommed you take the time to focus on configurations when attempting combinations of several methods. -------------------------------------------------------------------------------- /docs/hooks/cleanup_log_output.py: -------------------------------------------------------------------------------- 1 | """The module is a hook which disables warnings and log messages which pollute the 2 | doc build output. 3 | 4 | One possible downside is if one of these modules ends up giving an actual 5 | error, such as OpenML failing to retrieve a dataset. I tried to make sure ERROR 6 | log message are still allowed through. 7 | """ 8 | import logging 9 | import warnings 10 | from typing import Any 11 | 12 | import mkdocs 13 | import mkdocs.plugins 14 | import mkdocs.structure.pages 15 | 16 | log = logging.getLogger("mkdocs") 17 | 18 | 19 | @mkdocs.plugins.event_priority(-50) 20 | def on_startup(**kwargs: Any): 21 | # We can probably safely disregard these 22 | warnings.filterwarnings("ignore", category=DeprecationWarning) 23 | 24 | 25 | def on_pre_page( 26 | page: mkdocs.structure.pages.Page, 27 | config: Any, 28 | files: Any, 29 | ) -> mkdocs.structure.pages.Page | None: 30 | # NOTE: mkdocs says they're always normalized to be '/' seperated 31 | # which means this should work on windows as well. 
32 | 33 | logging.getLogger("mighty").setLevel(logging.ERROR) 34 | return page -------------------------------------------------------------------------------- /docs/hooks/debug_which_page_is_being_rendered.py: -------------------------------------------------------------------------------- 1 | """This module is a hook that when any code is being rendered, it will 2 | print the path to the file being rendered. 3 | 4 | This makes it easier to identify which file is being rendered when an error happens. 5 | """ 6 | from __future__ import annotations 7 | 8 | import logging 9 | import os 10 | from typing import TYPE_CHECKING, Any 11 | 12 | import mkdocs 13 | import mkdocs.plugins 14 | 15 | if TYPE_CHECKING: 16 | import mkdocs.structure.pages 17 | 18 | log = logging.getLogger("mkdocs") 19 | 20 | RENDER_EXAMPLES_ENV_VAR = "MIGHTY_DOC_RENDER_EXAMPLES" 21 | EXEC_DOCS_ENV_VAR = "MIGHTY_EXEC_DOCS" 22 | 23 | truthy_values = {"yes", "on", "true", "1", "all"} 24 | 25 | 26 | def on_pre_page( 27 | page: mkdocs.structure.pages.Page, 28 | config: Any, 29 | files: Any, 30 | ) -> mkdocs.structure.pages.Page | None: 31 | render_examples = os.environ.get(RENDER_EXAMPLES_ENV_VAR, "true") 32 | render_code = os.environ.get(EXEC_DOCS_ENV_VAR, "true") 33 | if render_examples.lower() in truthy_values or render_code.lower() in truthy_values: 34 | log.info(f"{page.file.src_path}") -------------------------------------------------------------------------------- /docs/hooks/disable_markdown_exec.py: -------------------------------------------------------------------------------- 1 | """This disable markdown_exec based on an environment variable. 2 | This speeds up the build of the docs for faster iteration. 3 | 4 | This is done by overwriting the module responsible for compiling and executing the code 5 | by overriding the `exec(...)` global variable that is used to run the code. 6 | We hijack it and print a helpful message about how to run the code cell instead. 7 | 8 | https://github.com/pawamoy/markdown-exec/blob/adff40b2928dbb2d22f27684e085f02d39a07291/src/markdown_exec/formatters/python.py#L42-L70 9 | """ 10 | from __future__ import annotations 11 | 12 | import logging 13 | import os 14 | from typing import Any 15 | 16 | import mkdocs 17 | import mkdocs.plugins 18 | import mkdocs.structure.pages 19 | 20 | RUN_CODE_BLOCKS_ENV_VAR = "MIGHTY_EXEC_DOCS" 21 | 22 | logger = logging.getLogger("mkdocs") 23 | 24 | 25 | def _print_msg(compiled_code: Any, code_block_id: int, exec_globals: dict) -> None: 26 | _print = exec_globals["print"] 27 | _print( 28 | f"Env variable {RUN_CODE_BLOCKS_ENV_VAR}=0 - No code to display." 29 | "\nUse `just docs-code` (or `just docs-full` for examples) to run" 30 | " the code block and display output." 
31 | ) 32 | 33 | truthy_values = {"yes", "on", "true", "1"} 34 | 35 | @mkdocs.plugins.event_priority(100) 36 | def on_startup(**kwargs: Any): 37 | run_code_blocks = os.environ.get(RUN_CODE_BLOCKS_ENV_VAR, "true") 38 | if run_code_blocks.lower() not in truthy_values: 39 | logger.warning( 40 | f"Disabling markdown-exec due to {RUN_CODE_BLOCKS_ENV_VAR}={run_code_blocks}" 41 | "\n.Use `just docs-full` to run and render examples.", 42 | ) 43 | from markdown_exec.formatters import python 44 | 45 | setattr(python, "exec_python", _print_msg) -------------------------------------------------------------------------------- /docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/favicon.ico -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/logo_no_font.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/logo_no_font.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | Mighty is a Reinforcement Learning (RL) library that aims to make training general agents easy. 3 | We natively support context in RL, i.e. train and test distributions that can be easily configured, as well 4 | as Meta- and AutoRL methods on all levels. 5 | That means if you're interested in general RL, you can start with well-known simulation environments and scale up 6 | to actually applications using Mighty! 7 | 8 | ### What Can I Do With Mighty? 9 | Mighty offers a lot of flexibility for training general agents with online RL: 10 | 11 | - train on standard and contextual RL environments 12 | - apply outer-loop methods like Bayesian Optimization or Evolutionary Strategies for Meta-Learning, Hyperparameter Optimization and more 13 | - use in-the-loop ideas like curriculum learning to enhance training 14 | - plug in modules for exploration, buffers or architectures without touching the full pipeline 15 | - combine different methods for Meta- and AutoRL to form full RL pipelines 16 | 17 | We currently do not support other learning paradigms, but might extend to e.g. include offline data as an option. 18 | 19 | ### Where Is Mighty Going? 20 | 21 | Currently Mighty is in early development and includes only standard RL algorithms compatible with cRL benchmarks and 22 | evaluation mechanisms. In the future, we hope to extend mighty with Meta-Learning methods as well as AutoRL, so stay tuned. 23 | 24 | ### Contact & Citation 25 | Mighty is developed at [LUHAI Hannover]() by members of [AutoRL.org](). Your first contact is lead maintainer [Aditya Mohan](). If you found issues or want to contribute new features, it's best to visit our [GitHub page](https://github.com/automl/Mighty) page and start a discussion. 26 | 27 | If you use Mighty for your research, please cite us: 28 | 29 | ```bibtex 30 | @misc{mohaneimer24, 31 | author = {A. Mohan and T. Eimer and C. Benjamins and F. Hutter and M. Lindauer and A. 
Biedenkapp}, 32 | title = {Mighty}, 33 | year = {2024}, 34 | url = {https://github.com/automl/mighty} 35 | } 36 | ``` -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | We recommend using uv to install and run Mighty in a virtual environment. 2 | The code has been tested with python 3.10. 3 | 4 | First create a clean python environment: 5 | 6 | ```bash 7 | uv venv --python=3.10 8 | source .venv/bin/activate 9 | ``` 10 | 11 | Then install Mighty via pip: 12 | 13 | ```bash 14 | uv pip install mighty-rl 15 | ``` 16 | 17 | For a custom setup or if you want to hack Mighty, please see the [GitHub repo](https://github.com/automl/Mighty) instead. -------------------------------------------------------------------------------- /docs/methods/architectures.md: -------------------------------------------------------------------------------- 1 | # Mighty Architectures 2 | Mighty is made for deep RL, meaning we rely on neural networks for function approximation. You'll find them under the 'mighty_models' keyword in the code. This page should give you a rough overview of their intended use and how to handle them for your experiments. 3 | 4 | ## Network Structures in Mighty 5 | Mighty networks are based on Torch. 6 | We implement some basic network architecture building blocks which can then be combined. 7 | You will usually choose a feature extractor architecture and a head architecture, which can be the same or different. 8 | Furthermore, you can combine two different architectures in the feature extractor. 9 | You can choose between: 10 | 11 | - MLP: standard fully connected networks (flexible structure) 12 | - CNN: 1D or 2D convolutional networks (flexible structure) 13 | - ResNet: a 2D convolutional layer with two residual blocks 14 | - TorchHub model (experimental): loading models from TorchHub 15 | 16 | This should cover many standard combinations like a CNN feature extractor with an MLP head. 17 | 18 | ## Implemented Models 19 | The implemented 'mighty_models' define the prediction patterns for different algorithm classes. 20 | The DQN model, for example, is initialized to predict Q-values while the PPO model forwards through the policy head when called. 21 | Both can be based upon the same feature extraction and head structures, of course.
22 | If we look at the DQN model, we can see it primarily combines different elements to achieve this instead of implementing all of them: 23 | 24 | ```python 25 | class DQN(nn.Module): 26 | """DQN network.""" 27 | 28 | def __init__(self, num_actions, obs_size, dueling=False, **kwargs): 29 | """Initialize the network.""" 30 | super().__init__() 31 | head_kwargs = {"hidden_sizes": [32, 32]} 32 | feature_extractor_kwargs = {"obs_shape": obs_size} 33 | if "head_kwargs" in kwargs: 34 | head_kwargs.update(kwargs["head_kwargs"]) 35 | if "feature_extractor_kwargs" in kwargs: 36 | feature_extractor_kwargs.update(kwargs["feature_extractor_kwargs"]) 37 | 38 | # Make feature extractor 39 | self.feature_extractor, self.output_size = make_feature_extractor( 40 | **feature_extractor_kwargs 41 | ) 42 | self.dueling = dueling 43 | self.num_actions = int(num_actions) 44 | self.obs_size = obs_size 45 | self.hidden_sizes = head_kwargs["hidden_sizes"] 46 | 47 | # Make policy head 48 | self.head, self.value, self.advantage = make_q_head( 49 | self.output_size, 50 | self.num_actions, 51 | **head_kwargs, 52 | ) 53 | 54 | def forward(self, x): 55 | """Forward pass.""" 56 | x = self.feature_extractor(x) 57 | x = self.head(x) 58 | advantage = self.advantage(x) 59 | if self.dueling: 60 | value = self.value(x) 61 | x = value + advantage - advantage.mean(dim=1, keepdim=True) 62 | else: 63 | x = advantage 64 | return x 65 | 66 | def reset_head(self, hidden_sizes=None): 67 | """Reset the head of the network.""" 68 | if hidden_sizes is None: 69 | hidden_sizes = self.hidden_sizes 70 | self.head, self.value, self.advantage = make_q_head( 71 | self.output_size, 72 | self.num_actions, 73 | hidden_sizes, 74 | ) 75 | self.hidden_sizes = hidden_sizes 76 | 77 | def shrink_weights(self, shrinkage, noise_weight): 78 | """Shrink weights of the network.""" 79 | params_old = deepcopy(list(self.head.parameters())) 80 | value_params_old = deepcopy(list(self.value.parameters())) 81 | adv_params_old = deepcopy(list(self.advantage.parameters())) 82 | self.reset_head(hidden_sizes=self.hidden_sizes) 83 | for p_old, p_rand in zip(*[params_old, self.head.parameters()], strict=False): 84 | p_rand.data = deepcopy(shrinkage * p_old.data + noise_weight * p_rand.data) 85 | for p_old, p_rand in zip( 86 | *[adv_params_old, self.advantage.parameters()], strict=False 87 | ): 88 | p_rand.data = deepcopy(shrinkage * p_old.data + noise_weight * p_rand.data) 89 | if self.dueling: 90 | for p_old, p_rand in zip( 91 | *[value_params_old, self.value.parameters()], strict=False 92 | ): 93 | p_rand.data = deepcopy( 94 | shrinkage * p_old.data + noise_weight * p_rand.data 95 | ) 96 | 97 | def __getstate__(self): 98 | return ( 99 | self.feature_extractor, 100 | self.head, 101 | self.advantage, 102 | self.value, 103 | self.dueling, 104 | self.num_actions, 105 | ) 106 | 107 | def __setstate__(self, state): 108 | self.feature_extractor = state[0] 109 | self.head = state[1] 110 | self.advantage = state[2] 111 | self.value = state[3] 112 | self.dueling = state[4] 113 | self.num_actions = state[5] 114 | ``` 115 | This allows us to have the actual architectures and network structures in central network classes and keeping the model classes quite short. 116 | As you can see, the DQN class also has additional utility functions like parameter shrinking that can be used in different updates or meta components. 117 | These are fully optional and can be added as you need them for other components. 
118 | Depending on how you structure your model class, you should also revisit the corresponding update to ensure compatibility. 119 | 120 | ## Changing Network Structure 121 | The MLP and CNN networks have a semi-configurable structure. 122 | Via the algorithm_kwargs, you can specify activations as well as number of layers and units for MLPs and number and kind of convolutions, channels, strides and paddings for CNN. 123 | Hidden sizes, number of channels, stride and padding can be configured per layer for more variation. 124 | Activations, on the other hand, are currently set for the full network. 125 | 126 | ## When Should I Implement A New Network Class? 127 | Current network classes cover standard cases with some flexibility for MLPs and CNNs. 128 | The TorchHub option is still being tested and also limited since it's not focused on RL models. 129 | Therefore several relevant options like Transformers still need their own class. 130 | If you want to use a different architecture than listed here, you should simply make a new class for it and enable its creation via 'make_feature_extractor'. -------------------------------------------------------------------------------- /docs/methods/inner_loops.md: -------------------------------------------------------------------------------- 1 | # Mighty Inner Loops 2 | A key motivation for Mighty is to make it easy to create systems that interact with the RL loop. If these systems work during an algorithm's runtime, they are inner loop components. In Mighty, we call them Meta Components. This page documents their structure and why they're so useful. 3 | 4 | ## What Are Meta Components? 5 | Meta components are elements interacting with the main loop at various points. 6 | Within this interaction, they have access to virtually all current internal information and can adapt it. 7 | This means everything from hyperparameter scheduling to learning a separate dynamics models and more is possible within this structure. 8 | Meta components can be stacked on top of one another to combine different approaches in a single run. 9 | This enables complex inner loop setups without entangling methods in code. 10 | 11 | ## The Metrics Dictionary 12 | The most important part when adding components is the 'metrics' dictionary. This is Mighty's central information hub. 13 | Here you can find transitions, losses, predictions, batches and parameters - everything you need to build methods that actively work with the RL loop. 14 | If you want examples of how it is used, you can check out our RND implementation: 15 | ```python 16 | def get_reward(self, metrics): 17 | """Adapt LR on step. 18 | 19 | :param metrics: Dict of current metrics 20 | :return: 21 | """ 22 | if self.rnd_net is None: 23 | self.initialize_networks(metrics["transition"]["next_state"].shape[1:]) 24 | 25 | rnd_error = self.rnd_net.get_error(metrics["transition"]["next_state"]) 26 | metrics["transition"]["intrinsic_reward"] = ( 27 | self.internal_reward_weight * rnd_error 28 | ) 29 | metrics["transition"]["reward"] = ( 30 | metrics["transition"]["reward"] + self.internal_reward_weight * rnd_error 31 | ) 32 | return metrics 33 | ``` 34 | Here we read the next state from the metrics dictionary, predict state novelty from it and update the transition reward. 35 | We also add a new intrinsic reward key to enable logging. 36 | You can assume that most if not all relevant information is contained in the metrics dictionary at any given time. 
37 | It is also transmitted to many different Mighty components like the exploration policy, the buffer, the update function or to any meta-components. 38 | 39 | ## Interactions With The Main Loop 40 | Meta-components are classes with methods that can be called at different points in the learning loop. There are several different call positions and they are specified by the component itself: 41 | ```python 42 | def __init__(self) -> None: 43 | """Meta module init. 44 | 45 | :return: 46 | """ 47 | self.pre_step_methods = [] 48 | self.post_step_methods = [] 49 | self.pre_update_methods = [] 50 | self.post_update_methods = [] 51 | self.pre_episode_methods = [] 52 | self.post_episode_methods = [] 53 | ``` 54 | Each of these calls will receive the metrics dictionary, resulting in a very flexible type. 55 | Right now Mighty contains a few meta-components doing very different things, e.g.: 56 | 57 | - task scheduling/curriculum learning 58 | - hyperparameter scheduling 59 | - intrinsic rewards 60 | 61 | Meta-components are also stackable, i.e. you can run multiple ones per training run. 62 | In principle, you can do almost anything in a meta-component, including training additional networks or calling the policy directly. 63 | Before you default to using this class, however, we recommend double checking if your idea isn't better suited to a more specific class. 64 | 65 | ## Combining Components 66 | When combining different modules, they are stacked on top of one another. 67 | This means they are executed in order for each method. 68 | For meta components interacting with each other or the same parts of the base loop, this order can be important! 69 | If you, for example, use a curriculum based on training reward and intrinsic reward, you should likely configure the curriculum to be called first to avoid basing the difficulty on the reward bonus. -------------------------------------------------------------------------------- /docs/methods/outer_loops.md: -------------------------------------------------------------------------------- 1 | # Mighty Outer Loops 2 | Methods that interact with repeated runs of RL algorithms are our Mighty runners. These function a level above the standard RL training to modify the inner loop. On this page, you'll find information on their structure and what kind of usecases they cover. 3 | 4 | ## Runners 5 | Runners are a wrapper class around the agent and can interact with the full task spectrum, i.e. adapt agent and environment and run this combination for an arbitrary amount of steps. 
6 | The very basic online runner simply executes a task and evaluates the resulting policy: 7 | ```python 8 | class MightyOnlineRunner(MightyRunner): 9 | def run(self) -> Tuple[Dict, Dict]: 10 | train_results = self.train(self.num_steps) 11 | eval_results = self.evaluate() 12 | return train_results, eval_results 13 | ``` 14 | The ES runner, on the other hand, has a considerably longer 'run' function including multiple calls to versions of the agent: 15 | ```python 16 | def run(self) -> Tuple[Dict, Dict]: 17 | es_state = self.es.initialize(self.rng) 18 | for _ in range(self.iterations): 19 | rng_ask, _ = jax.random.split(self.rng, 2) 20 | x, es_state = self.es.ask(rng_ask, es_state) 21 | eval_rewards = [] 22 | for individual in x: 23 | if self.search_params: 24 | self.apply_parameters(individual[: self.total_n_params]) 25 | individual = individual[self.total_n_params :] 26 | for i, target in enumerate(self.search_targets): 27 | if target == "parameters": 28 | continue 29 | new_value = np.asarray(individual[i]).item() 30 | if target in ["_batch_size", "n_units"]: 31 | new_value = max(0, int(new_value)) 32 | setattr(self.agent, target, new_value) 33 | if self.train_agent: 34 | self.train(self.num_steps_per_iteration) 35 | eval_results = self.evaluate() 36 | eval_rewards.append(eval_results["mean_eval_reward"]) 37 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 38 | es_state = self.es.tell(x, fitness, es_state) 39 | eval_results = self.evaluate() 40 | return {"step": self.iterations}, eval_results 41 | ``` 42 | Conceptually, you should think of runners creating new RL tasks, that is combinations of environment and agent, to achieve some goal. 43 | This can be meta-learning, hyperparameter optimization and more. 44 | 45 | ## Information Flow 46 | Runners don't interact with the inner loop directly, but primarily via the agent class interface. 47 | Running and evaluation the agent are the two most important function calls, but runners can also utilize the update and access buffers, environments, parameters and more. 48 | Thus, the information can be performance as well as much of the algorithm state after execution. 49 | Notably, runners can also access meta components, enabling hybrid approaches inner loops that span multiple outer loops. -------------------------------------------------------------------------------- /docs/package_structure.md: -------------------------------------------------------------------------------- 1 | Mighty is desined to be highly modular, enabling access to the RL loop on different levels. This means it's not designed to be the absolute fastest way to run RL, but the most convenient one to apply different sorts of RL, MetaRL and AutoRL methods. As such, there are a few things you should know about the structure of Mighty. 2 | 3 | ### For Multiple Inner Runs: Mighty Runners 4 | Mighty uses runner classes to control the outer training loop. In the simplest case, a runner will just directly call the agent's train and evaluation functions without any changes: 5 | 6 | ```python 7 | def run(self) -> Tuple[Dict, Dict]: 8 | train_results = self.train(self.num_steps) 9 | eval_results = self.evaluate() 10 | return train_results, eval_results 11 | ``` 12 | This will result in a standard RL agent training run. Of course, we can at this point also run agents multiple times, make changes to their setup (hyperparameters, weights, environments) and integrate learning on this meta-level. 
13 | A still fairly simple example is our ESRunner for outer loops with Evolutionary Strategies: 14 | 15 | ```python 16 | def run(self) -> Tuple[Dict, Dict]: 17 | es_state = self.es.initialize(self.rng) 18 | for _ in range(self.iterations): 19 | rng_ask, _ = jax.random.split(self.rng, 2) 20 | x, es_state = self.es.ask(rng_ask, es_state) 21 | eval_rewards = [] 22 | 23 | for individual in x: 24 | if self.search_params: 25 | self.apply_parameters(individual[: self.total_n_params]) 26 | individual = individual[self.total_n_params :] 27 | 28 | for i, target in enumerate(self.search_targets): 29 | if target == "parameters": 30 | continue 31 | new_value = np.asarray(individual[i]).item() 32 | if target in ["_batch_size", "n_units"]: 33 | new_value = max(0, int(new_value)) 34 | setattr(self.agent, target, new_value) 35 | 36 | if self.train_agent: 37 | self.train(self.num_steps_per_iteration) 38 | 39 | eval_results = self.evaluate() 40 | eval_rewards.append(eval_results["mean_eval_reward"]) 41 | 42 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 43 | es_state = self.es.tell(x, fitness, es_state) 44 | 45 | eval_results = self.evaluate() 46 | return {"step": self.iterations}, eval_results 47 | ``` 48 | Here we can change all sorts of things about the agent, train in between or only evaluate and use the ES to get fresh inputs. Runner classes are defined with these multiple evaluations of RL tasks in mind, i.e. these classes will usually train multiple agents, reset their policies completely or otherwise start over at some point. 49 | 50 | ### For In-The-Loop Methods: Mighty Meta Modules 51 | 52 | Not all Meta- or AutoRL methods operate in an outer loop, however. For the ones that configure training while it is still ongoing, we use the Mighty Meta Modules. 53 | These are classes that maintain lists of function calls to make at different points in training: 54 | 55 | ```python 56 | def __init__(self) -> None: 57 | """Meta module init. 58 | 59 | :return: 60 | """ 61 | self.pre_step_methods = [] 62 | self.post_step_methods = [] 63 | self.pre_update_methods = [] 64 | self.post_update_methods = [] 65 | self.pre_episode_methods = [] 66 | self.post_episode_methods = [] 67 | ``` 68 | This gives meta modules a lot of flexibility of when to act upon training. Additionally, each of these function calls is given a "metrics" dictionary. This dictionary contains most, if not all, relevant information about training progress, e.g.: 69 | 70 | - the last transitions 71 | - the last losses, errors and predictions 72 | - policy, Q- and value-networks 73 | - hyperparameters 74 | 75 | This means meta modules can use everything from the current timestep to agent predictions. 76 | 77 | 78 | ### Algorithm Components: Mighty Exploration, Buffers and Updates 79 | 80 | The Mighty algorithms themselves also have modules which can be easily switched. These are exploration policies, buffers and update classes. 81 | Exploration policies and buffers furthermore have access to the same metrics dictionary as meta modules, meaning you can get creative as to what they do with this information. 82 | The way they are used in the RL loop is fixed, however, such that these are a bit more streamlined than the completely free meta-modules. 83 | 84 | 85 | ### Inside the Agent: Mighty Models 86 | 87 | Agent loops outside of exploration, buffers and updates are harder to alter in Mighty, since Mighty is primarily focused on meta-methods. 88 | You can control the network architecture of your agent fairly easily, however. 
89 | There are two principal avenues for this:
90 | 
91 | 1. You can use one of the pre-defined Mighty Models and configure it to use a different network architecture in the config. We use torch internally, which means you can arrange torch.nn layers and activations in different parts of these networks to form a custom architecture.
92 | 2. If you also want to customize what exactly the network predicts or add things like frozen weights, you probably want to implement your own Mighty Model. These always contain a 'feature_extractor' as a base and can vary beyond that.
--------------------------------------------------------------------------------
/docs/usecases/Contextual_RL.md:
--------------------------------------------------------------------------------
1 | ### What Is Contextual RL?
2 | 
3 | Most RL environments are either not concerned with generalization at all or test generalization performance without providing much insight into what agents are tested on,
4 | e.g. by using procedurally generated levels that are hard to understand as a structured training or test distribution.
5 | Contextual RL (or cRL) [[Hallak et al., CoRR 2015](https://arxiv.org/pdf/1502.02259.pdf), [Benjamins et al., CoRR 2022](https://arxiv.org/pdf/2202.04500.pdf)] aims to make the task distributions agents are trained on as specific as possible in order to gain better insights into where agents perform well and what is currently missing in RL generalization.
6 | 
7 | ### Contextual RL With CARL
8 | 
9 | [CARL (context adaptive RL)](https://github.com/automl/CARL) (see [Benjamins et al., EcoRL 2021]() for more information) is a benchmark library specifically designed for contextual RL.
10 | It provides highly configurable contextual extensions to several well-known RL environments and is what we recommend to get started in cRL.
11 | Mighty is designed with contextual RL in mind and is therefore fully compatible with CARL.
12 | 
13 | The training works similarly to a standard RL environment, but now you can specify the training and test distributions, for example 10 variations of gravity for CartPole from CARL's default distribution:
14 | 
15 | ```bash
16 | python mighty/run_mighty.py 'algorithm=dqn' 'env=CARLCartPoleEnv' '+env_kwargs.num_contexts=10' '+env_kwargs.context_feature_args=[gravity]'
17 | ```
18 | 
19 | Other CARL options can be passed similarly via env_kwargs, though we recommend checking out the CARL examples to get a better idea of how to define these distributions.
--------------------------------------------------------------------------------
/docs/usecases/DAC.md:
--------------------------------------------------------------------------------
1 | ### What Is Dynamic Algorithm Configuration?
2 | Dynamic Algorithm Configuration (DAC) [[Biedenkapp et al., ECAI 2020](https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/20-ECAI-DAC.pdf), [Adriaensen et al., JAIR 2022](https://arxiv.org/pdf/2205.13881.pdf)]
3 | is a hyperparameter optimization paradigm aiming to find the best possible hyperparameter configuration for a given *algorithm instance* at every *timestep* during runtime.
4 | DAC can easily be modelled as a contextual MDP and is thus a real-world application of RL.
5 | 
6 | 
7 | ### Dynamic Algorithm Configuration with Mighty
8 | In order to interface with configurable algorithms, we recommend [DACBench](https://github.com/automl/DACBench).
9 | It provides algorithms from different fields as well as artificial benchmarks, all with the OpenAI gym interface.
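As a rough illustration of that gym-style interface, a DACBench benchmark can be instantiated and stepped like any other environment. The snippet below is only a sketch: the benchmark class and `get_environment()` call follow the DACBench documentation, but the exact reset/step signatures depend on the installed DACBench and gym/gymnasium versions, so treat the details as assumptions.

```python
# Hypothetical usage sketch (not taken from the Mighty docs); consult the
# DACBench documentation for the exact API of your installed version.
from dacbench.benchmarks import FunctionApproximationBenchmark

bench = FunctionApproximationBenchmark()
env = bench.get_environment()  # gym-style DAC environment

state, info = env.reset()  # older releases may return only the state
action = env.action_space.sample()  # a hyperparameter choice for this step
state, reward, terminated, truncated, info = env.step(action)  # 4-tuple on older gym APIs
```

Mighty wraps this kind of environment for you when you pass the benchmark name on the command line, as shown below.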
10 | 
11 | Select the benchmark you want to run, for example the FunctionApproximationBenchmark, and provide it as the "env" keyword:
12 | 
13 | ```bash
14 | python mighty/run_mighty.py 'algorithm=dqn' 'env=FunctionApproximationBenchmark'
15 | ```
16 | The naming here will make Mighty autodetect it as a DACBench environment.
17 | 
18 | The benchmarks in DACBench have many configuration options. You can use your hydra configs to include your changes; simply add them to the env_kwargs like this:
19 | 
20 | ```bash
21 | python run_mighty.py 'algorithm=dqn' 'env=FunctionApproximationBenchmark' '+env_kwargs.dimension=3'
22 | ```
23 | 
24 | To see the full options for DACBench environments, refer to the [DACBench documentation](https://automl.github.io/DACBench/main/index.html).
--------------------------------------------------------------------------------
/docs/usecases/Standard_RL.md:
--------------------------------------------------------------------------------
1 | If you want to use Mighty on standard RL environments, you can choose between the [Gymnasium](https://gymnasium.farama.org/) interface and [Pufferlib](https://puffer.ai/) as a fast alternative.
2 | Generally we recommend you use Pufferlib where possible, but the choice is yours!
3 | 
4 | ### Mighty on Gymnasium Environments
5 | 
6 | Mighty can be used as a standard RL library for all environments that follow the Gymnasium interface.
7 | In order to run a Mighty Agent, use the run_mighty.py script and provide any training options as keywords. If you want to know more about the configuration options, call:
8 | 
9 | ```bash
10 | python mighty/run_mighty.py --help
11 | ```
12 | 
13 | An example for running the PPO agent on the Pendulum gym environment for 1000 steps looks like this:
14 | 
15 | ```bash
16 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=Pendulum-v1'
17 | ```
18 | 
19 | We assume that if you don't specify anything beyond the name, you want to use Gymnasium. This will also work for environments that are registered with Gymnasium upon installation, e.g. [Gymnasium Robotics](https://robotics.farama.org/) and others.
20 | You can assume that specifying the environment name like this works just like "gym.make()".
21 | 
22 | ### Mighty on Pufferlib Environments
23 | Pufferlib offers an efficient way to parallelize environment evaluations for a wide selection of tasks. Many well-known Gymnasium environments or benchmarks like [ProcGen]() are included in Pufferlib and we recommend it as a default for these.
24 | Running Pufferlib environments is very similar to running Gymnasium environments; you only need to add the pufferlib domain:
25 | 
26 | ```bash
27 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=pufferlib.environments.procgen.bigfish'
28 | ```
29 | 
30 | We have some example configs where the env domain is pre-configured and you can override the name only.
An example for minigrid would be:
31 | 
32 | ```yaml
33 | env_name: MiniGrid-DoorKey-8x8-v0 # Override with names, e.g. MiniGrid-LavaGapS5-v0, MiniGrid-DoorKey-8x8-v0, MiniGrid-ObstructedMaze-1Dl-v0, MiniGrid-KeyCorridorS3R2-v0, MiniGrid-UnlockPickup-v0
34 | env: pufferlib.environments.minigrid.${env_name}
35 | env_kwargs: {}
36 | ```
37 | 
38 | This means you can use this configuration (let's call it pufferlib_minigrid) with just the env name, similar to above:
39 | ```bash
40 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=pufferlib_minigrid' 'env_name=MiniGrid-LavaGapS5-v0'
41 | ```
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/__init__.py
--------------------------------------------------------------------------------
/examples/custom_exploration_scheduler.py:
--------------------------------------------------------------------------------
1 | """Epsilon Greedy Scheduler."""
2 | 
3 | from __future__ import annotations
4 | from mighty.mighty_meta.mighty_component import MightyMetaComponent
5 | 
6 | 
7 | class EpsilonSchedule(MightyMetaComponent):
8 |     """Linear epsilon decay schedule for epsilon-greedy exploration."""
9 | 
10 |     def __init__(
11 |         self,
12 |         initial_epsilon=1.0,
13 |         num_decay_steps=40000,
14 |         target_epsilon=0.01
15 |     ) -> None:
16 |         """Epsilon schedule initialization.
17 | 
18 |         :param initial_epsilon: Initial maximal epsilon
19 |         :param num_decay_steps: Length of schedule in steps
20 |         :param target_epsilon: Minimal epsilon
21 |         :return:
22 |         """
23 |         super().__init__()
24 |         self.initial_epsilon = initial_epsilon
25 |         self.target_epsilon = target_epsilon
26 |         self.num_decay_steps = num_decay_steps
27 |         self.pre_step_methods = [self.adapt_epsilon]
28 | 
29 |     def adapt_epsilon(self, metrics):
30 |         """Adapt epsilon on step.
31 | 
32 |         :param metrics: Dict of current metrics
33 |         :return:
34 |         """
35 |         current_epsilon = self.initial_epsilon - (
36 |             (self.initial_epsilon - self.target_epsilon)
37 |             * metrics["step"]
38 |             / self.num_decay_steps
39 |         )
40 |         # Clamp at the minimal epsilon once the decay is finished.
41 |         current_epsilon = max(self.target_epsilon, current_epsilon)
42 |         metrics["hp/pi_epsilon"] = current_epsilon
--------------------------------------------------------------------------------
/examples/custom_policy.py:
--------------------------------------------------------------------------------
1 | """UCB exploration for DQN."""
2 | 
3 | from __future__ import annotations
4 | 
5 | import numpy as np
6 | 
7 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy
8 | 
9 | 
10 | class QValueUCB(MightyExplorationPolicy):
11 |     """Exploration via UCB for DQN."""
12 | 
13 |     def __init__(
14 |         self,
15 |         algo,
16 |         model,
17 |         constant=2,
18 |     ):
19 |         """Initialize UCB.
20 | 
21 |         :param algo: algorithm name
22 |         :param model: model providing Q-values
23 |         :param constant: c constant for UCB
24 |         :return:
25 |         """
26 |         super().__init__(algo, model)
27 |         self.c = constant
28 |         self.action_selected_count = np.zeros(model.num_actions)
29 | 
30 |     def explore(self, s, return_logp, metrics):
31 |         """Explore.
32 | 33 | :param s: state 34 | :param return_logp: return logprobs 35 | :param metrics: metrics dictionary 36 | :return: action or (action, logprobs) 37 | """ 38 | # Get Q-values 39 | _, qvals = self.sample_action(s) 40 | # Calculate UCB bonus 41 | ucb_bonus = self.c*np.sqrt(np.log(metrics["step"] + 1)/(self.action_selected_count + 1e-4)) 42 | # Add bonus and selection actions 43 | ucb_actions = np.argmax(qvals.detach().numpy() + ucb_bonus, axis=1) 44 | # Update action counter 45 | if isinstance(ucb_actions, np.ndarray): 46 | for action in ucb_actions: 47 | self.action_selected_count[action] += 1 48 | else: 49 | self.action_selected_count[ucb_actions] += 1 50 | ucb_actions = np.array([ucb_actions]) 51 | return (ucb_actions, qvals) if return_logp else ucb_actions -------------------------------------------------------------------------------- /examples/hypersweeper_smac_example_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - override hydra/job_logging: colorlog 4 | - override hydra/hydra_logging: colorlog 5 | - override hydra/sweeper: HyperSMAC 6 | 7 | runner: standard 8 | debug: false 9 | seed: 0 10 | output_dir: examples/hypersweeper_example_output 11 | wandb_project: null 12 | tensorboard_file: null 13 | experiment_name: optuna_tuning_example 14 | num_steps: 50_000 15 | env: pufferlib.ocean.bandit 16 | env_kwargs: {} 17 | env_wrappers: [] 18 | num_envs: 64 19 | 20 | # @package _global_ 21 | algorithm: PPO 22 | 23 | algorithm_kwargs: 24 | # Hyperparameters 25 | n_policy_units: 128 26 | n_critic_units: 128 27 | soft_update_weight: 0.01 28 | 29 | rollout_buffer_class: 30 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 31 | rollout_buffer_kwargs: 32 | buffer_size: 4096 # Size of the rollout buffer. 33 | gamma: 0.99 # Discount factor for future rewards. 34 | gae_lambda: 0.95 # GAE lambda. 35 | obs_shape: ??? # Placeholder for observation shape 36 | act_dim: ??? # Placeholder for action dimension 37 | n_envs: ??? 38 | 39 | 40 | # Training 41 | learning_rate: 3e-4 42 | batch_size: 1024 # Batch size for training. 43 | gamma: 0.99 # The amount by which to discount future rewards. 44 | n_gradient_steps: 3 # Number of epochs for updating policy. 45 | ppo_clip: 0.2 # Clipping parameter for PPO. 46 | value_loss_coef: 0.5 # Coefficient for value loss. 47 | entropy_coef: 0.01 # Coefficient for entropy loss. 48 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 49 | 50 | 51 | hidden_sizes: [64, 64] 52 | activation: 'tanh' 53 | 54 | n_epochs: 10 55 | minibatch_size: 64 56 | kl_target: 0.01 57 | use_value_clip: True 58 | value_clip_eps: 0.2 59 | 60 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 61 | policy_kwargs: 62 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 63 | 64 | # Training 65 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
66 | n_episodes_eval: 10 67 | checkpoint: null # Path to load model checkpoint 68 | save_model_every_n_steps: 5e5 69 | 70 | hydra: 71 | run: 72 | dir: ${output_dir}/${experiment_name}_${seed} 73 | sweep: 74 | dir: ${output_dir}/${experiment_name}_${seed} 75 | sweeper: 76 | n_trials: 20 77 | budget_variable: num_steps 78 | sweeper_kwargs: 79 | optimizer_kwargs: 80 | smac_facade: 81 | _target_: smac.facade.multi_fidelity_facade.MultiFidelityFacade 82 | _partial_: true 83 | intensifier: 84 | _target_: smac.facade.multi_fidelity_facade.MultiFidelityFacade.get_intensifier 85 | _partial_: true 86 | eta: 3 87 | scenario: 88 | n_trials: ${hydra.sweeper.n_trials} 89 | seed: ${seed} 90 | min_budget: 5000 91 | max_budget: 50000 92 | deterministic: true 93 | n_workers: 1 94 | output_directory: ${hydra.sweep.dir} 95 | search_space: 96 | hyperparameters: 97 | algorithm_kwargs.learning_rate: 98 | type: uniform_float 99 | lower: 1e-5 100 | upper: 1e-3 101 | log: true 102 | algorithm_kwargs.batch_size: 103 | type: uniform_int 104 | lower: 8 105 | upper: 128 -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/0/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 0 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/0/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=0 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.59375 1.275 1.59375 0.49038462 1.59375 0.98076923 3 | 0.91071429 1.5 0.72857143 0.87931034]",25.5,None 4 | 1,20000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.14 1.62857143 0.46530612 0.87692308 2.07272727 1.03636364 5 | 1.52 0.81428571 0.84444444 1.425 ]",22.8,None 6 | 2,30000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.53157895 2.23846154 0.67674419 2.425 0.74615385 1.38571429 7 | 0.66136364 2.07857143 1.81875 0.41571429]",29.1,None 8 | 3,40000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.52380952 2.13333333 0.82051282 2.28571429 0.7804878 0.91428571 9 | 1.23076923 1.23076923 0.41558442 1.23076923]",32.0,None 10 | 4,50000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[0.63815789 0.51595745 2.30952381 0.65540541 1.515625 0.76984127 11 | 1.94 1.03191489 1.515625 2.30952381]",48.5,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_0/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/1/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 1 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/1/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=1 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- 
/examples/multiple_runs/mighty_experiment_1/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.81071429 0.84074074 0.43653846 1.26111111 1.51333333 1.03181818 3 | 1.74615385 2.52222222 1.51333333 0.81071429]",22.7,None 4 | 1,20000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.07272727 2.36 1.38823529 1.12380952 0.944 1.07272727 5 | 0.69411765 1.24210526 0.48163265 1.38823529]",23.6,None 6 | 2,30000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.97777778 0.60338983 1.87368421 1.97777778 1.36923077 0.58360656 7 | 1.1483871 0.77391304 0.91282051 0.91282051]",35.6,None 8 | 3,40000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.45714286 1.85454545 1.02 1.13333333 1.2 1.7 9 | 1.36 0.41632653 1.36 0.61818182]",20.4,None 10 | 4,50000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.34065041 1.82173913 0.87291667 3.49166667 2.61875 1.26969697 11 | 0.91086957 3.22307692 2.46470588 0.47613636]",41.9,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_1/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/2/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 2 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/2/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=2 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.256 0.95151515 2.85454545 0.92352941 1.36521739 1.01290323 3 | 0.45507246 0.95151515 0.73023256 2.61666667]",31.4,None 4 | 1,20000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.41538462 1.53333333 0.47179487 1.02222222 2.04444444 1.53333333 5 | 1.02222222 0.96842105 0.87619048 0.8 ]",18.4,None 6 | 2,30000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.26363636 1.46470588 0.996 1.46470588 0.59285714 2.26363636 7 | 0.46111111 1.77857143 1.38333333 0.6225 ]",24.9,None 8 | 3,40000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.37333333 0.54210526 1.37333333 1.14444444 1.47142857 2.28888889 9 | 0.64375 0.50243902 2.06 1.47142857]",20.6,None 10 | 4,50000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[0.7 1.11363636 0.79032258 0.79032258 1.63333333 2.45 11 | 0.74242424 1.225 0.81666667 1.36111111]",24.5,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_2/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/3/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 3 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/3/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=3 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/eval_results.csv: 
-------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.82666667 2.25454545 1.55 1.12727273 1.77142857 0.992 3 | 1.37777778 0.44285714 0.52765957 2.75555556]",24.8,None 4 | 1,20000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.88421053 0.88421053 1.52727273 0.76363636 1.68 0.88421053 5 | 1.2 0.88421053 0.98823529 0.93333333]",16.8,None 6 | 2,30000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.92105263 1.94444444 1.09375 1.25 0.5 1.25 7 | 0.67307692 1.02941176 1.94444444 1.09375 ]",17.5,None 8 | 3,40000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.1625 2.06666667 1.28275862 1.77142857 0.64137931 0.88571429 9 | 0.6 1.24 0.55522388 2.86153846]",37.2,None 10 | 4,50000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.07058824 1.03529412 0.66415094 0.8 1.46666667 0.95135135 11 | 0.54153846 1.1 2.51428571 1.1 ]",35.2,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_3/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/4/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 4 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/4/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=4 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.01 0.96190476 1.83636364 0.808 0.91818182 0.72142857 3 | 0.87826087 1.44285714 1.18823529 0.96190476]",20.2,None 4 | 1,20000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.98823529 1.46956522 1.20714286 1.98823529 2.25333333 0.6627451 5 | 2.6 0.76818182 2.81666667 0.28644068]",33.8,None 6 | 2,30000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.971875 0.7775 0.45735294 1.29583333 1.82941176 1.94375 7 | 1.29583333 0.60980392 2.39230769 1.19615385]",31.1,None 8 | 3,40000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.05384615 1.02692308 1.57058824 0.98888889 1.90714286 0.98888889 9 | 0.70263158 0.45254237 0.78529412 2.225 ]",26.7,None 10 | 4,50000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[1.14137931 2.06875 0.59107143 1.43913043 0.50923077 0.76976744 11 | 1.43913043 3.31 0.70425532 1.74210526]",33.1,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_4/results.npz -------------------------------------------------------------------------------- /examples/optuna_example_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - override hydra/job_logging: colorlog 4 | - override hydra/hydra_logging: colorlog 5 | - override hydra/sweeper: optuna 6 | - override hydra/sweeper/sampler: tpe 7 | 8 | runner: standard 9 | debug: false 10 | seed: 0 11 | output_dir: examples/optuna_example_output 12 | wandb_project: null 13 | tensorboard_file: null 14 | experiment_name: optuna_tuning_example 15 | num_steps: 50_000 16 | env: pufferlib.ocean.bandit 17 | env_kwargs: {} 18 | env_wrappers: [] 19 | num_envs: 64 20 | 21 | # @package _global_ 22 | algorithm: PPO 23 | 24 | algorithm_kwargs: 25 | # Hyperparameters 26 | n_policy_units: 128 27 | n_critic_units: 128 28 | soft_update_weight: 0.01 29 | 30 | rollout_buffer_class: 31 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 32 | rollout_buffer_kwargs: 33 | buffer_size: 4096 # Size of the rollout buffer. 34 | gamma: 0.99 # Discount factor for future rewards. 35 | gae_lambda: 0.95 # GAE lambda. 36 | obs_shape: ??? # Placeholder for observation shape 37 | act_dim: ??? # Placeholder for action dimension 38 | n_envs: ??? 39 | 40 | 41 | # Training 42 | learning_rate: 3e-4 43 | batch_size: 1024 # Batch size for training. 44 | gamma: 0.99 # The amount by which to discount future rewards. 45 | n_gradient_steps: 3 # Number of epochs for updating policy. 46 | ppo_clip: 0.2 # Clipping parameter for PPO. 47 | value_loss_coef: 0.5 # Coefficient for value loss. 48 | entropy_coef: 0.01 # Coefficient for entropy loss. 49 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 50 | 51 | 52 | hidden_sizes: [64, 64] 53 | activation: 'tanh' 54 | 55 | n_epochs: 10 56 | minibatch_size: 64 57 | kl_target: 0.01 58 | use_value_clip: True 59 | value_clip_eps: 0.2 60 | 61 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 62 | policy_kwargs: 63 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 64 | 65 | # Training 66 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
67 | n_episodes_eval: 10 68 | checkpoint: null # Path to load model checkpoint 69 | save_model_every_n_steps: 5e5 70 | 71 | hydra: 72 | run: 73 | dir: ${output_dir}/${experiment_name}_${seed} 74 | sweep: 75 | dir: ${output_dir}/${experiment_name}_${seed} 76 | sweeper: 77 | sampler: 78 | seed: 123 79 | direction: maximize 80 | study_name: optuna_tuning_example 81 | storage: null 82 | n_trials: 20 83 | n_jobs: 1 84 | params: 85 | algorithm_kwargs.learning_rate: range(0.0001, 0.05) 86 | algorithm_kwargs.batch_size: range(8, 128) -------------------------------------------------------------------------------- /mighty/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | name = "Mighty" 4 | package_name = "mighty" 5 | author = "Todo" 6 | author_email = "Todo" 7 | description = "No description given" 8 | url = "https://www.automl.org" 9 | project_urls = { 10 | "Documentation": "https://automl.github.io/Mighty/main", 11 | "Source Code": "https://github.com/automl/mighty", 12 | } 13 | copyright = f"Copyright {datetime.date.today().strftime('%Y')}, AutoML" 14 | version = "0.0.1" 15 | -------------------------------------------------------------------------------- /mighty/configs/algorithm/atari_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | 9 | replay_buffer_class: 10 | _target_: mighty.mighty_replay.PrioritizedReplay 11 | replay_buffer_kwargs: 12 | capacity: 1000000 # Maximum size of replay buffer. 13 | 14 | gamma: 0.9 # The amount by which to discount future rewards. 15 | 16 | # Training 17 | learning_rate: 0.001 18 | batch_size: 64 # Batch size for training. 19 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 20 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 21 | q_kwargs: 22 | dueling: False 23 | feature_extractor_kwargs: 24 | architecture: [cnn, mlp] 25 | n_convolutions: 3 26 | out_channels: [16, 32, 64] 27 | sizes: [[2, 2], [2,2], [2,2]] 28 | conv_dim: 2 29 | flatten_cnn: True 30 | n_layers: 1 31 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/ddqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Core architecture / model 7 | n_units: 64 8 | q_kwargs: 9 | dueling: True 10 | feature_extractor_kwargs: 11 | architecture: mlp 12 | n_layers: 1 13 | hidden_sizes: [64] 14 | head_kwargs: 15 | hidden_sizes: [64] 16 | 17 | # Exploration (decaying ε‐greedy) 18 | policy_class: 19 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 20 | policy_kwargs: 21 | epsilon_start: 1.0 22 | epsilon_final: 0.05 23 | epsilon_decay_steps: 320000 24 | 25 | # Replay‐buffer settings 26 | replay_buffer_class: 27 | _target_: mighty.mighty_replay.PrioritizedReplay 28 | replay_buffer_kwargs: 29 | capacity: 250000 30 | alpha: 0.6 31 | beta: 0.4 32 | epsilon: 1e-6 33 | device: "cpu" 34 | obs_shape: ??? # ← will be auto-filled at runtime 35 | action_shape: ??? 
# ← will be auto-filled at runtime 36 | 37 | # Training hyperparameters 38 | learning_rate: 3e-4 39 | batch_size: 64 40 | gamma: 0.97 41 | learning_starts: 64000 # wait 1k transitions before training 42 | 43 | # Target‐network / updating (hard update every 1k ∇‐steps) 44 | use_target: True 45 | soft_update_weight: 0.1 46 | target_update_freq: null 47 | 48 | # Double DQN update 49 | td_update_class: mighty.mighty_update.DoubleQLearning 50 | 51 | td_update_kwargs: 52 | gamma: 0.97 53 | optimizer_class: 54 | _target_: torch.optim.Adam 55 | optimizer_kwargs: 56 | lr: 5e-5 57 | weight_decay: 1e-5 58 | eps: 1e-6 59 | max_grad_norm: 10.0 60 | 61 | # Checkpointing 62 | save_replay: False 63 | n_gradient_steps: 1 -------------------------------------------------------------------------------- /mighty/configs/algorithm/dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Core architecture / model 7 | n_units: 256 8 | q_kwargs: 9 | dueling: False 10 | feature_extractor_kwargs: 11 | architecture: mlp 12 | n_layers: 1 13 | hidden_sizes: [256] 14 | head_kwargs: 15 | hidden_sizes: [256] 16 | 17 | # Exploration (decaying ε‐greedy) 18 | policy_class: 19 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 20 | policy_kwargs: 21 | epsilon_start: 1.0 22 | epsilon_final: 0.04 23 | epsilon_decay_steps: 8000 24 | 25 | # Replay‐buffer settings 26 | replay_buffer_class: 27 | _target_: mighty.mighty_replay.MightyReplay 28 | replay_buffer_kwargs: 29 | capacity: 100000 30 | 31 | # Training hyperparameters 32 | learning_rate: 2.3e-3 33 | batch_size: 128 34 | gamma: 0.99 35 | learning_starts: 1000 # wait 1k transitions before training 36 | 37 | # Target‐network / updating (hard update every 1k ∇‐steps) 38 | use_target: True 39 | soft_update_weight: 0.005 40 | target_update_freq: null 41 | 42 | # Double DQN update 43 | td_update_class: mighty.mighty_update.QLearning 44 | 45 | td_update_kwargs: 46 | gamma: 0.99 47 | optimizer_class: 48 | _target_: torch.optim.Adam 49 | optimizer_kwargs: 50 | lr: 2.3e-3 51 | weight_decay: 1e-5 52 | eps: 1e-6 53 | max_grad_norm: 10.0 54 | 55 | # Checkpointing 56 | save_replay: False 57 | n_gradient_steps: 128 -------------------------------------------------------------------------------- /mighty/configs/algorithm/minigrid_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | td_update_class: mighty.mighty_update.DoubleQLearning 9 | 10 | replay_buffer_class: 11 | _target_: mighty.mighty_replay.PrioritizedReplay 12 | replay_buffer_kwargs: 13 | capacity: 1000000 # Maximum size of replay buffer. 14 | 15 | gamma: 0.9 # The amount by which to discount future rewards. 16 | 17 | # Training 18 | learning_rate: 0.001 19 | batch_size: 64 # Batch size for training. 20 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 21 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 
22 | q_kwargs: 23 | dueling: False 24 | feature_extractor_kwargs: 25 | architecture: [cnn, mlp] 26 | n_convolutions: 3 27 | out_channels: [16, 32, 64] 28 | sizes: [[2, 2], [2,2], [2,2]] 29 | conv_dim: 2 30 | flatten_cnn: True 31 | n_layers: 1 32 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/ppo.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: PPO 3 | 4 | algorithm_kwargs: 5 | # Hyperparameters 6 | n_policy_units: 64 7 | n_critic_units: 64 8 | soft_update_weight: 0.01 9 | 10 | rollout_buffer_class: 11 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 12 | rollout_buffer_kwargs: 13 | buffer_size: 256 # Size of the rollout buffer. 14 | gamma: 0.98 # Discount factor for future rewards. 15 | gae_lambda: 0.8 # GAE lambda. 16 | obs_shape: ??? # Placeholder for observation shape 17 | act_dim: ??? # Placeholder for action dimension 18 | n_envs: ??? 19 | discrete_action: ??? # Placeholder for discrete action flag 20 | 21 | 22 | # Training 23 | learning_rate: 3e-4 24 | batch_size: 32 # Batch size for training. 25 | gamma: 0.99 # The amount by which to discount future rewards. 26 | ppo_clip: 0.2 # Clipping parameter for PPO. 27 | value_loss_coef: 0.5 # Coefficient for value loss. 28 | entropy_coef: 0.0 # Coefficient for entropy loss. 29 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 30 | 31 | 32 | hidden_sizes: [64] 33 | activation: 'tanh' 34 | 35 | n_epochs: 20 36 | minibatch_size: 256 37 | kl_target: 0.01 38 | use_value_clip: True 39 | value_clip_eps: 0.2 40 | 41 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 42 | policy_kwargs: 43 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. -------------------------------------------------------------------------------- /mighty/configs/algorithm/ppo_mountaincar.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: PPO 3 | 4 | algorithm_kwargs: 5 | # Hyperparameters 6 | n_policy_units: 64 7 | n_critic_units: 64 8 | soft_update_weight: 0.01 9 | 10 | rollout_buffer_class: 11 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 12 | rollout_buffer_kwargs: 13 | buffer_size: 256 # Size of the rollout buffer. 14 | gamma: 0.99 # Discount factor for future rewards. 15 | gae_lambda: 0.98 # GAE lambda. 16 | obs_shape: ??? # Placeholder for observation shape 17 | act_dim: ??? # Placeholder for action dimension 18 | n_envs: ??? 19 | discrete_action: ??? # Placeholder for discrete action flag 20 | 21 | 22 | # Training 23 | learning_rate: 1e-3 24 | batch_size: 1024 # Batch size for training. 25 | gamma: 0.99 # The amount by which to discount future rewards. 26 | ppo_clip: 0.2 # Clipping parameter for PPO. 27 | value_loss_coef: 0.5 # Coefficient for value loss. 28 | entropy_coef: 0.0 # Coefficient for entropy loss. 29 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 30 | 31 | 32 | hidden_sizes: [64] 33 | activation: 'tanh' 34 | 35 | n_epochs: 4 36 | minibatch_size: 256 37 | kl_target: 0.01 38 | use_value_clip: True 39 | value_clip_eps: 0.2 40 | 41 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 42 | policy_kwargs: 43 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 
44 | 45 | normalize_obs: True -------------------------------------------------------------------------------- /mighty/configs/algorithm/procgen_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | td_update_class: mighty.mighty_update.DoubleQLearning 9 | 10 | replay_buffer_class: 11 | _target_: mighty.mighty_replay.PrioritizedReplay 12 | replay_buffer_kwargs: 13 | capacity: 1000000 # Maximum size of replay buffer. 14 | 15 | gamma: 0.9 # The amount by which to discount future rewards. 16 | 17 | # Training 18 | learning_rate: 0.001 19 | batch_size: 64 # Batch size for training. 20 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 21 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 22 | q_kwargs: 23 | dueling: False 24 | feature_extractor_kwargs: 25 | architecture: resnet 26 | head_kwargs: 27 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/sac.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: SAC 3 | 4 | algorithm_kwargs: 5 | # network sizes (PPO-style) 6 | n_policy_units: 256 # will become hidden_sizes=[8,8] 7 | n_critic_units: 256 # same for both Q-nets 8 | soft_update_weight: 0.01 # maps to tau 9 | 10 | # Replay buffer 11 | replay_buffer_class: 12 | _target_: mighty.mighty_replay.MightyReplay 13 | replay_buffer_kwargs: 14 | capacity: 1e6 15 | 16 | # Scheduling & batch-updates 17 | batch_size: 256 18 | learning_starts: 5000 19 | update_every: 2 20 | n_gradient_steps: 1 21 | 22 | # Learning rates 23 | policy_lr: 3e-4 24 | q_lr: 3e-4 25 | 26 | # SAC hyperparameters 27 | gamma: 0.99 28 | alpha: 0.2 29 | auto_alpha: True 30 | target_entropy: null 31 | alpha_lr: 3e-4 32 | 33 | # Exploration wrapper 34 | policy_class: mighty.mighty_exploration.StochasticPolicy 35 | policy_kwargs: 36 | entropy_coefficient: 0.2 37 | discrete: False 38 | -------------------------------------------------------------------------------- /mighty/configs/algorithm/sac_mujoco.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: SAC 3 | 4 | algorithm_kwargs: 5 | # network sizes (PPO-style) 6 | n_policy_units: 256 # will become hidden_sizes=[8,8] 7 | n_critic_units: 256 # same for both Q-nets 8 | soft_update_weight: 0.005 # maps to tau 9 | 10 | # Replay buffer 11 | replay_buffer_class: 12 | _target_: mighty.mighty_replay.MightyReplay 13 | replay_buffer_kwargs: 14 | capacity: 1e6 15 | 16 | # Scheduling & batch-updates 17 | batch_size: 256 18 | learning_starts: 10000 19 | update_every: 1 20 | n_gradient_steps: 1 21 | 22 | # Learning rates 23 | policy_lr: 3e-4 24 | q_lr: 3e-4 25 | 26 | # SAC hyperparameters 27 | gamma: 0.99 28 | alpha: 0.2 29 | auto_alpha: True 30 | target_entropy: null 31 | alpha_lr: 3e-4 32 | 33 | # Exploration wrapper 34 | policy_class: mighty.mighty_exploration.StochasticPolicy 35 | policy_kwargs: 36 | entropy_coefficient: 0.0 37 | discrete: False 38 | 39 | normalize_obs: True -------------------------------------------------------------------------------- /mighty/configs/base.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: standard 11 | debug: false 12 | seed: 0 13 | output_dir: runs 14 | wandb_project: null 15 | tensorboard_file: null 16 | experiment_name: mighty_experiment 17 | 18 | algorithm_kwargs: {} 19 | 20 | # Training 21 | eval_every_n_steps: 5e3 # After how many steps to evaluate. 22 | n_episodes_eval: 10 23 | checkpoint: null # Path to load model checkpoint 24 | save_model_every_n_steps: 5e5 25 | 26 | hydra: 27 | run: 28 | dir: ${output_dir}/${experiment_name}_${seed} 29 | sweep: 30 | dir: ${output_dir}/${experiment_name}_${seed} 31 | 32 | 33 | -------------------------------------------------------------------------------- /mighty/configs/cluster/local.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # defaults: 3 | # - override /hydra/launcher: joblib 4 | 5 | # hydra: 6 | # launcher: 7 | # n_jobs: 16 8 | 9 | cluster: 10 | _target_: distributed.deploy.local.LocalCluster 11 | n_workers: ${hydra.sweeper.scenario.n_workers} 12 | processes: false 13 | threads_per_worker: 1 -------------------------------------------------------------------------------- /mighty/configs/cluster/luis.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/launcher: submitit_slurm 4 | 5 | cluster: 6 | queue: ai,tnt # partition 7 | 8 | hydra: 9 | launcher: 10 | partition: ai 11 | cpus_per_task: 1 12 | name: expl2 13 | timeout_min: 20 14 | mem_gb: 4 15 | setup: 16 | - module load Miniconda3 17 | - conda activate /bigwork/nhwpbenc/conda/envs/mighty -------------------------------------------------------------------------------- /mighty/configs/cluster/noctua.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/launcher: submitit_slurm 4 | 5 | hydra: 6 | launcher: 7 | partition: normal 8 | cpus_per_task: 1 9 | name: expl2 10 | timeout_min: 20 11 | mem_gb: 4 12 | setup: 13 | - micromamba activate /scratch/hpc-prf-intexml/cbenjamins/envs/mighty 14 | 15 | cluster: 16 | _target_: dask_jobqueue.SLURMCluster 17 | queue: normal # set in cluster config 18 | # account: myaccount 19 | cores: 16 20 | memory: 32 GB 21 | walltime: 01:00:00 22 | processes: 1 23 | log_directory: tmp/mighty_smac 24 | n_workers: 16 25 | death_timeout: 30 26 | -------------------------------------------------------------------------------- /mighty/configs/cluster/tnt.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - override hydra/launcher: submitit_slurm 3 | 4 | cluster: 5 | queue: cpu_short # partition 6 | 7 | hydra: 8 | launcher: 9 | partition: cpu_short # change this to your partition name 10 | #gres: gpu:1 # use this option when running on GPUs 11 | mem_gb: 12 # memory requirements 12 | cpus_per_task: 20 # number of cpus per run 13 | timeout_min: 720 # timeout in minutes 14 | setup: 15 | - export XLA_PYTHON_CLIENT_PREALLOCATE=false 16 | -------------------------------------------------------------------------------- /mighty/configs/cmaes_hpo.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: dqn 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: es 11 | popsize: 5 12 | iterations: 100 13 | es: evosax.CMA_ES 14 | search_targets: ["learning_rate", "_batch_size"] 15 | rl_train_agent: true 16 | num_steps_per_iteration: 1000 17 | 18 | debug: false 19 | seed: 0 20 | output_dir: runs 21 | wandb_project: null 22 | tensorboard_file: null 23 | experiment_name: mighty_experiment 24 | 25 | algorithm_kwargs: {} 26 | 27 | # Training 28 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 29 | n_episodes_eval: 10 30 | checkpoint: null # Path to load model checkpoint 31 | save_model_every_n_steps: 5e5 -------------------------------------------------------------------------------- /mighty/configs/environment/carl_walkers/ant_goals.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: CARLBraxAnt 5 | env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs] 6 | # For CARL, batch size should be one and num_envs should control parallel envs 7 | num_envs: 256 8 | 9 | env_kwargs: 10 | context_sample_seed: 0 11 | evaluation_context_sample_seed: 1 12 | num_contexts: 10 13 | num_evaluation_contexts: 10 14 | context_feature_args: {"target_distance": [normal, 9.8, 1.0, -100.0, 100.0], "target_direction": [categorical, [1, 2, 3, 4]]} 15 | batch_size: 1 16 | -------------------------------------------------------------------------------- /mighty/configs/environment/dacbench/function_approximation.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e5 4 | env: FunctionApproximationBenchmark 5 | env_kwargs: {} 6 | env_wrappers: [mighty.mighty_utils.wrappers.DictToVecActions] 7 | num_envs: 16 -------------------------------------------------------------------------------- /mighty/configs/environment/dacbench/function_approximation_benchmark.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e5 4 | env: FunctionApproximationBenchmark 5 | env_kwargs: {benchmark: true, dimension: 1} 6 | env_wrappers: [] 7 | num_envs: 16 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/atari_pong.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e6 4 | env: ALE/Pong-v5 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/cartpole.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: CartPole-v1 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/mountaincar.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 100_000 4 | env: MountainCar-v0 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 32 
-------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/mountaincarcontinuous.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: MountainCarContinuous-v0 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/pendulum.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: Pendulum-v1 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/procgen_bigfish.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: procgen:bigfish 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_minigrid/minigrid_env.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 2e5 4 | env_name: MiniGrid-DoorKey-8x8-v0 # Override with other names, e.g. MiniGrid-LavaGapS5-v0, MiniGrid-DoorKey-8x8-v0, MiniGrid-ObstructedMaze-1Dl-v0, MiniGrid-KeyCorridorS3R2-v0, MiniGrid-UnlockPickup-v0 5 | env: pufferlib.environments.minigrid.${env_name} 6 | env_kwargs: {} 7 | env_wrappers: [] 8 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/bandit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.bandit 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.memory 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/password.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.password 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/squared.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.squared 5 | env_kwargs: {} 6 | env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/stochastic.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.stochastic 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1
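The env_wrappers entries above are dotted paths to wrapper classes that are applied to the (vectorized) environment. As an illustration only, and not the library's actual FlattenVecObs implementation, such a wrapper could look roughly like this for a gymnasium-style environment:

import gymnasium as gym
import numpy as np

class FlattenObsWrapper(gym.ObservationWrapper):
    """Illustrative sketch: flatten each observation to a 1-D float32 vector."""

    def __init__(self, env):
        super().__init__(env)
        flat_dim = int(np.prod(env.observation_space.shape))
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(flat_dim,), dtype=np.float32
        )

    def observation(self, obs):
        return np.asarray(obs, dtype=np.float32).reshape(-1)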
-------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_procgen/bigfish.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: pufferlib.environments.procgen.bigfish 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 256 -------------------------------------------------------------------------------- /mighty/configs/exploration/epsilon_decay.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | policy_class: mighty.mighty_exploration.DecayingEpsilonGreedy 4 | policy_kwargs: 5 | # start at ε=1.0, linearly decay down to ε=0.01 over 5000 actions 6 | epsilon_start: 1.0 7 | epsilon_final: 0.01 8 | epsilon_decay_steps: 5000 -------------------------------------------------------------------------------- /mighty/configs/exploration/ez_greedy.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | policy_class: mighty.mighty_exploration.EZGreedy -------------------------------------------------------------------------------- /mighty/configs/exploration/noveld.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | meta_methods: 4 | - mighty.mighty_meta.NovelD 5 | meta_kwargs: 6 | - rnd_output_dim: 16 -------------------------------------------------------------------------------- /mighty/configs/exploration/rnd.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | meta_methods: 4 | - mighty.mighty_meta.RND 5 | meta_kwargs: 6 | - rnd_output_dim: 16 7 | rnd_network_config: 8 | - type: Linear 9 | kwargs: 10 | out_features: 64 11 | - type: ReLU 12 | kwargs: {} 13 | - type: Linear 14 | kwargs: 15 | out_features: 16 -------------------------------------------------------------------------------- /mighty/configs/hydra/help/mighty_help.yaml: -------------------------------------------------------------------------------- 1 | # App name, override to match the name your app is known by 2 | app_name: Mighty-DACs 3 | 4 | # Help header, customize to describe your app to your users 5 | header: |- 6 | == ${hydra.help.app_name} == 7 | The Mighty cRL library you've been looking for! 8 | 9 | footer: |- 10 | Powered by Hydra (https://hydra.cc) 11 | Use --hydra-help to view Hydra specific help 12 | 13 | template: |- 14 | ${hydra.help.header} 15 | 16 | == Configuration groups == 17 | Compose your configuration from these config groups (e.g. algorithm=dqn) 18 | 19 | $APP_CONFIG_GROUPS 20 | 21 | == Common Hyperparameters == 22 | * debug: flag to toggle debug output (default: false) 23 | * seed: Which seed to use (default: 0) 24 | * output_dir: Where to store result data (default: /tmp) 25 | hydra specific information will be in "output_dir/year-month-day/timestamp/.hydra" 26 | 27 | * wandb_project: For wandb integration (default: null) 28 | * tensorboard_file: For tensorboard integration (default: null) 29 | * experiment_name: The folder in which the specific experiment data is to be stored. 30 | I.e. the path will be "output_dir/experiment_name" 31 | 32 | * algorithm_kwargs: A dictionary of hyperparameter settings for the algorithm. 33 | Will be overwritten/populated with the choice of algorithm.
34 | * num_steps: Maximum number of environment steps to train for (default: 1000000) 35 | * env: The environment string name to use, e.g., MountainCarContinuous (default: CartPole-v1) 36 | For gym environments please see https://www.gymlibrary.ml/ (simple control environments are by 37 | default supported) 38 | For DACBench environments please see https://github.com/automl/DACBench 39 | For CARL environments please see https://github.com/automl/CARL 40 | * env_kwargs: Dict to modify environment parameters. Note: Currently only supported for CARL envs 41 | * env_wrappers: List of wrapper classes to apply to the environment. (default: []) 42 | 43 | * eval_every_n_steps: Training steps interval after which the agent is evaluated on a separate eval_env, i.e., a 44 | second copy of the training env (default: 1000) 45 | * n_episodes_eval: Number of episodes run per evaluation on a separate eval_env, i.e., a 46 | second copy of the training environment (default: null) 47 | * checkpoint: Path to load a checkpointed model from. This allows continuing training. If unset, a new model is 48 | trained from scratch (default: null) 49 | 50 | == Config == 51 | Any key=value argument can be overridden (use dots for nested overrides), for example: 52 | python mighty/run_mighty.py 'algorithm=ppo' 'env=MountainCarContinuous' 'num_steps=1000' 'algorithm_kwargs.learning_rate=0.1' 53 | or 54 | python mighty/run_mighty.py 'algorithm=dqn' 'env=SigmoidBenchmark' 'num_steps=100000' 55 | 56 | This is the configuration that was generated for this run: 57 | ------- 58 | $CONFIG 59 | ------- 60 | 61 | ${hydra.help.footer} 62 | -------------------------------------------------------------------------------- /mighty/configs/nes.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: dqn 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: es 11 | popsize: 5 12 | iterations: 100 13 | es: evosax.xNES 14 | search_targets: ["parameters"] 15 | rl_train_agent: false 16 | 17 | debug: false 18 | seed: 0 19 | output_dir: runs 20 | wandb_project: null 21 | tensorboard_file: null 22 | experiment_name: mighty_experiment 23 | 24 | algorithm_kwargs: {} 25 | 26 | # Training 27 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 28 | n_episodes_eval: 10 29 | checkpoint: null # Path to load model checkpoint 30 | save_model_every_n_steps: 5e5 -------------------------------------------------------------------------------- /mighty/configs/ppo_smac.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo_mujoco 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperSMAC # use Hypersweeper’s SMAC backend 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_smac 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: ppo_smac 18 | 19 | budget: 200000 # Budget for the hyperparameter search 20 | 21 | algorithm_kwargs: {} 22 | 23 | # Training 24 | eval_every_n_steps: 1e4 # After how many steps to evaluate.
25 | n_episodes_eval: 10 26 | checkpoint: null # Path to load model checkpoint 27 | save_model_every_n_steps: 5e5 28 | 29 | hydra: 30 | sweeper: 31 | n_trials: 10 32 | budget_variable: budget 33 | sweeper_kwargs: 34 | seeds: [0] 35 | optimizer_kwargs: 36 | smac_facade: 37 | _target_: smac.facade.blackbox_facade.BlackBoxFacade 38 | _partial_: true 39 | logging_level: 20 # 10 DEBUG, 20 INFO 40 | scenario: 41 | seed: 42 42 | n_trials: ${hydra.sweeper.n_trials} 43 | deterministic: true 44 | n_workers: 4 45 | output_directory: ${hydra.sweep.dir} 46 | search_space: ${search_space} 47 | run: 48 | dir: ./tmp/branin_smac/ 49 | sweep: 50 | dir: ./tmp/branin_smac/ -------------------------------------------------------------------------------- /mighty/configs/sac_smac.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: sac_mujoco 4 | - environment: gymnasium/pendulum 5 | - search_space: sac_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperSMAC # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_smac 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: ppo_smac 18 | 19 | budget: 200000 # Budget for the hyperparameter search 20 | 21 | algorithm_kwargs: {} 22 | 23 | # Training 24 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 25 | n_episodes_eval: 10 26 | checkpoint: null # Path to load model checkpoint 27 | save_model_every_n_steps: 5e5 28 | 29 | hydra: 30 | sweeper: 31 | n_trials: 10 32 | budget_variable: budget 33 | sweeper_kwargs: 34 | seeds: [0] 35 | optimizer_kwargs: 36 | smac_facade: 37 | _target_: smac.facade.blackbox_facade.BlackBoxFacade 38 | _partial_: true 39 | logging_level: 20 # 10 DEBUG, 20 INFO 40 | scenario: 41 | seed: 42 42 | n_trials: ${hydra.sweeper.n_trials} 43 | deterministic: true 44 | n_workers: 4 45 | output_directory: ${hydra.sweep.dir} 46 | search_space: ${search_space} 47 | run: 48 | dir: ./tmp/branin_smac/ 49 | sweep: 50 | dir: ./tmp/branin_smac/ -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_gym_classic.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | lower: 1e-6 5 | upper: 1e-2 6 | log: true 7 | default_value: 5e-3 8 | algorithm_kwargs.epsilon: 9 | type: uniform_float 10 | lower: 0.01 11 | upper: 0.25 12 | default_value: 0.1 13 | algorithm_kwargs.batch_size: 14 | type: categorical 15 | choices: [32, 64, 128, 256] 16 | default_value: 32 17 | algorithm_kwargs.soft_update_weight: 18 | type: uniform_float 19 | lower: 0.01 20 | upper: 1.0 21 | log: true 22 | default_value: 1. 
23 | algorithm_kwargs.td_update_class: 24 | type: categorical 25 | choices: [mighty.mighty_update.QLearning, mighty.mighty_update.DoubleQLearning] #, coax.td_learning.ClippedDoubleQLearning, coax.td_learning.SoftClippedDoubleQLearning] 26 | default_value: mighty.mighty_update.DoubleQLearning 27 | 28 | -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_rs.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | upper: 0.1 5 | lower: 1.0e-06 6 | default: 0.0003 7 | log: true 8 | algorithm_kwargs.gamma: 9 | type: uniform_float 10 | lower: 0.9 11 | upper: 0.9999 12 | log: false 13 | algorithm_kwargs.batch_size: 14 | type: categorical 15 | choices: [32, 64, 128, 256] -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_template.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.sweeper.search_space 2 | hyperparameters: 3 | algorithm_kwargs.n_units: 4 | type: ordinal 5 | sequence: [4,8,16,32,64,128,256,512] 6 | algorithm_kwargs.soft_update_weight: 7 | type: uniform_float 8 | lower: 0 9 | upper: 1 10 | default_value: 1 11 | 12 | -------------------------------------------------------------------------------- /mighty/configs/search_space/mighty_template.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.sweeper.search_space 2 | 3 | # Possible HP types: 4 | # constant, unparametrized, uniform_float, normal_float, beta_float 5 | # uniform_int, normal_int, beta_int, categorical, ordinal 6 | hyperparameters: 7 | algorithm_kwargs.learning_rate: 8 | type: uniform_float 9 | lower: 1e-6 10 | upper: 1e-2 11 | log: true 12 | default_value: 1e-3 13 | algorithm_kwargs.epsilon: 14 | type: uniform_float 15 | lower: 0 16 | upper: 1 17 | log: false 18 | default_value: 0.1 19 | algorithm_kwargs.batch_size: 20 | type: ordinal 21 | sequence: [4,8,16,32,64,128,256,512,1024] 22 | default: 64 -------------------------------------------------------------------------------- /mighty/configs/search_space/ppo_rs.yaml: -------------------------------------------------------------------------------- 1 | # configs/search_space/ppo_rs.yaml 2 | hyperparameters: 3 | # match the keys under algorithm_kwargs in your PPO config 4 | algorithm_kwargs.learning_rate: 5 | type: uniform_float 6 | lower: 1e-5 7 | upper: 1e-3 8 | log: true 9 | algorithm_kwargs.batch_size: 10 | type: categorical 11 | choices: [8192, 16384, 32768] 12 | algorithm_kwargs.n_gradient_steps: 13 | type: uniform_int 14 | lower: 1 15 | upper: 20 16 | log: false 17 | algorithm_kwargs.gamma: 18 | type: uniform_float 19 | lower: 0.9 20 | upper: 0.9999 21 | log: false 22 | algorithm_kwargs.ppo_clip: 23 | type: uniform_float 24 | lower: 0.1 25 | upper: 0.3 26 | log: false 27 | algorithm_kwargs.value_loss_coef: 28 | type: uniform_float 29 | lower: 0.1 30 | upper: 1.0 31 | log: false 32 | algorithm_kwargs.entropy_coef: 33 | type: uniform_float 34 | lower: 0.0 35 | upper: 0.1 36 | log: false 37 | algorithm_kwargs.max_grad_norm: 38 | type: uniform_float 39 | lower: 0.1 40 | upper: 1.0 41 | log: false 42 | -------------------------------------------------------------------------------- /mighty/configs/search_space/sac_rs.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | 
algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | lower: 0.000001 5 | upper: 0.01 6 | log: true 7 | algorithm_kwargs.batch_size: 8 | type: categorical 9 | choices: [32, 64, 128, 256] -------------------------------------------------------------------------------- /mighty/configs/sweep_ppo_pbt.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperPBT # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_pbt 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: mighty_experiment 18 | 19 | algorithm_kwargs: {} 20 | 21 | # Training 22 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 23 | n_episodes_eval: 10 24 | checkpoint: null # Path to load model checkpoint 25 | save_model_every_n_steps: 5e5 26 | 27 | hydra: 28 | sweeper: 29 | budget: 100000 30 | budget_variable: 100000 31 | loading_variable: load 32 | saving_variable: save 33 | sweeper_kwargs: 34 | optimizer_kwargs: 35 | population_size: 10 36 | config_interval: 1e4 37 | checkpoint_tf: true 38 | load_tf: true 39 | search_space: ${search_space} 40 | run: 41 | dir: ${output_dir}/${experiment_name}_${seed} 42 | sweep: 43 | dir: ${output_dir}/${experiment_name}_${seed} -------------------------------------------------------------------------------- /mighty/configs/sweep_rs.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperRS # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_rs 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: dqn_sweep 18 | 19 | algorithm_kwargs: {} 20 | 21 | # Training 22 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
23 | n_episodes_eval: 10 24 | checkpoint: null # Path to load model checkpoint 25 | save_model_every_n_steps: 5e5 26 | 27 | hydra: 28 | sweeper: 29 | n_trials: 10 30 | sweeper_kwargs: 31 | max_parallelization: 0.8 32 | max_budget: 100000 33 | search_space: ${search_space} 34 | run: 35 | dir: ${output_dir}/${experiment_name}_${seed} 36 | sweep: 37 | dir: ${output_dir}/${experiment_name}_${seed} -------------------------------------------------------------------------------- /mighty/configs/target_function.yaml: -------------------------------------------------------------------------------- 1 | # configs/target_function.yaml 2 | _target_: run_mighty # or fully‑qualified: mighty.run_mighty.run_mighty -------------------------------------------------------------------------------- /mighty/mighty_agents/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/mighty/mighty_agents/.gitkeep -------------------------------------------------------------------------------- /mighty/mighty_agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_agent import MightyAgent 2 | from .dqn import MightyDQNAgent 3 | from .ppo import MightyPPOAgent 4 | from .sac import MightySACAgent 5 | 6 | # FIXME: does it make sense to also split them in on- and off-policy agents? I mean for ifs in the base class? 7 | # Then we wouldn't have to test for PPO, just for on-policy 8 | VALID_AGENT_TYPES = ["DQN", "PPO", "SAC", "DDQN"] 9 | AGENT_CLASSES = { 10 | "DQN": MightyDQNAgent, 11 | "PPO": MightyPPOAgent, 12 | "SAC": MightySACAgent, 13 | "DDQN": MightyDQNAgent, 14 | } 15 | 16 | from .factory import get_agent_class # noqa: E402 17 | 18 | __all__ = [ 19 | "MightyAgent", 20 | "get_agent_class", 21 | "MightyDQNAgent", 22 | "MightyPPOAgent", 23 | "MightySACAgent", 24 | ] 25 | -------------------------------------------------------------------------------- /mighty/mighty_agents/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents based on config.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from mighty.mighty_agents import AGENT_CLASSES, VALID_AGENT_TYPES 8 | 9 | if TYPE_CHECKING: 10 | from mighty.mighty_agents.base_agent import MightyAgent 11 | 12 | 13 | def get_agent_class(agent_type: str) -> MightyAgent: 14 | """Transforms config keyword for agents to class.""" 15 | agent_class = None 16 | if agent_type in VALID_AGENT_TYPES: 17 | agent_class = AGENT_CLASSES[agent_type] 18 | else: 19 | raise ValueError(f"Unknown agent_type {agent_type}.") 20 | 21 | return agent_class # type: ignore 22 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_exploration.decaying_epsilon_greedy import DecayingEpsilonGreedy 2 | from mighty.mighty_exploration.epsilon_greedy import EpsilonGreedy 3 | from mighty.mighty_exploration.ez_greedy import EZGreedy 4 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 5 | from mighty.mighty_exploration.stochastic_policy import StochasticPolicy 6 | 7 | __all__ = [ 8 | "MightyExplorationPolicy", 9 | "EpsilonGreedy", 10 | "EZGreedy", 11 | "StochasticPolicy", 12 | "DecayingEpsilonGreedy", 13 | ] 14 | 
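A short usage sketch (not from the repository) of the agent factory defined above: valid config keywords resolve to agent classes, and anything else raises a ValueError.

from mighty.mighty_agents import get_agent_class

agent_cls = get_agent_class("DQN")  # MightyDQNAgent; "DDQN" maps to the same class
print(agent_cls.__name__)

try:
    get_agent_class("A2C")  # not in VALID_AGENT_TYPES
except ValueError as err:
    print(err)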
-------------------------------------------------------------------------------- /mighty/mighty_exploration/decaying_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | """Decaying Epsilon‐Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_exploration.epsilon_greedy import EpsilonGreedy 8 | 9 | 10 | class DecayingEpsilonGreedy(EpsilonGreedy): 11 | """Epsilon-Greedy Exploration with linear decay schedule.""" 12 | 13 | def __init__( 14 | self, 15 | algo, 16 | model, 17 | epsilon: float | None = None, 18 | epsilon_start: float = 1.0, 19 | epsilon_final: float = 0.01, 20 | epsilon_decay_steps: int = 10000, 21 | ): 22 | """ 23 | :param algo: algorithm name 24 | :param model: policy model (e.g. Q-network) 25 | :param epsilon_start: Initial ε (at time step 0) 26 | :param epsilon_final: Final ε (after decay_steps) 27 | :param epsilon_decay_steps: Number of steps over which to linearly 28 | decay ε from epsilon_start → epsilon_final. 29 | """ 30 | super().__init__(algo=algo, model=model, epsilon=epsilon_start) 31 | self.epsilon_start = epsilon_start 32 | self.epsilon_final = epsilon_final 33 | self.epsilon_decay_steps = epsilon_decay_steps 34 | self.total_steps = 0 35 | 36 | def _compute_epsilon(self) -> float: 37 | """Linearly interpolate between epsilon_start and epsilon_final.""" 38 | if self.total_steps >= self.epsilon_decay_steps: 39 | return self.epsilon_final 40 | fraction = self.total_steps / self.epsilon_decay_steps 41 | return float( 42 | self.epsilon_start + fraction * (self.epsilon_final - self.epsilon_start) 43 | ) 44 | 45 | def get_random_actions(self, n_actions, action_length): 46 | """ 47 | Override to recompute ε at each call, then delegate to EpsilonGreedy's logic. 48 | """ 49 | # 1) Update ε based on total_steps 50 | current_epsilon = self._compute_epsilon() 51 | self.epsilon = current_epsilon 52 | 53 | # 2) Call parent method to build exploration flags & random actions 54 | exploration_flags, random_actions = super().get_random_actions( 55 | n_actions, action_length 56 | ) 57 | 58 | # 3) Advance the step counter (so subsequent calls see a smaller ε) 59 | self.total_steps += n_actions 60 | 61 | return exploration_flags, random_actions 62 | 63 | def explore_func(self, s): 64 | """Same as EpsilonGreedy, except uses decayed ε each time.""" 65 | greedy_actions, qvals = self.sample_action(s) 66 | exploration_flags, random_actions = self.get_random_actions( 67 | len(greedy_actions), len(qvals[0]) 68 | ) 69 | actions = np.where(exploration_flags, random_actions, greedy_actions) 70 | return actions.astype(int), qvals 71 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | """Epsilon Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 8 | 9 | 10 | class EpsilonGreedy(MightyExplorationPolicy): 11 | """Epsilon Greedy Exploration.""" 12 | 13 | def __init__( 14 | self, 15 | algo, 16 | model, 17 | epsilon=0.1, 18 | ): 19 | """Initialize Epsilon Greedy. 
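Epsilon may be given either as a single float or as a sequence of per-environment values; in the latter case, get_random_actions cycles through the sequence when drawing exploration flags.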
20 | 21 | :param algo: algorithm name 22 | :param func: policy function 23 | :param epsilon: exploration epsilon 24 | :param env: environment 25 | :return: 26 | """ 27 | super().__init__(algo, model) 28 | self.epsilon = epsilon 29 | 30 | def get_random_actions(self, n_actions, action_length): 31 | if isinstance(self.epsilon, float): 32 | exploration_flags = [ 33 | self.rng.random() < self.epsilon for _ in range(n_actions) 34 | ] 35 | else: 36 | index = 0 37 | exploration_flags = [] 38 | while len(exploration_flags) < n_actions: 39 | exploration_flags.append(self.rng.random() < self.epsilon[index]) 40 | index += 1 41 | if index >= len(self.epsilon): 42 | index = 0 43 | exploration_flags = np.array(exploration_flags) 44 | random_actions = self.rng.integers(action_length, size=n_actions) 45 | return exploration_flags, random_actions 46 | 47 | def explore_func(self, s): 48 | greedy_actions, qvals = self.sample_action(s) 49 | exploration_flags, random_actions = self.get_random_actions( 50 | len(greedy_actions), len(qvals[0]) 51 | ) 52 | actions = np.where(exploration_flags, random_actions, greedy_actions) 53 | return actions.astype(int), qvals 54 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/ez_greedy.py: -------------------------------------------------------------------------------- 1 | """Epsilon Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | import numpy as np 8 | 9 | from mighty.mighty_exploration import EpsilonGreedy 10 | 11 | if TYPE_CHECKING: 12 | import torch 13 | 14 | 15 | class EZGreedy(EpsilonGreedy): 16 | """Epsilon Greedy Exploration.""" 17 | 18 | def __init__( 19 | self, 20 | algo: str, 21 | model: torch.nn.Module, 22 | epsilon: float = 0.1, 23 | zipf_param: int = 2, 24 | ): 25 | """Initialize EZ Greedy. 
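Compared to plain epsilon-greedy, EZ-greedy performs temporally-extended exploration: once an exploratory action is chosen, it is frozen and repeated for a duration sampled from a Zipf distribution (see explore_func below).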
26 | 27 | :param algo: algorithm name 28 | :param model: model 29 | :param epsilon: exploration epsilon 30 | :param zipf_param: parametrizes the Zipf distribution for skipping 31 | :return: 32 | """ 33 | super().__init__(algo, model) 34 | self.epsilon = epsilon 35 | self.zipf_param = zipf_param 36 | self.skip = max(1, np.random.default_rng().zipf(self.zipf_param)) 37 | self.skipped = None 38 | self.frozen_actions = None 39 | 40 | def explore_func(self, s: torch.Tensor) -> Tuple: 41 | # Epsilon Greedy Step 42 | greedy_actions, qvals = self.sample_action(s) 43 | 44 | # Initialize Skips 45 | if self.skipped is None: 46 | self.skipped = np.zeros(len(greedy_actions)) # type: ignore 47 | self.frozen_actions = np.zeros(greedy_actions.shape) # type: ignore 48 | 49 | # Do epsilon greedy exploration 50 | exploration_flags, random_actions = self.get_random_actions( 51 | len(greedy_actions), len(qvals[0]) 52 | ) 53 | actions = np.where(exploration_flags, random_actions, greedy_actions) 54 | 55 | # Decay Skips 56 | self.skipped = np.maximum(0, self.skipped - 1) # type: ignore 57 | 58 | # Sample skip lengths for new exploration steps 59 | new_skips = np.where( 60 | exploration_flags, 61 | [self.rng.zipf(self.zipf_param) for _ in range(len(exploration_flags))], 62 | [0] * len(exploration_flags), 63 | ) 64 | for i in range(len(self.skipped)): # type: ignore 65 | if self.skipped[i] == 0: # type: ignore 66 | self.frozen_actions[i] = actions[i] # type: ignore 67 | 68 | if exploration_flags[i] and self.skipped[i] == 0: # type: ignore 69 | self.skipped[i] = new_skips[i] # type: ignore 70 | 71 | # Apply skip 72 | skips = [self.skipped[i] > 0 for i in range(len(self.skipped))] # type: ignore 73 | actions = np.where(skips, self.frozen_actions, actions) # type: ignore 74 | return actions.astype(int), qvals 75 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/mighty_exploration_policy.py: -------------------------------------------------------------------------------- 1 | """Mighty Exploration Policy.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import torch 7 | from torch.distributions import Categorical, Normal 8 | 9 | 10 | class MightyExplorationPolicy: 11 | """Generic Exploration Policy Interface. 12 | 13 | Now supports: 14 | - Discrete: `model(state)` → logits → Categorical 15 | - Continuous (squashed-Gaussian): `model(state)` → (action, z, mean, log_std) 16 | - Continuous (legacy): `model(state)` → (mean, std) 17 | """ 18 | 19 | def __init__( 20 | self, 21 | algo, 22 | model, 23 | discrete=False, 24 | ) -> None: 25 | """ 26 | :param algo: Algorithm name (e.g. "ppo", "sac", etc.) 27 | :param model: The policy network (any nn.Module) 28 | :param discrete: True if action-space is discrete 29 | """ 30 | self.rng = np.random.default_rng() 31 | self.algo = algo 32 | self.model = model 33 | self.discrete = discrete 34 | 35 | # Undistorted action sampling 36 | if self.algo == "q": 37 | 38 | def sample_func(state_np): 39 | """ 40 | Q-learning branch: 41 | • state_np: np.ndarray of shape [batch, obs_dim] 42 | • model(state) returns Q-values: tensor [batch, n_actions] 43 | We choose action = argmax(Q), and also return the full Q‐vector. 
44 | """ 45 | state = torch.as_tensor(state_np, dtype=torch.float32) 46 | qs = self.model(state) # [batch, n_actions] 47 | # Choose greedy action 48 | action = torch.argmax(qs, dim=1) # [batch] 49 | return action.detach().cpu().numpy(), qs # action_np, Q‐vector 50 | 51 | self.sample_action = sample_func 52 | 53 | else: 54 | 55 | def sample_func(state_np): 56 | """ 57 | state_np: np.ndarray of shape [batch, obs_dim] 58 | Returns: (action_tensor, log_prob_tensor) 59 | """ 60 | state = torch.as_tensor(state_np, dtype=torch.float32) 61 | 62 | # ─── Discrete action branch ───────────────────────────────────────── 63 | if self.discrete: 64 | logits = self.model(state) # [batch, n_actions] 65 | dist = Categorical(logits=logits) 66 | action = dist.sample() # [batch] 67 | log_prob = dist.log_prob(action) # [batch] 68 | return action.detach().cpu().numpy(), log_prob 69 | 70 | # ─── Continuous squashed‐Gaussian (4‐tuple) ────────────────────────── 71 | out = self.model(state) 72 | if isinstance(out, tuple) and len(out) == 4: 73 | # Unpack exactly (action, z, mean, log_std) 74 | action, z, mean, log_std = out # each [batch, action_dim] 75 | std = torch.exp(log_std) # [batch, action_dim] 76 | dist = Normal(mean, std) 77 | 78 | # 2a) log_pz = ∑ᵢ log N(zᵢ; μᵢ, σᵢ) 79 | log_pz = dist.log_prob(z).sum(dim=-1) # [batch] 80 | 81 | # 2b) tanh‐correction = ∑ᵢ log(1 − tanh(zᵢ)² + ε) 82 | eps = 1e-6 83 | log_correction = torch.log(1.0 - torch.tanh(z).pow(2) + eps).sum( 84 | dim=-1 85 | ) # [batch] 86 | 87 | # 2c) final log_prob of a = tanh(z) 88 | log_prob = log_pz - log_correction # [batch] 89 | return action.detach().cpu().numpy(), log_prob 90 | 91 | # ─── Legacy continuous branch (model returns (mean, std)) ──────────── 92 | if isinstance(out, tuple) and len(out) == 2: 93 | mean, std = out # both [batch, action_dim] 94 | dist = Normal(mean, std) 95 | z = dist.rsample() # [batch, action_dim] 96 | action = torch.tanh(z) # [batch, action_dim] 97 | 98 | # 3a) log_pz = ∑ᵢ log N(zᵢ; μᵢ, σᵢ) 99 | log_pz = dist.log_prob(z).sum(dim=-1) # [batch] 100 | 101 | # 3b) tanh‐correction 102 | eps = 1e-6 103 | log_correction = torch.log(1.0 - action.pow(2) + eps).sum( 104 | dim=-1 105 | ) # [batch] 106 | 107 | log_prob = log_pz - log_correction # [batch] 108 | return action.detach().cpu().numpy(), log_prob 109 | 110 | # ─── Fallback: if model(state) returns a Distribution ──────────────── 111 | if isinstance(out, torch.distributions.Distribution): 112 | dist = out # user returned a Distribution 113 | action = dist.sample() # [batch] 114 | log_prob = dist.log_prob(action) # [batch] 115 | return action.detach().cpu().numpy(), log_prob 116 | 117 | # ─── Otherwise, we don’t know how to sample ───────────────────────── 118 | raise RuntimeError( 119 | "MightyExplorationPolicy: cannot interpret model(state) output of type " 120 | f"{type(out)}" 121 | ) 122 | 123 | self.sample_action = sample_func 124 | 125 | def __call__(self, s, return_logp=False, metrics=None, evaluate=False): 126 | """Get action. 127 | 128 | :param s: state 129 | :param return_logp: return logprobs 130 | :param metrics: current metric dict 131 | :param eval: eval mode 132 | :return: action or (action, logprobs) 133 | """ 134 | if metrics is None: 135 | metrics = {} 136 | if evaluate: 137 | action, logprobs = self.sample_action(s) 138 | output = (action, logprobs) if return_logp else action 139 | else: 140 | output = self.explore(s, return_logp, metrics) 141 | 142 | return output 143 | 144 | def explore(self, s, return_logp, metrics=None): 145 | """Explore. 
146 | 147 | :param s: state 148 | :param return_logp: return logprobs 149 | :param _: not used 150 | :return: action or (action, logprobs) 151 | """ 152 | action, logprobs = self.explore_func(s) 153 | return (action, logprobs) if return_logp else action 154 | 155 | def explore_func(self, s): 156 | """Explore function.""" 157 | raise NotImplementedError 158 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/stochastic_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | from torch.distributions import Categorical, Normal 8 | 9 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 10 | from mighty.mighty_models import SACModel 11 | 12 | 13 | class StochasticPolicy(MightyExplorationPolicy): 14 | """Entropy-Based Exploration for discrete and continuous action spaces.""" 15 | 16 | def __init__( 17 | self, algo, model, entropy_coefficient: float = 0.2, discrete: bool = True 18 | ): 19 | """ 20 | :param algo: the RL algorithm instance 21 | :param model: the policy model 22 | :param entropy_coefficient: weight on entropy term 23 | :param discrete: whether the action space is discrete 24 | """ 25 | super().__init__(algo, model, discrete) 26 | self.entropy_coefficient = entropy_coefficient 27 | self.discrete = discrete 28 | 29 | # --- override sample_action only for continuous SAC --- 30 | if not discrete and isinstance(model, SACModel): 31 | # for evaluation use deterministic=True; training will go through .explore() 32 | def _sac_sample(state_np): 33 | state = torch.as_tensor(state_np, dtype=torch.float32) 34 | # forward returns (action, z, mean, log_std) 35 | action, z, mean, log_std = model(state, deterministic=True) 36 | logp = model.policy_log_prob(z, mean, log_std) 37 | 38 | return action.detach().cpu().numpy(), logp 39 | 40 | self.sample_action = _sac_sample 41 | 42 | def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]: 43 | """ 44 | Given observations `s`, sample an exploratory action and compute a weighted log-prob. 
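The returned log-probability is weighted by the entropy coefficient (or, in the legacy mean/std branch, by the distribution's entropy) before being handed back to the caller.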
45 | 46 | Returns: 47 | action: numpy array of actions 48 | weighted_log_prob: Tensor of shape [batch, 1] 49 | """ 50 | state = torch.as_tensor(s, dtype=torch.float32) 51 | if self.discrete: 52 | logits = self.model(state) 53 | dist = Categorical(logits=logits) 54 | action = dist.sample() 55 | log_prob = dist.log_prob(action).unsqueeze(-1) 56 | return action.detach().cpu().numpy(), log_prob * self.entropy_coefficient 57 | else: 58 | # If model has attribute continuous_action=True, we know: 59 | # model(state) → (action, z, mean, log_std) 60 | if hasattr(self.model, "continuous_action") and getattr( 61 | self.model, "continuous_action" 62 | ): 63 | # 1) Forward pass: get (action, z, mean, log_std) 64 | action, z, mean, log_std = self.model( 65 | state 66 | ) # each: [batch, action_dim] 67 | std = torch.exp(log_std) # [batch, action_dim] 68 | dist = Normal(mean, std) 69 | 70 | # 2) Compute log_prob of "z" under N(mean, std) 71 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) # [batch, 1] 72 | 73 | # 3) Tanh Jacobian‐correction: sum_i log(1 − tanh(z_i)^2 + ε) 74 | eps = 1e-6 75 | log_correction = torch.log(1.0 - torch.tanh(z).pow(2) + eps).sum( 76 | dim=-1, keepdim=True 77 | ) # [batch, 1] 78 | 79 | # 4) Final log_prob of a = tanh(z) 80 | log_prob = log_pz - log_correction # [batch, 1] 81 | 82 | # 5) (Optional) multiply by entropy_coeff to get “weighted log_prob” 83 | weighted_log_prob = log_prob * self.entropy_coefficient 84 | 85 | return action.detach().cpu().numpy(), weighted_log_prob 86 | 87 | # If it’s actually a SACModel, fallback (should only happen in training if model∈SACModel) 88 | elif isinstance(self.model, SACModel): 89 | action, z, mean, log_std = self.model(state, deterministic=False) 90 | std = torch.exp(log_std) 91 | dist = Normal(mean, std) 92 | 93 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 94 | weighted_log_prob = log_pz * self.entropy_coefficient 95 | return action.detach().cpu().numpy(), weighted_log_prob 96 | 97 | # If it’s “mean, std”‐style continuous (rare in our code), handle that case 98 | else: 99 | mean, std = self.model(state) 100 | dist = Normal(mean, std) 101 | z = dist.rsample() # [batch, action_dim] 102 | action = torch.tanh(z) # [batch, action_dim] 103 | 104 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 105 | eps = 1e-6 106 | log_correction = torch.log(1.0 - action.pow(2) + eps).sum( 107 | dim=-1, keepdim=True 108 | ) 109 | log_prob = log_pz - log_correction # [batch, 1] 110 | entropy = dist.entropy().sum(dim=-1, keepdim=True) # [batch, 1] 111 | weighted_log_prob = log_prob * entropy 112 | 113 | return action.detach().cpu().numpy(), weighted_log_prob 114 | 115 | def forward(self, s): 116 | """ 117 | Alias for explore, so policy(s) returns (action, weighted_log_prob). 
118 | """ 119 | return self.explore(s) 120 | -------------------------------------------------------------------------------- /mighty/mighty_meta/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_meta.cosine_lr_schedule import CosineLRSchedule 2 | from mighty.mighty_meta.plr import PrioritizedLevelReplay 3 | from mighty.mighty_meta.rnd import RND, NovelD 4 | from mighty.mighty_meta.space import SPaCE 5 | 6 | __all__ = ["PrioritizedLevelReplay", "SPaCE", "CosineLRSchedule", "RND", "NovelD"] 7 | -------------------------------------------------------------------------------- /mighty/mighty_meta/cosine_lr_schedule.py: -------------------------------------------------------------------------------- 1 | """Cosine LR Schedule with optional warm restarts.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_meta.mighty_component import MightyMetaComponent 8 | 9 | 10 | class CosineLRSchedule(MightyMetaComponent): 11 | """Cosine LR Schedule with optional warm restarts.""" 12 | 13 | def __init__( 14 | self, 15 | initial_lr, 16 | num_decay_steps, 17 | min_lr=0, 18 | restart_every=10000, 19 | restart_multiplier=1.2, 20 | ) -> None: 21 | """Cosine schedule initialization. 22 | 23 | :param initial_lr: Initial maximal LR 24 | :param num_decay_steps: Length of schedule in steps 25 | :param min_lr: Minimal LR 26 | :param restart_every: Restart frequency 27 | :param restart multiplier: Multiplies current learning rate on restart. 28 | :return: 29 | """ 30 | super().__init__() 31 | self.restart_every = restart_every 32 | self.n_restarts = 0 33 | self.t_mult = restart_multiplier 34 | self.eta_max = initial_lr 35 | self.t_max = num_decay_steps 36 | self.eta_min = min_lr 37 | self.pre_step_methods = [self.adapt_lr] 38 | 39 | def adapt_lr(self, metrics): 40 | """Adapt LR on step. 41 | 42 | :param metrics: Dict of current metrics 43 | :return: 44 | """ 45 | reset = False 46 | if self.restart_every > 0: 47 | if self.n_restarts < np.floor(metrics["step"] / self.restart_every): 48 | self.n_restarts += 1 49 | self.eta_max = ( 50 | self.eta_min 51 | + 0.5 52 | * (self.eta_max - self.eta_min) 53 | * (1 + np.cos((metrics["step"] / self.t_max) * np.pi)) 54 | * self.t_mult 55 | ) 56 | metrics["hp/lr"] = self.eta_max 57 | reset = True 58 | 59 | if metrics["step"] < self.t_max and not reset: 60 | metrics["hp/lr"] = self.eta_min + 0.5 * (self.eta_max - self.eta_min) * ( 61 | 1 + np.cos((metrics["step"] / self.t_max) * np.pi) 62 | ) 63 | -------------------------------------------------------------------------------- /mighty/mighty_meta/mighty_component.py: -------------------------------------------------------------------------------- 1 | """Template for meta-learning components.""" 2 | 3 | from __future__ import annotations 4 | 5 | 6 | class MightyMetaComponent: 7 | """Component for registering meta-control methods.""" 8 | 9 | def __init__(self) -> None: 10 | """Meta module init. 11 | 12 | :return: 13 | """ 14 | self.pre_step_methods = [] 15 | self.post_step_methods = [] 16 | self.pre_update_methods = [] 17 | self.post_update_methods = [] 18 | self.pre_episode_methods = [] 19 | self.post_episode_methods = [] 20 | 21 | def pre_step(self, metrics): 22 | """Execute methods before a step. 23 | 24 | :param metrics: Current metrics dict 25 | :return: 26 | """ 27 | for m in self.pre_step_methods: 28 | m(metrics) 29 | 30 | def post_step(self, metrics): 31 | """Execute methods after a step. 
32 | 33 | :param metrics: Current metrics dict 34 | :return: 35 | """ 36 | for m in self.post_step_methods: 37 | m(metrics) 38 | 39 | def pre_update(self, metrics): 40 | """Execute methods before the update. 41 | 42 | :param metrics: Current metrics dict 43 | :return: 44 | """ 45 | for m in self.pre_update_methods: 46 | m(metrics) 47 | 48 | def post_update(self, metrics): 49 | """Execute methods after the update. 50 | 51 | :param metrics: Current metrics dict 52 | :return: 53 | """ 54 | for m in self.post_update_methods: 55 | m(metrics) 56 | 57 | def pre_episode(self, metrics): 58 | """Execute methods before an episode. 59 | 60 | :param metrics: Current metrics dict 61 | :return: 62 | """ 63 | for m in self.pre_episode_methods: 64 | m(metrics) 65 | 66 | def post_episode(self, metrics): 67 | """Execute methods at the end of an episode. 68 | 69 | :param metrics: Current metrics dict 70 | :return: 71 | """ 72 | for m in self.post_episode_methods: 73 | m(metrics) 74 | -------------------------------------------------------------------------------- /mighty/mighty_meta/space.py: -------------------------------------------------------------------------------- 1 | """Curriculum Learning via Self-Paced Context Evaluation.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from mighty.mighty_meta.mighty_component import MightyMetaComponent 9 | 10 | 11 | class SPaCE(MightyMetaComponent): 12 | """Curriculum Learning via Self-Paced Context Evaluation.""" 13 | 14 | def __init__(self, criterion="relative_improvement", threshold=0.1, k=1) -> None: 15 | """SPaCE initialization. 16 | 17 | :param criterion: Ranking criterion 18 | :param threshold: Minimum average change needed to keep train set size 19 | :param k: Size of instance set increase 20 | :return: 21 | """ 22 | super().__init__() 23 | self.criterion = criterion 24 | self.threshold = threshold 25 | self.instance_set = [] 26 | self.increase_by_k_instances = k 27 | self.current_instance_set_size = k 28 | self.last_evals = None 29 | self.all_instances = None 30 | self.pre_episode_methods = [self.get_instances] 31 | 32 | def get_instances(self, metrics): 33 | """Get Training set on episode start. 
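The training instance set is ranked by (relative) value improvement and grown by k instances whenever the mean rollout value changes by no more than `threshold` relative to the previous evaluation.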
34 | 35 | :param metrics: Current metrics dict 36 | :return: 37 | """ 38 | env = metrics["env"] 39 | vf = metrics["vf"] 40 | rollout_values = None 41 | if "rollout_values" in metrics: 42 | rollout_values = metrics["rollout_values"] 43 | 44 | if self.all_instances is None: 45 | self.all_instances = np.array(env.instance_id_list.copy()) 46 | 47 | if self.last_evals is None and rollout_values is None: 48 | self.instance_set = np.random.default_rng().choice( 49 | self.all_instances, size=self.current_instance_set_size 50 | ) 51 | elif self.last_evals is None: 52 | self.instance_set = np.random.default_rng().choice( 53 | self.all_instances, size=self.current_instance_set_size 54 | ) 55 | self.last_evals = np.nanmean(rollout_values) 56 | else: 57 | if ( 58 | abs(np.mean(rollout_values) - self.last_evals) 59 | / (self.last_evals + 1e-6) 60 | <= self.threshold 61 | ): 62 | self.current_instance_set_size = min( 63 | self.current_instance_set_size + self.increase_by_k_instances, 64 | len(self.all_instances), 65 | ) 66 | self.last_evals = np.nanmean(rollout_values) 67 | evals = self.get_evals(env, vf) 68 | if self.criterion == "improvement": 69 | improvement = evals - self.last_evals 70 | elif self.criterion == "relative_improvement": 71 | improvement = (evals - self.last_evals) / self.last_evals 72 | else: 73 | raise NotImplementedError("This SpaCE criterion is not implemented.") 74 | self.instance_set = self.all_instances[np.argsort(improvement)[::-1]][ 75 | : self.current_instance_set_size 76 | ] 77 | env.set_instance_set(self.instance_set) 78 | 79 | def get_evals(self, env, vf): 80 | """Get values for s_0 of all instances. 81 | 82 | :param env: environment 83 | :param vf: value or q function 84 | :return: 85 | """ 86 | values = [] 87 | for i in self.all_instances: 88 | state, _ = env.reset() 89 | env.set_inst_id(i) 90 | v = vf(torch.tensor(state)).squeeze().detach().numpy() 91 | # If we're dealing with a q function, we transform to value here 92 | if isinstance(v[0], np.ndarray): 93 | v = v.sum(axis=1) 94 | values.append(v[0]) 95 | return values 96 | -------------------------------------------------------------------------------- /mighty/mighty_models/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_models.dqn import DQN 2 | from mighty.mighty_models.ppo import PPOModel 3 | from mighty.mighty_models.sac import SACModel 4 | 5 | __all__ = ["DQN", "SACModel", "PPOModel"] 6 | -------------------------------------------------------------------------------- /mighty/mighty_models/ppo.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from mighty.mighty_models.networks import make_feature_extractor 8 | 9 | 10 | class PPOModel(nn.Module): 11 | """PPO Model with policy and value networks.""" 12 | 13 | def __init__( 14 | self, 15 | obs_shape: int, 16 | action_size: int, 17 | hidden_sizes: list[int] = [64, 64], 18 | activation: str = "tanh", 19 | continuous_action: bool = False, 20 | log_std_min: float = -20.0, 21 | log_std_max: float = 2.0, 22 | ): 23 | """Initialize the PPO model.""" 24 | super().__init__() 25 | 26 | self.obs_size = int(obs_shape) 27 | self.action_size = int(action_size) 28 | self.hidden_sizes = hidden_sizes 29 | self.activation = activation 30 | self.continuous_action = continuous_action 31 | self.log_std_min = log_std_min 32 | self.log_std_max = log_std_max 33 | 34 | # Make feature 
extractor 35 | self.feature_extractor_policy, feat_dim = make_feature_extractor( 36 | architecture="mlp", 37 | obs_shape=obs_shape, 38 | n_layers=len(hidden_sizes), 39 | hidden_sizes=hidden_sizes, 40 | activation=activation, 41 | ) 42 | 43 | self.feature_extractor_value, _ = make_feature_extractor( 44 | architecture="mlp", 45 | obs_shape=obs_shape, 46 | n_layers=len(hidden_sizes), 47 | hidden_sizes=hidden_sizes, 48 | activation=activation, 49 | ) 50 | 51 | if self.continuous_action: 52 | # Output size must be 2 * action_size (mean + log_std) 53 | final_out_dim = action_size * 2 54 | else: 55 | # For discrete actions, output logits of size = action_size 56 | final_out_dim = action_size 57 | 58 | # (Architecture based on 59 | # https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py) 60 | 61 | # Policy network 62 | self.policy_head = nn.Sequential( 63 | self.feature_extractor_policy, # [batch, feat_dim] 64 | nn.Linear(feat_dim, hidden_sizes[0]), # [batch, hidden_sizes[0]] 65 | nn.LayerNorm(hidden_sizes[0]), # (optional normalization) 66 | getattr(nn, activation.capitalize())(), # e.g. tanh or ReLU 67 | nn.Linear(hidden_sizes[0], final_out_dim), # [batch, final_out_dim] 68 | ) 69 | 70 | # Value network 71 | self.value_head = nn.Sequential( 72 | self.feature_extractor_value, # [batch, feat_dim] 73 | nn.Linear(feat_dim, hidden_sizes[0]), # [batch, hidden_sizes[0]] 74 | nn.LayerNorm(hidden_sizes[0]), 75 | getattr(nn, activation.capitalize())(), 76 | nn.Linear(hidden_sizes[0], 1), # [batch, 1] 77 | ) 78 | 79 | # Orthogonal initialization 80 | def _init_weights(m: nn.Module): 81 | if isinstance(m, nn.Linear): 82 | out_dim = m.out_features 83 | if self.continuous_action and out_dim == final_out_dim: 84 | # This is the final policy‐output layer (mean & log_std): 85 | gain = 0.01 86 | elif (not self.continuous_action) and out_dim == action_size: 87 | # Final policy‐output layer (discrete‐logits): 88 | gain = 0.01 89 | elif out_dim == 1: 90 | # Final value‐output layer: 91 | gain = 1.0 92 | else: 93 | # Any intermediate hidden layer: 94 | gain = math.sqrt(2) 95 | nn.init.orthogonal_(m.weight, gain) 96 | nn.init.constant_(m.bias, 0.0) 97 | 98 | self.apply(_init_weights) 99 | 100 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 101 | """Forward pass through the policy network.""" 102 | 103 | if self.continuous_action: 104 | raw = self.policy_head(x) # [batch, 2 * action_size] 105 | mean, log_std = raw.chunk(2, dim=-1) # each [batch, action_size] 106 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 107 | std = torch.exp(log_std) # [batch, action_size] 108 | 109 | # Sample a raw Gaussian z; during inference/training this is 'reparameterized' 110 | # (If you need a deterministic‐eval mode, you can add a flag argument here.) 
111 | eps = torch.randn_like(mean) 112 | z = mean + std * eps # [batch, action_size] 113 | action = torch.tanh(z) # squash to [−1, +1] 114 | 115 | return action, z, mean, log_std 116 | 117 | else: 118 | logits = self.policy_head(x) # [batch, action_size] 119 | return logits 120 | 121 | def forward_value(self, x: torch.Tensor) -> torch.Tensor: 122 | """Forward pass through the value network.""" 123 | return self.value_head(x) 124 | -------------------------------------------------------------------------------- /mighty/mighty_models/sac.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from mighty.mighty_models.networks import make_feature_extractor 7 | 8 | 9 | class SACModel(nn.Module): 10 | """SAC Model with squashed Gaussian policy and twin Q-networks.""" 11 | 12 | def __init__( 13 | self, 14 | obs_size: int, 15 | action_size: int, 16 | hidden_sizes: list[int] = [256, 256], 17 | activation: str = "relu", 18 | log_std_min: float = -20, 19 | log_std_max: float = 2, 20 | ): 21 | super().__init__() 22 | self.obs_size = obs_size 23 | self.action_size = action_size 24 | self.log_std_min = log_std_min 25 | self.log_std_max = log_std_max 26 | self.hidden_sizes = hidden_sizes 27 | self.activation = activation 28 | 29 | # Shared feature extractor for policy and Q-networks 30 | extractor, out_dim = make_feature_extractor( 31 | architecture="mlp", 32 | obs_shape=obs_size, 33 | n_layers=len(hidden_sizes), 34 | hidden_sizes=hidden_sizes, 35 | activation=activation, 36 | ) 37 | 38 | # Policy network outputs mean and log_std 39 | self.policy_net = nn.Sequential( 40 | extractor, 41 | nn.Linear(out_dim, action_size * 2), 42 | ) 43 | 44 | # Twin Q-networks 45 | # — live Q-nets — 46 | self.q_net1 = self._make_q_net() 47 | self.q_net2 = self._make_q_net() 48 | 49 | self.target_q_net1 = self._make_q_net() 50 | self.target_q_net1.load_state_dict(self.q_net1.state_dict()) 51 | self.target_q_net2 = self._make_q_net() 52 | self.target_q_net2.load_state_dict(self.q_net2.state_dict()) 53 | for p in self.target_q_net1.parameters(): 54 | p.requires_grad = False 55 | for p in self.target_q_net2.parameters(): 56 | p.requires_grad = False 57 | 58 | def _make_q_net(self) -> nn.Sequential: 59 | q_in = self.obs_size + self.action_size 60 | q_extractor, _ = make_feature_extractor( 61 | architecture="mlp", 62 | obs_shape=q_in, 63 | n_layers=len(self.hidden_sizes), 64 | hidden_sizes=self.hidden_sizes, 65 | activation=self.activation, 66 | ) 67 | return nn.Sequential(q_extractor, nn.Linear(self.hidden_sizes[-1], 1)) 68 | 69 | def forward( 70 | self, state: torch.Tensor, deterministic: bool = False 71 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 72 | """ 73 | Forward pass for policy sampling. 74 | 75 | Returns: 76 | action: torch.Tensor in [-1,1] 77 | z: raw Gaussian sample before tanh 78 | mean: Gaussian mean 79 | log_std: Gaussian log std 80 | """ 81 | x = self.policy_net(state) 82 | mean, log_std = x.chunk(2, dim=-1) 83 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 84 | std = torch.exp(log_std) 85 | 86 | if deterministic: 87 | z = mean 88 | else: 89 | z = mean + std * torch.randn_like(mean) 90 | action = torch.tanh(z) 91 | return action, z, mean, log_std 92 | 93 | def policy_log_prob( 94 | self, z: torch.Tensor, mean: torch.Tensor, log_std: torch.Tensor 95 | ) -> torch.Tensor: 96 | """ 97 | Compute log-prob of action a = tanh(z), correcting for tanh transform. 
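Concretely: log p(a) = Σ_i [ log N(z_i; μ_i, σ_i) − log(1 − tanh(z_i)² + ε) ].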
98 | """ 99 | std = torch.exp(log_std) 100 | dist = torch.distributions.Normal(mean, std) 101 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 102 | eps = 1e-6 # small constant to avoid numerical issues 103 | log_correction = (torch.log(1 - torch.tanh(z).pow(2) + eps)).sum( 104 | dim=-1, keepdim=True 105 | ) 106 | log_pa = log_pz - log_correction 107 | return log_pa 108 | 109 | def forward_q1(self, state_action: torch.Tensor) -> torch.Tensor: 110 | return self.q_net1(state_action) 111 | 112 | def forward_q2(self, state_action: torch.Tensor) -> torch.Tensor: 113 | return self.q_net2(state_action) 114 | -------------------------------------------------------------------------------- /mighty/mighty_replay/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_replay.buffer import MightyBuffer 2 | from mighty.mighty_replay.mighty_prioritized_replay import PrioritizedReplay 3 | from mighty.mighty_replay.mighty_replay_buffer import MightyReplay, TransitionBatch 4 | from mighty.mighty_replay.mighty_rollout_buffer import MightyRolloutBuffer, RolloutBatch 5 | 6 | __all__ = [ 7 | "MightyReplay", 8 | "PrioritizedReplay", 9 | "TransitionBatch", 10 | "MightyRolloutBuffer", 11 | "MightyBuffer", 12 | "RolloutBatch", 13 | ] 14 | -------------------------------------------------------------------------------- /mighty/mighty_replay/buffer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class MightyBuffer(ABC): 5 | @abstractmethod 6 | def add(self, *args, **kwargs): 7 | pass 8 | 9 | @abstractmethod 10 | def sample(self, batch_size): 11 | pass 12 | 13 | @abstractmethod 14 | def reset(self): 15 | pass 16 | 17 | @abstractmethod 18 | def __len__(self): 19 | pass 20 | 21 | @abstractmethod 22 | def __bool__(self): 23 | pass 24 | -------------------------------------------------------------------------------- /mighty/mighty_replay/mighty_replay_buffer.py: -------------------------------------------------------------------------------- 1 | """Mighty replay buffer.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable 6 | 7 | import dill as pickle 8 | import numpy as np 9 | import torch 10 | 11 | from mighty.mighty_replay.buffer import MightyBuffer 12 | 13 | 14 | def flatten_infos(xs): 15 | """Transform info dict to flat list. 
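For example, {"a": 1, "b": [2, 3]} is flattened to the values 1, 2, 3.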
16 | 17 | :param xs: info dict 18 | :return: flattened infos 19 | """ 20 | if isinstance(xs, dict): 21 | xs = list(xs.values()) 22 | for x in xs: 23 | if isinstance(x, Iterable) and not isinstance(x, str | bytes): 24 | yield from flatten_infos(x) 25 | else: 26 | yield x 27 | 28 | 29 | class TransitionBatch: 30 | """Transition batch.""" 31 | 32 | def __init__( 33 | self, 34 | observations, 35 | actions, 36 | rewards, 37 | next_observations, 38 | dones, 39 | device: torch.device | str = "cpu", 40 | ) -> None: 41 | """Initialize TransitionBatch.""" 42 | if isinstance(rewards, float | int): 43 | observations = np.array([observations], dtype=np.float32) 44 | actions = np.array([actions], dtype=np.float32) 45 | rewards = np.array([rewards], dtype=np.float32) 46 | next_observations = np.array([next_observations], dtype=np.float32) 47 | dones = np.array([dones], dtype=np.float32) 48 | if isinstance(rewards, np.ndarray): 49 | self.observations = torch.from_numpy(observations.astype(np.float32)).to( 50 | device 51 | ) 52 | self.actions = torch.from_numpy(actions.astype(np.float32)).to(device) 53 | self.rewards = torch.from_numpy(rewards.astype(np.float32)).to(device) 54 | self.next_obs = torch.from_numpy(next_observations.astype(np.float32)).to( 55 | device 56 | ) 57 | self.dones = torch.from_numpy(dones.astype(np.int64)).to(device) 58 | else: 59 | self.observations = observations.to(device) 60 | self.actions = actions.to(device) 61 | self.rewards = rewards.to(device) 62 | self.next_obs = next_observations.to(device) 63 | self.dones = dones.to(device) 64 | 65 | @property 66 | def size(self): 67 | """Current buffer size.""" 68 | return len(self.observations) 69 | 70 | def __len__(self): 71 | return self.size 72 | 73 | def __iter__(self): 74 | yield from zip( 75 | self.observations, 76 | self.actions, 77 | self.rewards, 78 | self.next_obs, 79 | self.dones, 80 | strict=False, 81 | ) 82 | 83 | 84 | class MightyReplay(MightyBuffer): 85 | """Simple replay buffer.""" 86 | 87 | def __init__( 88 | self, 89 | capacity, 90 | keep_infos=False, 91 | flatten_infos=False, 92 | device: torch.device | str = "cpu", 93 | ): 94 | """Initialize Buffer. 95 | 96 | :param capacity: Buffer size 97 | :param random_seed: Seed for sampling 98 | :param keep_infos: Keep the extra info dict. Required for some algorithms. 99 | :param flatten_infos: Make flat list from infos. 100 | Might be necessary, depending on info content. 101 | :return: 102 | """ 103 | self.capacity = capacity 104 | self.keep_infos = keep_infos 105 | self.flatten_infos = flatten_infos 106 | self.device = torch.device(device) 107 | self.rng = np.random.default_rng() 108 | self.reset() 109 | 110 | @property 111 | def full(self): 112 | """Check if the buffer is full.""" 113 | return self.index + 1 >= self.capacity 114 | 115 | def add(self, transition_batch, _): 116 | """Add transition(s). 
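Batches are appended in insertion order; once the capacity is exceeded, the oldest transitions are dropped (FIFO).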
117 | 118 | :param transition_batch: Transition(s) to add 119 | :param metrics: Current metrics dict 120 | :return: 121 | """ 122 | if not self.keep_infos: 123 | transition_batch.extra_info = [] 124 | elif self.flatten_infos: 125 | transition_batch.extra_info = [ 126 | list(flatten_infos(transition_batch.extra_info)) 127 | ] 128 | 129 | self.index += transition_batch.size 130 | if len(self.obs) == 0: 131 | self.obs = transition_batch.observations 132 | self.next_obs = transition_batch.next_obs 133 | self.actions = transition_batch.actions 134 | self.rewards = transition_batch.rewards 135 | self.dones = transition_batch.dones 136 | else: 137 | self.obs = torch.cat((self.obs, transition_batch.observations)) 138 | self.next_obs = torch.cat((self.next_obs, transition_batch.next_obs)) 139 | self.actions = torch.cat((self.actions, transition_batch.actions)) 140 | self.rewards = torch.cat((self.rewards, transition_batch.rewards)) 141 | self.dones = torch.cat((self.dones, transition_batch.dones)) 142 | if len(self) > self.capacity: 143 | self.obs = self.obs[len(self) - self.capacity :] 144 | self.next_obs = self.next_obs[len(self) - self.capacity :] 145 | self.actions = self.actions[len(self) - self.capacity :] 146 | self.rewards = self.rewards[len(self) - self.capacity :] 147 | self.dones = self.dones[len(self) - self.capacity :] 148 | self.index = self.capacity 149 | 150 | def sample(self, batch_size=32): 151 | """Sample transitions.""" 152 | batch_indices = self.rng.choice(np.arange(len(self)), size=batch_size) 153 | return TransitionBatch( 154 | self.obs[batch_indices], 155 | self.actions[batch_indices], 156 | self.rewards[batch_indices], 157 | self.next_obs[batch_indices], 158 | self.dones[batch_indices], 159 | device=self.device, 160 | ) 161 | 162 | def reset(self): 163 | """Reset the buffer.""" 164 | self.obs = [] 165 | self.next_obs = [] 166 | self.actions = [] 167 | self.rewards = [] 168 | self.dones = [] 169 | self.index = 0 170 | 171 | def __len__(self): 172 | return len(self.obs) 173 | 174 | def __bool__(self): 175 | return bool(len(self)) 176 | 177 | def save(self, filename="buffer.pkl"): 178 | """Save the buffer to a file.""" 179 | with open(filename, "wb") as f: 180 | pickle.dump(self, f) 181 | -------------------------------------------------------------------------------- /mighty/mighty_runners/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from .mighty_maml_runner import MightyMAMLRunner, MightyTRPOMAMLRunner 4 | from .mighty_online_runner import MightyOnlineRunner 5 | from .mighty_runner import MightyRunner 6 | 7 | VALID_RUNNER_TYPES = ["standard", "default", "online"] 8 | RUNNER_CLASSES: Dict[str, type[MightyRunner]] = { 9 | "standard": MightyOnlineRunner, 10 | "default": MightyOnlineRunner, 11 | "online": MightyOnlineRunner, 12 | } 13 | 14 | try: 15 | import evosax # noqa: F401 16 | 17 | found = True 18 | except ImportError: 19 | print("evosax not found, to use ES runners please install mighty[es].") 20 | found = False 21 | 22 | if found: 23 | from .mighty_es_runner import MightyESRunner 24 | 25 | VALID_RUNNER_TYPES.append("es") 26 | RUNNER_CLASSES["es"] = MightyESRunner 27 | 28 | 29 | from .factory import get_runner_class # noqa: E402 30 | 31 | __all__ = [ 32 | "MightyRunner", 33 | "MightyOnlineRunner", 34 | "MightyMAMLRunner", 35 | "MightyTRPOMAMLRunner", 36 | "MightyESRunner", 37 | "get_runner_class", 38 | ] 39 | -------------------------------------------------------------------------------- 
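# Usage sketch (not part of the package): how TransitionBatch and MightyReplay from
# mighty/mighty_replay/mighty_replay_buffer.py above fit together. The transition data
# below is made up for illustration; only the interfaces come from the code above.
import numpy as np

from mighty.mighty_replay import MightyReplay, TransitionBatch

buffer = MightyReplay(capacity=1000)

# A batch of 8 transitions with 3-dimensional observations, passed as numpy arrays.
batch = TransitionBatch(
    observations=np.random.randn(8, 3).astype(np.float32),
    actions=np.random.randint(0, 4, size=8).astype(np.float32),
    rewards=np.zeros(8, dtype=np.float32),
    next_observations=np.random.randn(8, 3).astype(np.float32),
    dones=np.zeros(8, dtype=np.float32),
)
buffer.add(batch, {})  # the second argument (a metrics dict) is ignored by MightyReplay

# Sampling returns another TransitionBatch holding torch tensors on the buffer's device.
sample = buffer.sample(batch_size=4)
print(len(buffer), sample.observations.shape)  # 8, torch.Size([4, 3])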
/mighty/mighty_runners/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating runners based on config.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from mighty.mighty_runners import RUNNER_CLASSES, VALID_RUNNER_TYPES 8 | 9 | if TYPE_CHECKING: 10 | from mighty.mighty_runners.mighty_runner import MightyRunner 11 | 12 | 13 | def get_runner_class(agent_type: str) -> type[MightyRunner]: 14 | """Transforms config keyword for agents to class.""" 15 | agent_class = None 16 | if agent_type in VALID_RUNNER_TYPES: 17 | agent_class = RUNNER_CLASSES[agent_type] 18 | else: 19 | raise ValueError(f"Unknown agent_type {agent_type}.") 20 | 21 | return agent_class 22 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_es_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.util as iutil 4 | from typing import TYPE_CHECKING, Dict, Tuple 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from mighty.mighty_agents.base_agent import retrieve_class 10 | from mighty.mighty_runners.mighty_runner import MightyRunner 11 | 12 | spec = iutil.find_spec("evosax") 13 | found = spec is not None 14 | if found: 15 | import jax 16 | from evosax import FitnessShaper, xNES # type: ignore 17 | from jax import numpy as jnp 18 | else: 19 | import warnings 20 | 21 | warnings.warn("evosax not found, to use NES runners please install mighty[es].") 22 | 23 | if TYPE_CHECKING: 24 | from omegaconf import DictConfig 25 | 26 | 27 | class MightyESRunner(MightyRunner): 28 | def __init__(self, cfg: DictConfig) -> None: 29 | super().__init__(cfg) 30 | self.search_targets = cfg.search_targets 31 | num_dims = len(self.search_targets) 32 | self.search_params = False 33 | if "parameters" in self.search_targets: 34 | self.search_params = True 35 | self.total_n_params = sum([len(p.flatten()) for p in self.agent.parameters]) 36 | num_dims -= 1 37 | num_dims += self.total_n_params 38 | 39 | es_cls = retrieve_class(cfg.es, default_cls=xNES) 40 | es_kwargs = {} 41 | if "es_kwargs" in cfg.keys(): 42 | es_kwargs = cfg.es_kwargs 43 | 44 | self.es = es_cls(popsize=cfg.popsize, num_dims=num_dims, **es_kwargs) 45 | self.rng = jax.random.PRNGKey(0) 46 | self.fit_shaper = FitnessShaper(centered_rank=True, w_decay=0.0, maximize=True) 47 | self.iterations = cfg.iterations 48 | self.train_agent = cfg.rl_train_agent 49 | if self.train_agent: 50 | self.num_steps_per_iteration = cfg.num_steps_per_iteration 51 | 52 | def apply_parameters(self, individual) -> None: # type: ignore 53 | # 1. Make tensor from x 54 | individual = np.asarray(individual) 55 | individual = torch.tensor(individual, dtype=torch.float32) 56 | 57 | # 2. Shape it to match the model's parameters 58 | param_shapes = [p.shape for p in self.agent.parameters] 59 | reshaped_individual = [] 60 | for shape in param_shapes: 61 | new_individual = individual[: shape.numel()] 62 | new_individual = new_individual.reshape(shape) 63 | reshaped_individual.append(new_individual) 64 | individual = individual[shape.numel() :] 65 | # 3. 
Set the model's parameters to the shaped tensor 66 | for p, x_ in zip(self.agent.parameters, reshaped_individual): 67 | p.data = x_ 68 | 69 | def run(self) -> Tuple[Dict, Dict]: 70 | es_state = self.es.initialize(self.rng) 71 | for _ in range(self.iterations): 72 | rng_ask, _ = jax.random.split(self.rng, 2) 73 | x, es_state = self.es.ask(rng_ask, es_state) 74 | eval_rewards = [] 75 | for individual in x: 76 | if self.search_params: 77 | self.apply_parameters(individual[: self.total_n_params]) 78 | individual = individual[self.total_n_params :] 79 | for i, target in enumerate(self.search_targets): 80 | if target == "parameters": 81 | continue 82 | new_value = np.asarray(individual[i]).item() 83 | if target in ["_batch_size", "n_units"]: 84 | new_value = max(0, int(new_value)) 85 | setattr(self.agent, target, new_value) 86 | if self.train_agent: 87 | self.train(self.num_steps_per_iteration) 88 | eval_results = self.evaluate() 89 | eval_rewards.append(eval_results["mean_eval_reward"]) 90 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 91 | es_state = self.es.tell(x, fitness, es_state) 92 | eval_results = self.evaluate() 93 | return {"step": self.iterations}, eval_results 94 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_online_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, Tuple 4 | 5 | from mighty.mighty_runners.mighty_runner import MightyRunner 6 | 7 | 8 | class MightyOnlineRunner(MightyRunner): 9 | def run(self) -> Tuple[Dict, Dict]: 10 | train_results = self.train(self.num_steps) 11 | eval_results = self.evaluate() 12 | return train_results, eval_results 13 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import warnings 5 | from abc import ABC 6 | from pathlib import Path 7 | from typing import TYPE_CHECKING, Any, Dict, Tuple 8 | 9 | from hydra.utils import get_class 10 | 11 | from mighty.mighty_agents.factory import get_agent_class 12 | from mighty.mighty_utils.envs import make_mighty_env 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | if TYPE_CHECKING: 17 | from omegaconf import DictConfig 18 | 19 | 20 | class MightyRunner(ABC): 21 | def __init__(self, cfg: DictConfig) -> None: 22 | """Parse config and run Mighty agent.""" 23 | output_dir = Path(cfg.output_dir) / f"{cfg.experiment_name}_{cfg.seed}" 24 | if not output_dir.exists(): 25 | output_dir.mkdir(parents=True) 26 | 27 | # Check whether env is from DACBench, CARL or gym 28 | # Make train and eval env 29 | env, base_eval_env, eval_default = make_mighty_env(cfg) 30 | 31 | wrapper_classes = [] 32 | for w in cfg.env_wrappers: 33 | wkwargs = cfg.wrapper_kwargs if "wrapper_kwargs" in cfg else {} 34 | cls = get_class(w) 35 | env = cls(env, **wkwargs) 36 | wrapper_classes.append((cls, wkwargs)) 37 | 38 | def wrap_eval(): # type: ignore 39 | wrapped_env = base_eval_env() 40 | for cls, wkwargs in wrapper_classes: 41 | wrapped_env = cls(wrapped_env, **wkwargs) 42 | return wrapped_env 43 | 44 | eval_env = wrap_eval() 45 | 46 | # Setup agent 47 | agent_class = get_agent_class(cfg.algorithm) 48 | args_agent = dict(cfg.algorithm_kwargs) 49 | self.agent = agent_class( # type: ignore 50 | env=env, 51 | eval_env=eval_env, 52 | 
output_dir=output_dir, 53 | seed=cfg.seed, 54 | **args_agent, 55 | ) 56 | 57 | self.eval_every_n_steps = cfg.eval_every_n_steps 58 | self.num_steps = cfg.num_steps 59 | 60 | # Load checkpoint if one is given 61 | if cfg.checkpoint is not None: 62 | self.agent.load(cfg.checkpoint) 63 | logging.info("#" * 80) 64 | logging.info(f"Loading checkpoint at {cfg.checkpoint}") 65 | 66 | # Train 67 | logging.info("#" * 80) 68 | logging.info(f'Using agent type "{self.agent}" to learn') 69 | logging.info("#" * 80) 70 | 71 | def train(self, num_steps: int, env=None) -> Any: # type: ignore 72 | return self.agent.run( 73 | n_steps=num_steps, env=env, eval_every_n_steps=self.eval_every_n_steps 74 | ) 75 | 76 | def evaluate(self, eval_env=None) -> Any: # type: ignore 77 | return self.agent.evaluate(eval_env) 78 | 79 | def run(self) -> Tuple[Dict, Dict]: 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /mighty/mighty_update/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_update.ppo_update import PPOUpdate 2 | from mighty.mighty_update.q_learning import ( 3 | ClippedDoubleQLearning, 4 | DoubleQLearning, 5 | QLearning, 6 | SPRQLearning, 7 | ) 8 | from mighty.mighty_update.sac_update import SACUpdate 9 | 10 | __all__ = [ 11 | "QLearning", 12 | "DoubleQLearning", 13 | "ClippedDoubleQLearning", 14 | "SPRQLearning", 15 | "SACUpdate", 16 | "PPOUpdate", 17 | ] 18 | -------------------------------------------------------------------------------- /mighty/mighty_utils/__init__ .py: -------------------------------------------------------------------------------- 1 | from types import MIGHTYENV, TypeKwargs, retrieve_class 2 | 3 | from envs import make_mighty_env 4 | from update_utils import polyak_update 5 | 6 | __all__ = [ 7 | "MIGHTYENV", 8 | "make_mighty_env", 9 | "TypeKwargs", 10 | "retrieve_class", 11 | "polyak_update", 12 | ] 13 | -------------------------------------------------------------------------------- /mighty/mighty_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .envs import make_mighty_env 2 | from .migthy_types import MIGHTYENV, TypeKwargs, retrieve_class 3 | from .update_utils import polyak_update 4 | 5 | __all__ = [ 6 | "MIGHTYENV", 7 | "make_mighty_env", 8 | "TypeKwargs", 9 | "retrieve_class", 10 | "polyak_update", 11 | ] 12 | -------------------------------------------------------------------------------- /mighty/mighty_utils/migthy_types.py: -------------------------------------------------------------------------------- 1 | """Type helpers for the mighty package.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | from typing import Any, NewType 7 | 8 | import hydra 9 | from omegaconf import DictConfig 10 | 11 | TypeKwargs = NewType("TypeKwargs", dict[str, Any] | DictConfig) 12 | 13 | MIGHTYENV = None 14 | 15 | 16 | dacbench = importlib.util.find_spec("dacbench") 17 | dacbench_found = dacbench is not None 18 | if dacbench_found: 19 | import dacbench 20 | 21 | MIGHTYENV = dacbench.AbstractEnv 22 | DACENV = dacbench.AbstractEnv 23 | else: 24 | DACENV = int 25 | 26 | carl = importlib.util.find_spec("carl") 27 | carl_found = carl is not None 28 | if carl_found: 29 | from carl.envs.carl_env import CARLEnv 30 | 31 | if MIGHTYENV is None: 32 | MIGHTYENV = CARLEnv 33 | CARLENV = CARLEnv 34 | else: 35 | CARLENV = int 36 | 37 | if not carl_found and not dacbench_found: 38 | import gymnasium 
as gym 39 | 40 | MIGHTYENV = gym.Env 41 | 42 | 43 | def retrieve_class(cls: str | DictConfig | type, default_cls: type) -> type: 44 | """Get mighty class.""" 45 | if cls is None: 46 | cls = default_cls 47 | elif isinstance(cls, DictConfig): 48 | cls = hydra.utils.get_class(cls._target_) 49 | elif isinstance(cls, str): 50 | cls = hydra.utils.get_class(cls) 51 | return cls 52 | -------------------------------------------------------------------------------- /mighty/mighty_utils/test_helpers.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import gymnasium as gym 4 | import numpy as np 5 | import torch 6 | 7 | 8 | class DummyEnv(gym.Env): 9 | def __init__(self): 10 | self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) 11 | self.action_space = gym.spaces.Discrete(4) 12 | self.inst_id = None 13 | self.instance_set = [42] 14 | 15 | @property 16 | def instance_id_list(self): 17 | return [self.inst_id] 18 | 19 | def set_inst_id(self, inst_id): 20 | self.inst_id = inst_id 21 | 22 | def set_instance_set(self, instance_set): 23 | self.instance_set = instance_set 24 | 25 | def reset(self, options={}, seed=None): 26 | if self.inst_id is None: 27 | self.inst_id = np.random.default_rng().integers(0, 100) 28 | return self.observation_space.sample(), {} 29 | 30 | def step(self, action): 31 | tr = np.random.default_rng().choice([0, 1], p=[0.9, 0.1]) 32 | return self.observation_space.sample(), 0, False, tr, {} 33 | 34 | 35 | class DummyModel: 36 | def __init__(self, action=1): 37 | self.action = action 38 | 39 | def __call__(self, s): 40 | fake_qs = np.zeros((len(s), 5)) 41 | fake_qs[:, self.action] = 1 42 | return torch.tensor(fake_qs) 43 | 44 | 45 | def clean(path): 46 | shutil.rmtree(path, ignore_errors=False, onerror=None) 47 | -------------------------------------------------------------------------------- /mighty/mighty_utils/update_utils.py: -------------------------------------------------------------------------------- 1 | def polyak_update(source_params, target_params, tau: float): 2 | """Polyak averaging for target network updates.""" 3 | for source, target in zip(source_params, target_params): 4 | target.data.copy_(tau * source.data + (1 - tau) * target.data) 5 | -------------------------------------------------------------------------------- /mighty/run_mighty.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import hydra 4 | import numpy as np 5 | from omegaconf import DictConfig 6 | 7 | from mighty.mighty_runners.factory import get_runner_class 8 | 9 | 10 | @hydra.main("./configs", "base", version_base=None) 11 | def run_mighty(cfg: DictConfig) -> None: 12 | # Make runner 13 | runner_cls = get_runner_class(cfg.runner) 14 | runner = runner_cls(cfg) 15 | 16 | # Execute run 17 | start = time.time() 18 | train_result, eval_result = runner.run() 19 | end = time.time() 20 | 21 | # Print stats 22 | print("Training finished!") 23 | print( 24 | f"Reached a reward of {np.round(eval_result['mean_eval_reward'], decimals=2)} in {train_result['step']} steps and {np.round(end - start, decimals=2)}s." 
25 | ) 26 | return eval_result["mean_eval_reward"] 27 | 28 | 29 | if __name__ == "__main__": 30 | run_mighty() 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "Mighty-RL" 7 | version = "0.0.1" 8 | description = "A modular, meta-learning-ready RL library." 9 | authors = [{ name = "AutoRL@LUHAI", email = "a.mohan@ai.uni-hannover.de" }] 10 | readme = "README.md" 11 | requires-python = ">=3.10,<3.12" 12 | license = { file = "LICENSE" } 13 | keywords = [ 14 | "Reinforcement Learning", 15 | "MetaRL", 16 | "Generalization in RL" 17 | ] 18 | classifiers = [ 19 | "Intended Audience :: Developers", 20 | "Programming Language :: Python :: 3 :: Only", 21 | "Development Status :: 3 - Alpha", 22 | "Topic :: Utilities", 23 | "Topic :: Scientific/Engineering", 24 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 25 | "License :: OSI Approved :: BSD License" 26 | ] 27 | 28 | dependencies = [ 29 | "numpy~=1.21", 30 | "gymnasium", 31 | "matplotlib~=3.4", 32 | "seaborn~=0.11", 33 | "tensorboard", 34 | "hydra-core~=1.2", 35 | "hydra-colorlog~=1.2", 36 | "hydra-submitit-launcher~=1.2", 37 | "pandas", 38 | "scipy", 39 | "rich~=12.4", 40 | "wandb~=0.12", 41 | "torch", 42 | "dill", 43 | "imageio", 44 | "evosax==0.1.6", 45 | "rliable", 46 | "seaborn", 47 | "uniplot" 48 | ] 49 | 50 | [project.optional-dependencies] 51 | dev = ["ruff", "mypy", "build", "pytest", "pytest-cov"] 52 | carl = ["carl_bench==1.1.0", "brax==0.9.3", "protobuf>=3.17.3", "jax==0.4.18", "jaxlib~=0.4.18"] 53 | dacbench = ["dacbench>=0.3.0", "torchvision", "ioh"] 54 | pufferlib = ["pufferlib==2.0.6"] 55 | docs = ["mkdocs", "mkdocs-material", "mkdocs-autorefs", 56 | "mkdocs-gen-files", "mkdocs-literate-nav", 57 | "mkdocs-glightbox", "mkdocs-glossary-plugin", 58 | "mkdocstrings[python]", "markdown-exec[ansi]", "mike"] 59 | examples = [] 60 | 61 | [tool.setuptools.packages.find] 62 | include = ["mighty*", "examples"] 63 | 64 | [tool.ruff] 65 | extend-exclude = [] 66 | 67 | [tool.ruff.lint] 68 | ignore = [ 69 | # Conflicts with the formatter 70 | "COM812", "ISC001" 71 | ] 72 | 73 | [tool.mypy] 74 | python_version = "3.10" 75 | disallow_untyped_defs = true 76 | show_error_codes = true 77 | no_implicit_optional = true 78 | warn_return_any = true 79 | warn_unused_ignores = true 80 | exclude = ["scripts", "docs", "test"] 81 | 82 | [[tool.uv.index]] 83 | name = "testpypi" 84 | url = "https://test.pypi.org/simple/" 85 | publish-url = "https://test.pypi.org/legacy/" 86 | explicit = true -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/test/__init__.py -------------------------------------------------------------------------------- /test/agents/test_agent_factory.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from mighty.mighty_agents import AGENT_CLASSES, VALID_AGENT_TYPES 6 | from mighty.mighty_agents.factory import get_agent_class 7 | 8 | 9 | class TestFactory: 10 | def test_create_agent(self): 11 | for agent_type in VALID_AGENT_TYPES: 12 | agent_class = 
get_agent_class(agent_type) 13 | assert agent_class == AGENT_CLASSES[agent_type] 14 | 15 | def test_create_agent_with_invalid_type(self): 16 | with pytest.raises(ValueError): 17 | get_agent_class("INVALID") 18 | -------------------------------------------------------------------------------- /test/agents/test_base_agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import gymnasium as gym 6 | import pytest 7 | 8 | from mighty.mighty_agents.dqn import MightyAgent, MightyDQNAgent 9 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 10 | 11 | 12 | class TestMightyAgent: 13 | def test_init(self): 14 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 15 | output_dir = Path("test_base_agent") 16 | output_dir.mkdir(parents=True, exist_ok=True) 17 | with pytest.raises(NotImplementedError): 18 | MightyAgent( 19 | output_dir, 20 | env, 21 | meta_kwargs=None, 22 | wandb_kwargs=None, 23 | meta_methods=None, 24 | ) 25 | clean(output_dir) 26 | 27 | def test_make_checkpoint_dir(self): 28 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 29 | output_dir = Path("test_base_agent") 30 | output_dir.mkdir(parents=True, exist_ok=True) 31 | agent = MightyDQNAgent(output_dir, env) 32 | 33 | agent.make_checkpoint_dir(1) 34 | assert Path(agent.checkpoint_dir).exists() 35 | clean(output_dir) 36 | 37 | def test_apply_config(self): 38 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 39 | output_dir = Path("test_base_agent") 40 | output_dir.mkdir(parents=True, exist_ok=True) 41 | agent = MightyDQNAgent(output_dir, env) 42 | config = { 43 | "learning_rate": -1, 44 | } 45 | agent.apply_config(config) 46 | assert agent.learning_rate == -1 47 | clean(output_dir) 48 | -------------------------------------------------------------------------------- /test/exploration/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | import torch 6 | 7 | from mighty.mighty_exploration import EpsilonGreedy 8 | from mighty.mighty_utils.test_helpers import DummyModel 9 | 10 | 11 | class TestEpsilonGreedy: 12 | def get_policy(self, epsilon=0.1): 13 | return EpsilonGreedy(algo="q", model=DummyModel(), epsilon=epsilon) 14 | 15 | @pytest.mark.parametrize( 16 | "state", 17 | [ 18 | torch.tensor([[0, 1], [0, 1]]), 19 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 20 | torch.tensor( 21 | [[0, 235, 67], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]] 22 | ), 23 | ], 24 | ) 25 | def test_exploration_func(self, state): 26 | policy = self.get_policy(epsilon=0.0) 27 | actions, qvals = policy.explore_func(state) 28 | greedy_actions, greedy_qvals = policy.sample_action(state) 29 | assert len(actions) == len(state), "Action should be predicted per state." 30 | assert all(a == g for g in greedy_actions for a in actions), ( 31 | f"Actions should match greedy: {actions}///{greedy_actions}" 32 | ) 33 | assert torch.equal(qvals, greedy_qvals), ( 34 | f"Q-values should match greedy: {qvals}///{greedy_qvals}" 35 | ) 36 | 37 | policy = self.get_policy(epsilon=0.5) 38 | actions = np.array( 39 | [policy.explore_func(state)[0] for _ in range(100)] 40 | ).flatten() 41 | assert sum([a == 1 for a in actions]) / (100 * len(state)) > 0.5, ( 42 | "Actions should match greedy at least in half of cases." 
43 | ) 44 | assert sum([a == 1 for a in actions]) / (100 * len(state)) < 0.8, ( 45 | "Actions should match greedy in less than 4/5 of cases." 46 | ) 47 | 48 | policy = self.get_policy(epsilon=np.linspace(0, 1, len(state))) 49 | actions = np.array([policy.explore_func(state)[0] for _ in range(100)]) 50 | assert all(actions[:, 0] == 1), "Low index actions should match greedy." 51 | assert sum(actions[:, -1] == 1) / 100 < 0.33, ( 52 | "High index actions should not match greedy more than 1/3 of the time." 53 | ) 54 | 55 | @pytest.mark.parametrize( 56 | "state", 57 | [ 58 | torch.tensor([[0, 1], [0, 1]]), 59 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 60 | torch.tensor( 61 | [[0, 235, 67], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]] 62 | ), 63 | ], 64 | ) 65 | def test_multiple_epsilons(self, state): 66 | """Test multiple epsilon values.""" 67 | policy = self.get_policy(epsilon=[0.1, 0.5]) 68 | assert np.all(policy.epsilon == [0.1, 0.5]), "Epsilon should be [0.1, 0.5]." 69 | action, _ = policy.explore_func(state) 70 | assert len(action) == len(state.numpy()), ( 71 | f"Action should be predicted per state: len({action}) != len({state.numpy()})." 72 | ) 73 | -------------------------------------------------------------------------------- /test/exploration/test_exploration.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | import torch 5 | 6 | from mighty.mighty_exploration import MightyExplorationPolicy 7 | from mighty.mighty_utils.test_helpers import DummyModel 8 | 9 | 10 | class TestPolicy: 11 | def get_policy(self, action=1): 12 | return MightyExplorationPolicy(algo="q", model=DummyModel(action=action)) 13 | 14 | def test_exploration_func(self): 15 | with pytest.raises(NotImplementedError): 16 | self.get_policy().explore_func([0]) 17 | 18 | @pytest.mark.parametrize( 19 | "state", 20 | [ 21 | torch.tensor([0]), 22 | torch.tensor([0, 1]), 23 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 24 | ], 25 | ) 26 | def test_call(self, state): 27 | policy = self.get_policy() 28 | with pytest.raises(NotImplementedError): 29 | policy(state) 30 | 31 | greedy_actions, qvals = policy(state, evaluate=True, return_logp=True) 32 | assert all(greedy_actions == 1), ( 33 | f"Greedy actions should be 1: {greedy_actions}///{qvals}" 34 | ) 35 | assert qvals.shape[-1] == 5, "Q-value shape should not be changed." 36 | assert len(qvals) == len(state), "Q-value length should not be changed." 37 | 38 | policy = self.get_policy(action=3) 39 | greedy_actions, qvals = policy(state, evaluate=True, return_logp=True) 40 | assert all(greedy_actions == 3), ( 41 | f"Greedy actions should be 3: {greedy_actions}///{qvals}" 42 | ) 43 | assert qvals.shape[-1] == 5, "Q-value shape should not be changed." 44 | assert len(qvals) == len(state), "Q-value length should not be changed." 
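# Usage sketch (not part of the test suite): the exploration policies exercised above wrap
# any callable model that maps a batch of states to action values. DummyModel is reused
# here purely for illustration; inside the DQN agent the Q-network plays that role.
import torch

from mighty.mighty_exploration import EpsilonGreedy
from mighty.mighty_utils.test_helpers import DummyModel

policy = EpsilonGreedy(algo="q", model=DummyModel(), epsilon=0.1)
states = torch.tensor([[0, 1, 2], [3, 4, 5]])

actions, qvals = policy.explore_func(states)  # epsilon-greedy: mostly argmax, sometimes random
greedy_actions, greedy_qvals = policy(states, evaluate=True, return_logp=True)  # pure greedy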
45 | -------------------------------------------------------------------------------- /test/exploration/test_ez_greedy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import gymnasium as gym 4 | import numpy as np 5 | 6 | from mighty.mighty_agents.dqn import MightyDQNAgent 7 | from mighty.mighty_exploration.ez_greedy import EZGreedy 8 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 9 | 10 | 11 | class TestEZGreedy: 12 | def test_init(self) -> None: 13 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 14 | output_dir = Path("test_ez_greedy") 15 | output_dir.mkdir(parents=True, exist_ok=True) 16 | dqn = MightyDQNAgent( 17 | output_dir, 18 | env, 19 | use_target=False, 20 | policy_class="mighty.mighty_exploration.EZGreedy", 21 | ) 22 | assert isinstance(dqn.policy, EZGreedy), ( 23 | "Policy should be an instance of EZGreedy when creating with string." 24 | ) 25 | assert dqn.policy.epsilon == 0.1, "Default epsilon should be 0.1." 26 | assert dqn.policy.zipf_param == 2, "Default zipf_param should be 2." 27 | assert dqn.policy.skipped is None, "Skip should be initialized at None." 28 | assert dqn.policy.frozen_actions is None, ( 29 | "Frozen actions should be initialized at None." 30 | ) 31 | 32 | dqn = MightyDQNAgent( 33 | output_dir, 34 | env, 35 | use_target=False, 36 | policy_class=EZGreedy, 37 | policy_kwargs={"epsilon": [0.5, 0.3], "zipf_param": 3}, 38 | ) 39 | assert isinstance(dqn.policy, EZGreedy), ( 40 | "Policy should be an instance of EZGreedy when creating with class." 41 | ) 42 | assert np.all(dqn.policy.epsilon == [0.5, 0.3]), "Epsilon should be [0.5, 0.3]." 43 | assert dqn.policy.zipf_param == 3, "zipf_param should be 3." 44 | clean(output_dir) 45 | 46 | def test_skip_single(self) -> None: 47 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 48 | output_dir = Path("test_ez_greedy") 49 | output_dir.mkdir(parents=True, exist_ok=True) 50 | dqn = MightyDQNAgent( 51 | output_dir, 52 | env, 53 | use_target=False, 54 | policy_class="mighty.mighty_exploration.EZGreedy", 55 | policy_kwargs={"epsilon": 0.0, "zipf_param": 3}, 56 | ) 57 | 58 | state, _ = env.reset() 59 | action = dqn.policy([state]) 60 | assert np.all(action < env.single_action_space.n), ( 61 | "Action should be within the action space." 62 | ) 63 | assert len(action) == len(state), "Action should be predicted per state." 64 | 65 | dqn.policy.skipped = np.array([1]) 66 | next_action = dqn.policy([state]) 67 | assert np.all(action == next_action), ( 68 | "Action should be the same as the previous action when skip is active." 69 | ) 70 | assert dqn.policy.skipped[0] == 0, "Skip should be decayed by one." 71 | clean(output_dir) 72 | 73 | def test_skip_batch(self) -> None: 74 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(2)]) 75 | output_dir = Path("test_ez_greedy") 76 | output_dir.mkdir(parents=True, exist_ok=True) 77 | dqn = MightyDQNAgent( 78 | output_dir, 79 | env, 80 | use_target=False, 81 | policy_class=EZGreedy, 82 | policy_kwargs={"epsilon": [0.5, 1.0], "zipf_param": 3}, 83 | ) 84 | 85 | state, _ = env.reset() 86 | action = dqn.policy(state) 87 | assert all([a < env.single_action_space.n for a in action]), ( 88 | "Actions should be within the action space." 89 | ) 90 | assert len(action) == len(state), "Action should be predicted per state." 
91 | 92 | dqn.policy.skipped = np.array([3, 0]) 93 | next_action = dqn.policy(state + 2) 94 | assert action[0] == next_action[0], ( 95 | f"First action should be the same as the previous action when skip is active: {action[0]} != {next_action[0]}" 96 | ) 97 | assert dqn.policy.skipped[0] == 2, "Skip should be decayed by one." 98 | assert dqn.policy.skipped[1] >= 0, "Skip should not be decayed below one." 99 | clean(output_dir) 100 | -------------------------------------------------------------------------------- /test/meta_components/test_cosine_schedule.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import gymnasium as gym 6 | 7 | from mighty.mighty_agents.dqn import MightyDQNAgent 8 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 9 | 10 | 11 | class TestCosineLR: 12 | def test_decay(self) -> None: 13 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 14 | output_dir = Path("test_cosine") 15 | output_dir.mkdir(parents=True, exist_ok=True) 16 | dqn = MightyDQNAgent( 17 | output_dir, 18 | env, 19 | meta_methods=["mighty.mighty_meta.CosineLRSchedule"], 20 | meta_kwargs=[ 21 | {"initial_lr": 0.2, "num_decay_steps": 100, "restart_every": 0} 22 | ], 23 | ) 24 | lr = 1.5 25 | dqn.learning_rate = lr 26 | for i in range(4): 27 | metrics = dqn.run(n_steps=10 * (i + 1)) 28 | assert metrics["hp/lr"] == dqn.learning_rate, ( 29 | f"Learning rate should be set to schedule value {metrics['hp/lr']} instead of {dqn.learning_rate}." 30 | ) 31 | assert dqn.learning_rate < lr, ( 32 | f"Learning rate should decrease: {dqn.learning_rate} is not less than {lr}." 33 | ) 34 | lr = dqn.learning_rate.copy() 35 | clean(output_dir) 36 | 37 | def test_restart(self) -> None: 38 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 39 | output_dir = Path("test_cosine") 40 | output_dir.mkdir(parents=True, exist_ok=True) 41 | dqn = MightyDQNAgent( 42 | output_dir, 43 | env, 44 | meta_methods=["mighty.mighty_meta.CosineLRSchedule"], 45 | meta_kwargs=[ 46 | {"initial_lr": 0.2, "num_decay_steps": 100, "restart_every": 5} 47 | ], 48 | ) 49 | dqn.run(6, 0) 50 | assert dqn.meta_modules["CosineLRSchedule"].n_restarts == 1, ( 51 | "Restart counter should increase." 52 | ) 53 | assert dqn.learning_rate >= dqn.meta_modules["CosineLRSchedule"].eta_max, ( 54 | f"Restart should increase learning rate: {dqn.learning_rate} is not {dqn.meta_modules['CosineLRSchedule'].eta_max}." 
55 | ) 56 | clean(output_dir) 57 | -------------------------------------------------------------------------------- /test/meta_components/test_space.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | from mighty.mighty_agents.dqn import MightyDQNAgent 6 | from mighty.mighty_meta import SPaCE 7 | from mighty.mighty_utils.test_helpers import DummyEnv, DummyModel, clean 8 | from mighty.mighty_utils.wrappers import ContextualVecEnv 9 | 10 | 11 | class TestSPaCE: 12 | def test_init(self) -> None: 13 | space = SPaCE(criterion="improvement", threshold=0.5, k=2) 14 | assert space.criterion == "improvement" 15 | assert space.threshold == 0.5 16 | assert space.increase_by_k_instances == 2 17 | assert space.current_instance_set_size == 2 18 | assert space.last_evals is None 19 | 20 | def test_get_instances(self) -> None: 21 | space = SPaCE() 22 | metrics = { 23 | "env": DummyEnv(), 24 | "vf": DummyModel(), 25 | "rollout_values": [[0.0, 0.6, 0.7]], 26 | } 27 | space.get_instances(metrics) 28 | assert len(space.all_instances) == 1, ( 29 | f"Expected 1, got {len(space.all_instances)}" 30 | ) 31 | assert len(space.instance_set) == 1, ( 32 | f"Expected 1, got {len(space.instance_set)}" 33 | ) 34 | assert space.last_evals is not None, "Evals should not be None." 35 | 36 | def test_get_evals(self) -> None: 37 | vf = DummyModel() 38 | env = DummyEnv() 39 | space = SPaCE() 40 | space.all_instances = env.instance_id_list 41 | values = space.get_evals(env, vf) 42 | assert len(values) == 1, f"Expected 1 value, got {len(values)}" 43 | 44 | def test_in_loop(self) -> None: 45 | env = ContextualVecEnv([DummyEnv for _ in range(2)]) 46 | output_dir = Path("test_space") 47 | output_dir.mkdir(parents=True, exist_ok=True) 48 | dqn = MightyDQNAgent( 49 | output_dir, 50 | env, 51 | use_target=False, 52 | meta_methods=["mighty.mighty_meta.SPaCE"], 53 | ) 54 | assert dqn.meta_modules["SPaCE"] is not None, "SPaCE should be initialized." 55 | dqn.run(100, 0) 56 | assert dqn.meta_modules["SPaCE"].all_instances is not None, ( 57 | "All instances should be initialized." 58 | ) 59 | assert env.inst_ids[0] in dqn.meta_modules["SPaCE"].all_instances, ( 60 | "Instance should be in all instances." 
61 | ) 62 | clean(output_dir) 63 | -------------------------------------------------------------------------------- /test/runners/test_es_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import shutil 4 | from copy import deepcopy 5 | 6 | from omegaconf import OmegaConf 7 | 8 | from mighty.mighty_agents import MightyAgent 9 | from mighty.mighty_runners import MightyESRunner, MightyRunner 10 | from mighty.mighty_utils.wrappers import PufferlibToGymAdapter 11 | 12 | 13 | class TestMightyNESRunner: 14 | runner_config = OmegaConf.create( 15 | { 16 | "runner": "es", 17 | "es": "evosax.xNES", 18 | "search_targets": ["parameters", "_batch_size", "learning_rate"], 19 | "rl_train_agent": True, 20 | "num_steps_per_iteration": 10, 21 | "iterations": 2, 22 | "popsize": 3, 23 | "debug": False, 24 | "seed": 0, 25 | "output_dir": "test_nes_runner", 26 | "wandb_project": None, 27 | "tensorboard_file": None, 28 | "experiment_name": "mighty_experiment", 29 | "eval_every_n_steps": 1e4, 30 | "n_episodes_eval": 10, 31 | "checkpoint": None, 32 | "save_model_every_n_steps": 5e5, 33 | "num_steps": 100, 34 | "env": "pufferlib.ocean.bandit", 35 | "env_kwargs": {}, 36 | "env_wrappers": [], 37 | "num_envs": 1, 38 | "algorithm": "DQN", 39 | "algorithm_kwargs": { 40 | "n_units": 8, 41 | "epsilon": 0.2, 42 | "replay_buffer_class": "mighty.mighty_replay.PrioritizedReplay", 43 | "replay_buffer_kwargs": {"capacity": 1000000, "alpha": 0.6}, 44 | "learning_rate": 0.001, 45 | "batch_size": 64, 46 | "gamma": 0.9, 47 | "soft_update_weight": 1.0, 48 | "td_update_class": "mighty.mighty_update.QLearning", 49 | "q_kwargs": { 50 | "dueling": False, 51 | "feature_extractor_kwargs": { 52 | "architecture": "mlp", 53 | "n_layers": 1, 54 | "hidden_sizes": [32], 55 | }, 56 | "head_kwargs": {"hidden_sizes": [32]}, 57 | }, 58 | }, 59 | } 60 | ) 61 | 62 | def test_init(self): 63 | runner = MightyESRunner(self.runner_config) 64 | assert isinstance(runner, MightyRunner), ( 65 | "MightyNESRunner should be an instance of MightyRunner" 66 | ) 67 | assert isinstance(runner.agent, MightyAgent), ( 68 | "MightyNESRunner should have a MightyAgent" 69 | ) 70 | assert isinstance(runner.agent.eval_env, PufferlibToGymAdapter), ( 71 | "Eval env should be a PufferlibToGymAdapter" 72 | ) 73 | assert runner.agent.env is not None, "Env should be set" 74 | assert runner.iterations is not None, "Iterations should be set" 75 | assert runner.es is not None, "ES should be set" 76 | assert runner.fit_shaper is not None, "Fit shaper should be set" 77 | assert runner.rng is not None, "RNG should be set" 78 | 79 | def test_run(self): 80 | runner = MightyESRunner(self.runner_config) 81 | old_params = deepcopy(runner.agent.parameters) 82 | old_lr = runner.agent.learning_rate 83 | old_batch_size = runner.agent._batch_size 84 | train_results, eval_results = runner.run() 85 | new_params = runner.agent.parameters 86 | assert isinstance(train_results, dict), "Train results should be a dictionary" 87 | assert isinstance(eval_results, dict), "Eval results should be a dictionary" 88 | assert "mean_eval_reward" in eval_results, ( 89 | "Mean eval reward should be in eval results" 90 | ) 91 | param_equals = [o == p for o, p in zip(old_params, new_params)] 92 | for params in param_equals: 93 | assert not all(params.flatten()), ( 94 | "Parameters should have changed in training" 95 | ) 96 | assert not old_lr == runner.agent.learning_rate, ( 97 | "Learning rate should have changed in training" 98 
| ) 99 | assert not old_batch_size == runner.agent._batch_size, ( 100 | "Batch size should have changed in training" 101 | ) 102 | shutil.rmtree("test_nes_runner") 103 | -------------------------------------------------------------------------------- /test/runners/test_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import shutil 4 | 5 | import pytest 6 | from omegaconf import OmegaConf 7 | 8 | from mighty.mighty_agents import MightyAgent 9 | from mighty.mighty_runners import MightyOnlineRunner, MightyRunner 10 | from mighty.mighty_utils.wrappers import PufferlibToGymAdapter 11 | 12 | 13 | class TestMightyRunner: 14 | runner_config = OmegaConf.create( 15 | { 16 | "runner": "standard", 17 | "debug": False, 18 | "seed": 0, 19 | "output_dir": "test_runner", 20 | "wandb_project": None, 21 | "tensorboard_file": None, 22 | "experiment_name": "mighty_experiment", 23 | "eval_every_n_steps": 1e4, 24 | "n_episodes_eval": 10, 25 | "checkpoint": None, 26 | "save_model_every_n_steps": 5e5, 27 | "num_steps": 100, 28 | "env": "pufferlib.ocean.bandit", 29 | "env_kwargs": {}, 30 | "env_wrappers": [], 31 | "num_envs": 1, 32 | "algorithm": "DQN", 33 | "algorithm_kwargs": { 34 | "n_units": 8, 35 | "epsilon": 0.2, 36 | "replay_buffer_class": "mighty.mighty_replay.PrioritizedReplay", 37 | "replay_buffer_kwargs": {"capacity": 1000000, "alpha": 0.6}, 38 | "learning_rate": 0.001, 39 | "batch_size": 64, 40 | "gamma": 0.9, 41 | "soft_update_weight": 1.0, 42 | "td_update_class": "mighty.mighty_update.QLearning", 43 | "q_kwargs": { 44 | "dueling": False, 45 | "feature_extractor_kwargs": { 46 | "architecture": "mlp", 47 | "n_layers": 1, 48 | "hidden_sizes": [32], 49 | }, 50 | "head_kwargs": {"hidden_sizes": [32]}, 51 | }, 52 | }, 53 | } 54 | ) 55 | 56 | def test_init(self): 57 | runner = MightyOnlineRunner(self.runner_config) 58 | assert isinstance(runner, MightyRunner), ( 59 | "MightyOnlineRunner should be an instance of MightyRunner" 60 | ) 61 | assert isinstance(runner.agent, MightyAgent), ( 62 | "MightyOnlineRunner should have a MightyAgent" 63 | ) 64 | assert isinstance(runner.agent.eval_env, PufferlibToGymAdapter), ( 65 | "Eval env should be a PufferlibToGymAdapter" 66 | ) 67 | assert runner.agent.env is not None, "Env should not be None" 68 | assert runner.eval_every_n_steps == self.runner_config.eval_every_n_steps, ( 69 | "Eval every n steps should be set" 70 | ) 71 | assert runner.num_steps == self.runner_config.num_steps, ( 72 | "Num steps should be set" 73 | ) 74 | 75 | def test_train(self): 76 | runner = MightyOnlineRunner(self.runner_config) 77 | results = runner.train(100) 78 | assert isinstance(results, dict), "Results should be a dictionary" 79 | alternate_env = True 80 | with pytest.raises(AttributeError): 81 | runner.train(100, alternate_env) 82 | 83 | def test_evaluate(self): 84 | runner = MightyOnlineRunner(self.runner_config) 85 | results = runner.evaluate() 86 | assert isinstance(results, dict), "Results should be a dictionary" 87 | assert "mean_eval_reward" in results, "Results should have mean_eval_reward" 88 | alternate_env = True 89 | with pytest.raises(AttributeError): 90 | runner.evaluate(alternate_env) 91 | 92 | def test_run(self): 93 | runner = MightyOnlineRunner(self.runner_config) 94 | train_results, eval_results = runner.run() 95 | assert isinstance(train_results, dict), "Train results should be a dictionary" 96 | assert isinstance(eval_results, dict), "Eval results should be a dictionary" 97 | 
assert "mean_eval_reward" in eval_results, ( 98 | "Eval results should have mean_eval_reward" 99 | ) 100 | shutil.rmtree("test_runner") 101 | -------------------------------------------------------------------------------- /test/runners/test_runner_factory.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from mighty.mighty_runners.factory import get_runner_class 6 | from mighty.mighty_runners.mighty_online_runner import MightyOnlineRunner 7 | 8 | VALID_RUNNER_TYPES = ["standard", "default", "online"] 9 | RUNNER_CLASSES = { 10 | "standard": MightyOnlineRunner, 11 | "default": MightyOnlineRunner, 12 | "online": MightyOnlineRunner, 13 | } 14 | 15 | 16 | class TestFactory: 17 | def test_create_agent(self): 18 | for runner_type in VALID_RUNNER_TYPES: 19 | runner_class = get_runner_class(runner_type) 20 | assert runner_class == RUNNER_CLASSES[runner_type], ( 21 | f"Runner class should be {RUNNER_CLASSES[runner_type]}" 22 | ) 23 | 24 | def test_create_agent_with_invalid_type(self): 25 | with pytest.raises(ValueError): 26 | get_runner_class("INVALID") 27 | -------------------------------------------------------------------------------- /test/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" 7 | 8 | 9 | @pytest.mark.skipif( 10 | IN_GITHUB_ACTIONS, 11 | reason="The python called here for some reason will lack some dependencies in GH actions. Test locally instead.", 12 | ) 13 | class TestMightCLI: 14 | def test_run_from_file(self): 15 | exit_status = os.system( 16 | "uv run python mighty/run_mighty.py num_steps=100 output_dir=test_cli" 17 | ) 18 | assert exit_status == 0 19 | shutil.rmtree("test_cli") 20 | --------------------------------------------------------------------------------