├── .github ├── Issue_Template │ └── issue_template.md ├── Pull_Request_Template │ └── pull_request_template.md └── workflows │ ├── docs_test.yaml │ ├── publish-release.yaml │ └── test.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── HPs ├── DQN.yaml ├── PPO.yaml └── SAC.yaml ├── LICENSE ├── Makefile ├── README.md ├── docs ├── api_generator.py ├── basic_usage.md ├── hooks │ ├── cleanup_log_output.py │ ├── debug_which_page_is_being_rendered.py │ └── disable_markdown_exec.py ├── img │ ├── favicon.ico │ ├── logo.png │ └── logo_no_font.png ├── index.md ├── installation.md ├── methods │ ├── algorithms.md │ ├── architectures.md │ ├── inner_loops.md │ └── outer_loops.md ├── package_structure.md └── usecases │ ├── Contextual_RL.md │ ├── DAC.md │ └── Standard_RL.md ├── examples ├── README.md ├── __init__.py ├── custom_exploration_scheduler.py ├── custom_policy.py ├── hypersweeper_smac_example_config.yaml ├── multiple_runs │ ├── mighty_experiment_0 │ │ ├── 0 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── multirun.yaml │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_1 │ │ ├── 1 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_2 │ │ ├── 2 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ ├── mighty_experiment_3 │ │ ├── 3 │ │ │ └── .hydra │ │ │ │ ├── config.yaml │ │ │ │ ├── hydra.yaml │ │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz │ └── mighty_experiment_4 │ │ ├── 4 │ │ └── .hydra │ │ │ ├── config.yaml │ │ │ ├── hydra.yaml │ │ │ └── overrides.yaml │ │ ├── eval_results.csv │ │ ├── hyperparameters.csv │ │ ├── losses.csv │ │ ├── results.csv │ │ └── results.npz ├── optuna_example_config.yaml └── plot_examples.ipynb ├── mighty ├── __init__.py ├── configs │ ├── algorithm │ │ ├── atari_dqn.yaml │ │ ├── ddqn.yaml │ │ ├── dqn.yaml │ │ ├── minigrid_dqn.yaml │ │ ├── ppo.yaml │ │ ├── ppo_mountaincar.yaml │ │ ├── procgen_dqn.yaml │ │ ├── sac.yaml │ │ └── sac_mujoco.yaml │ ├── base.yaml │ ├── cluster │ │ ├── local.yaml │ │ ├── luis.yaml │ │ ├── noctua.yaml │ │ └── tnt.yaml │ ├── cmaes_hpo.yaml │ ├── environment │ │ ├── carl_walkers │ │ │ └── ant_goals.yaml │ │ ├── dacbench │ │ │ ├── function_approximation.yaml │ │ │ └── function_approximation_benchmark.yaml │ │ ├── gymnasium │ │ │ ├── atari_pong.yaml │ │ │ ├── cartpole.yaml │ │ │ ├── mountaincar.yaml │ │ │ ├── mountaincarcontinuous.yaml │ │ │ └── pendulum.yaml │ │ ├── procgen_bigfish.yaml │ │ ├── pufferlib_minigrid │ │ │ └── minigrid_env.yaml │ │ ├── pufferlib_ocean │ │ │ ├── bandit.yaml │ │ │ ├── memory.yaml │ │ │ ├── password.yaml │ │ │ ├── squared.yaml │ │ │ └── stochastic.yaml │ │ └── pufferlib_procgen │ │ │ └── bigfish.yaml │ ├── exploration │ │ ├── epsilon_decay.yaml │ │ ├── ez_greedy.yaml │ │ ├── noveld.yaml │ │ └── rnd.yaml │ ├── hydra │ │ └── help │ │ │ └── mighty_help.yaml │ ├── nes.yaml │ ├── ppo_smac.yaml │ ├── sac_smac.yaml │ ├── search_space │ │ ├── dqn_gym_classic.yaml │ │ ├── dqn_rs.yaml │ │ ├── dqn_template.yaml │ │ ├── mighty_template.yaml │ │ ├── ppo_rs.yaml │ │ └── sac_rs.yaml │ ├── 
sweep_ppo_pbt.yaml │ ├── sweep_rs.yaml │ └── target_function.yaml ├── mighty_agents │ ├── .gitkeep │ ├── __init__.py │ ├── base_agent.py │ ├── dqn.py │ ├── factory.py │ ├── ppo.py │ └── sac.py ├── mighty_exploration │ ├── __init__.py │ ├── decaying_epsilon_greedy.py │ ├── epsilon_greedy.py │ ├── ez_greedy.py │ ├── mighty_exploration_policy.py │ └── stochastic_policy.py ├── mighty_meta │ ├── __init__.py │ ├── cosine_lr_schedule.py │ ├── mighty_component.py │ ├── plr.py │ ├── rnd.py │ └── space.py ├── mighty_models │ ├── __init__.py │ ├── dqn.py │ ├── networks.py │ ├── ppo.py │ └── sac.py ├── mighty_replay │ ├── __init__.py │ ├── buffer.py │ ├── mighty_prioritized_replay.py │ ├── mighty_replay_buffer.py │ └── mighty_rollout_buffer.py ├── mighty_runners │ ├── __init__.py │ ├── factory.py │ ├── mighty_es_runner.py │ ├── mighty_maml_runner.py │ ├── mighty_online_runner.py │ └── mighty_runner.py ├── mighty_update │ ├── __init__.py │ ├── ppo_update.py │ ├── q_learning.py │ └── sac_update.py ├── mighty_utils │ ├── __init__ .py │ ├── __init__.py │ ├── envs.py │ ├── migthy_types.py │ ├── plotting.py │ ├── test_helpers.py │ ├── update_utils.py │ └── wrappers.py └── run_mighty.py ├── mkdocs.yml ├── pyproject.toml └── test ├── __init__.py ├── agents ├── test_agent_factory.py ├── test_base_agent.py └── test_dqn_agent.py ├── exploration ├── test_epsilon_greedy.py ├── test_exploration.py └── test_ez_greedy.py ├── meta_components ├── test_cosine_schedule.py ├── test_noveld.py ├── test_plr.py ├── test_rnd.py └── test_space.py ├── models ├── test_networks.py └── test_q_networks.py ├── replay └── test_buffer.py ├── runners ├── test_es_runner.py ├── test_runner.py └── test_runner_factory.py ├── test_cli.py ├── test_env_creation.py └── update └── test_q_update.py /.github/Issue_Template/issue_template.md: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | name: General issue template 4 | about: General template issues 5 | labels: 6 | 7 | --- 8 | 9 | * Mighty version: 10 | * Python version: 11 | * Operating System: 12 | 13 | 14 | 17 | 18 | #### Description 19 | 20 | 21 | #### Steps/Code to Reproduce 22 | 25 | 26 | #### Expected Results 27 | 28 | 29 | #### Actual Results 30 | 31 | 32 | #### Additional Info 33 | 34 | - Did you try upgrading to the most current version? yes/no 35 | - Are you using a supported operating system (version)? yes/no 36 | - How did you install this package (e.g. GitHub, pip, etc.)? 37 | 38 | -------------------------------------------------------------------------------- /.github/Pull_Request_Template/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | #### Reference Issues/PRs 7 | 11 | 12 | #### What does this implement/fix? Explain your changes. 13 | 14 | 24 | 25 | #### Checklist 26 | 27 | - Are the tests passing locally? yes/no 28 | - Is the pre-commit passing locally? yes/no 29 | - Are all new features documented in code and docs? yes/no 30 | - Are all examples still running? yes/no 31 | - Are the requirements up to date? yes/no 32 | - Did you add yourself to the contributors in the authors file? yes/no 33 | 34 | #### Any other comments? 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/docs_test.yaml: -------------------------------------------------------------------------------- 1 | # This workflow is to test that the docs build successfully. 
2 | name: test-docs 3 | env: 4 | package-name: "mighty" 5 | UV_SYSTEM_PYTHON: 1 6 | 7 | concurrency: 8 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 9 | cancel-in-progress: true 10 | on: 11 | workflow_dispatch: 12 | push: 13 | branches: 14 | - main 15 | pull_request: 16 | branches: 17 | - main 18 | permissions: 19 | contents: write 20 | jobs: 21 | build: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Install uv 28 | uses: astral-sh/setup-uv@v5 29 | with: 30 | # Install a specific version of uv. 31 | version: "0.6.14" 32 | 33 | - name: "Set up Python" 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version-file: "pyproject.toml" 37 | 38 | - name: Install Mighty 39 | run: make install-dev 40 | 41 | - name: "Build Docs" 42 | run: mkdocs build --clean --strict -------------------------------------------------------------------------------- /.github/workflows/publish-release.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | permissions: 3 | id-token: write 4 | 5 | env: 6 | package-name: "mighty" 7 | UV_SYSTEM_PYTHON: 1 8 | 9 | on: 10 | # Manually triggerable in github 11 | workflow_dispatch: 12 | release: 13 | types: [created] 14 | 15 | jobs: 16 | test: 17 | name: publish-release 18 | runs-on: "ubuntu-latest" 19 | 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Install uv 25 | uses: astral-sh/setup-uv@v5 26 | with: 27 | # Install a specific version of uv. 28 | version: "0.6.14" 29 | 30 | - name: "Set up Python" 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version-file: "pyproject.toml" 34 | 35 | - name: Install ${{ env.package-name }} 36 | run: make install-dev 37 | 38 | - name: Store git status 39 | id: status-before 40 | shell: bash 41 | run: | 42 | echo "::set-output name=BEFORE::$(git status --porcelain -b)" 43 | 44 | - name: Tests 45 | run: make test 46 | 47 | pypi-publish: 48 | name: Upload release to PyPI 49 | runs-on: ubuntu-latest 50 | environment: 51 | name: pypi 52 | url: https://pypi.org/p/mighty 53 | steps: 54 | - name: Checkout 55 | uses: actions/checkout@v4 56 | 57 | - name: Install uv 58 | uses: astral-sh/setup-uv@v5 59 | with: 60 | # Install a specific version of uv. 
61 | version: "0.6.14" 62 | 63 | - name: "Set up Python" 64 | uses: actions/setup-python@v5 65 | with: 66 | python-version-file: "pyproject.toml" 67 | 68 | - name: Install ${{ env.package-name }} 69 | run: make install-dev 70 | - name: Build package 71 | run: uv build 72 | 73 | - name: Publish package distributions to PyPI 74 | run: uv publish -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Tests 3 | concurrency: 4 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 5 | cancel-in-progress: true 6 | 7 | on: 8 | workflow_dispatch: # Manually trigger the workflow 9 | # Triggers with push to main 10 | push: 11 | branches: 12 | - main 13 | - development 14 | 15 | # Triggers with push to a PR aimed at main 16 | pull_request: 17 | branches: 18 | - main 19 | - development 20 | 21 | env: 22 | package-name: "mighty" 23 | test-dir: test 24 | UV_SYSTEM_PYTHON: 1 25 | 26 | jobs: 27 | # General unit tests 28 | source-test: 29 | name: test 30 | runs-on: "ubuntu-latest" 31 | defaults: 32 | run: 33 | shell: bash # Default to using bash on all 34 | 35 | steps: 36 | - name: Checkout 37 | uses: actions/checkout@v4 38 | 39 | - name: Install uv 40 | uses: astral-sh/setup-uv@v5 41 | with: 42 | # Install a specific version of uv. 43 | version: "0.6.14" 44 | 45 | - name: "Set up Python" 46 | uses: actions/setup-python@v5 47 | with: 48 | python-version-file: "pyproject.toml" 49 | 50 | - name: Install ${{ env.package-name }} 51 | run: make install-dev 52 | 53 | - name: Store git status 54 | id: status-before 55 | shell: bash 56 | run: | 57 | echo "::set-output name=BEFORE::$(git status --porcelain -b)" 58 | 59 | - name: Tests 60 | run: make test 61 | 62 | - name: Check for files left behind by test 63 | run: | 64 | before="${{ steps.status-before.outputs.BEFORE }}" 65 | after="$(git status --porcelain -b)" 66 | if [[ "$before" != "$after" ]]; then 67 | echo "git status from before: $before" 68 | echo "git status from after: $after" 69 | echo "Not all generated files have been deleted!" 70 | exit 1 71 | fi -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *experiments/ 3 | .DS* 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | uv.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | runs/* 137 | docs/build/* 138 | docs/api/* 139 | docs/examples/* 140 | site/* -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: formatting 5 | name: formatting 6 | entry: make 7 | args: ["format"] 8 | language: system 9 | pass_filenames: false -------------------------------------------------------------------------------- /HPs/DQN.yaml: -------------------------------------------------------------------------------- 1 | # CartPole-v1 DQN configuration: num_steps: 5e4 num_envs: 64 2 | algorithm_kwargs: 3 | # Core architecture / model 4 | n_units: 256 5 | q_kwargs: 6 | dueling: False 7 | feature_extractor_kwargs: 8 | architecture: mlp 9 | n_layers: 1 10 | hidden_sizes: [256] 11 | head_kwargs: 12 | hidden_sizes: [256] 13 | 14 | # Exploration (decaying ε‐greedy) 15 | policy_class: 16 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 17 | policy_kwargs: 18 | epsilon_start: 1.0 19 | epsilon_final: 0.04 20 | epsilon_decay_steps: 8000 21 | 22 | # Replay‐buffer settings 23 | replay_buffer_class: 24 | _target_: mighty.mighty_replay.MightyReplay 25 | replay_buffer_kwargs: 26 | capacity: 100000 27 | 28 | # Training hyperparameters 29 | learning_rate: 2.3e-3 30 | batch_size: 128 31 | gamma: 0.99 32 | learning_starts: 1000 # wait 1k transitions before training 33 | 34 | # Target‐network / updating (hard update every 1k ∇‐steps) 35 | use_target: True 36 | soft_update_weight: 0.005 37 | target_update_freq: null 38 | 39 | # Double DQN update 40 | td_update_class: mighty.mighty_update.QLearning 41 | 42 
| td_update_kwargs: 43 | gamma: 0.99 44 | optimizer_class: 45 | _target_: torch.optim.Adam 46 | optimizer_kwargs: 47 | lr: 2.3e-3 48 | weight_decay: 1e-5 49 | eps: 1e-6 50 | max_grad_norm: 10.0 51 | 52 | # Checkpointing 53 | save_replay: False 54 | n_gradient_steps: 128 55 | 56 | # Misc/Evaluation/Logging 57 | log_wandb: False 58 | wandb_kwargs: 59 | project: "my_dqn_experiment" 60 | name: "DQN-cartpole" 61 | -------------------------------------------------------------------------------- /HPs/PPO.yaml: -------------------------------------------------------------------------------- 1 | # CartPole-v1 PPO configuration: num_steps: 1e5 num_envs: 8 2 | algorithm_kwargs: 3 | # Hyperparameters 4 | n_policy_units: 64 5 | n_critic_units: 64 6 | soft_update_weight: 0.01 7 | 8 | rollout_buffer_class: 9 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 10 | rollout_buffer_kwargs: 11 | buffer_size: 256 # Size of the rollout buffer. 12 | gamma: 0.98 # Discount factor for future rewards. 13 | gae_lambda: 0.8 # GAE lambda. 14 | obs_shape: ??? # Placeholder for observation shape 15 | act_dim: ??? # Placeholder for action dimension 16 | n_envs: ??? 17 | discrete_action: ??? # Placeholder for discrete action flag 18 | 19 | 20 | # Training 21 | learning_rate: 3e-4 22 | batch_size: 32 # Batch size for training. 23 | gamma: 0.99 # The amount by which to discount future rewards. 24 | ppo_clip: 0.2 # Clipping parameter for PPO. 25 | value_loss_coef: 0.5 # Coefficient for value loss. 26 | entropy_coef: 0.0 # Coefficient for entropy loss. 27 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 28 | 29 | 30 | hidden_sizes: [64] 31 | activation: 'tanh' 32 | 33 | n_epochs: 20 34 | minibatch_size: 256 35 | kl_target: 0.01 36 | use_value_clip: True 37 | value_clip_eps: 0.2 38 | 39 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 40 | policy_kwargs: 41 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. -------------------------------------------------------------------------------- /HPs/SAC.yaml: -------------------------------------------------------------------------------- 1 | # Pendulum-SAC hyperparameters num_envs=4 num_steps=5e4 2 | algorithm_kwargs: 3 | # network sizes (PPO-style) 4 | n_policy_units: 256 # will become hidden_sizes=[8,8] 5 | n_critic_units: 256 # same for both Q-nets 6 | soft_update_weight: 0.01 # maps to tau 7 | 8 | # Replay buffer 9 | replay_buffer_class: 10 | _target_: mighty.mighty_replay.MightyReplay 11 | replay_buffer_kwargs: 12 | capacity: 1e6 13 | 14 | # Scheduling & batch-updates 15 | batch_size: 256 16 | learning_starts: 5000 17 | update_every: 1 18 | n_gradient_steps: 1 19 | 20 | # Learning rates 21 | policy_lr: 1e-3 22 | q_lr: 1e-3 23 | 24 | # SAC hyperparameters 25 | gamma: 0.99 26 | alpha: 0.2 27 | auto_alpha: True 28 | target_entropy: -1 29 | alpha_lr: 3e-4 30 | 31 | # Exploration wrapper 32 | policy_class: mighty.mighty_exploration.StochasticPolicy 33 | policy_kwargs: 34 | entropy_coefficient: 0.2 35 | discrete: False 36 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # NOTE: Used on linux, limited support outside of Linux 3 | # 4 | # A simple makefile to help with small tasks related to development of Mighty 5 | # These have been configured to only really run short tasks. Longer form tasks 6 | # are usually completed in github actions. 
7 | 8 | .PHONY: help install-dev install check format pre-commit clean build clean-doc clean-build test doc publish 9 | 10 | help: 11 | @echo "Makefile Mighty" 12 | @echo "* install-dev to install all dev requirements and install pre-commit" 13 | @echo "* check to check the source code for issues" 14 | @echo "* format to format the code with ruff" 15 | @echo "* typing to type check the code with mypy" 16 | @echo "* pre-commit to run the pre-commit check" 17 | @echo "* clean to clean the dist and doc build files" 18 | @echo "* build to build a dist" 19 | @echo "* test to run the tests" 20 | @echo "* docs to serve and view the docs" 21 | @echo "* docs-build-only to generate and view the html files" 22 | @echo "* docs-deploy to push the latest doc version to gh-pages" 23 | @echo "* publish to help publish the current branch to pypi" 24 | 25 | PYTHON ?= python 26 | CYTHON ?= cython 27 | PYTEST ?= uv run pytest 28 | CTAGS ?= ctags 29 | PIP ?= uv pip 30 | MAKE ?= make 31 | PRECOMMIT ?= uv run pre-commit 32 | RUFF ?= uv run ruff 33 | MYPY ?= uv run mypy 34 | ISORT ?= uv run isort 35 | 36 | DIR := ${CURDIR} 37 | DIST := ${CURDIR}/dist 38 | DOCDIR := ${CURDIR}/docs 39 | INDEX_HTML := file://${DOCDIR}/html/build/index.html 40 | 41 | install-dev: 42 | $(PIP) install -e ".[dev,carl,docs,pufferlib,dacbench]" 43 | 44 | install: 45 | $(PIP) install -e ".[examples]" 46 | 47 | 48 | # pydocstyle does not have easy ignore rules, instead, we include as they are covered 49 | check: 50 | ruff format --check mighty test 51 | ruff check mighty test 52 | 53 | pre-commit: 54 | $(PRECOMMIT) run --all-files 55 | 56 | format: 57 | $(ISORT) isort mighty test 58 | $(RUFF) format --silent mighty test 59 | $(RUFF) check --fix --silent mighty test --exit-zero 60 | $(RUFF) check --fix mighty test --exit-zero 61 | 62 | typing: 63 | $(MYPY) mighty 64 | 65 | test: 66 | $(PYTEST) -v --cov=mighty test --durations=20 --cov-report html 67 | 68 | clean-doc: 69 | rm -rf site 70 | 71 | clean-build: 72 | rm -rf ${DIST} 73 | 74 | # Clean up any builds in ./dist as well as doc 75 | clean: clean-doc clean-build 76 | 77 | # Build a distribution in ./dist 78 | build: 79 | uv build 80 | 81 | docs: 82 | mkdocs serve 83 | 84 | docs-build-only: 85 | mkdocs build --clean --strict 86 | 87 | docs-deploy: 88 | mkdocs gh-deploy --force 89 | 90 | # Publish to testpypi 91 | # Will echo the commands to actually publish to be run to publish to actual PyPi 92 | # This is done to prevent accidental publishing but provide the same conveniences 93 | publish: clean-build build 94 | uv publish --index testpypi 95 | @echo 96 | @echo "Test by installing from testpypi:" 97 | @echo "pip install --index-url https://test.pypi.org/simple/ mighty-rl" 98 | @echo 99 | @echo "Once you have decided it works, publish to actual pypi with" 100 | @echo "uv publish" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Mighty Logo 4 | 5 |

6 | 7 |
8 | 9 | [![PyPI Version](https://img.shields.io/pypi/v/mighty-rl.svg)](https://pypi.org/project/Mighty-RL/) 10 | ![Python](https://img.shields.io/badge/Python-3.10-3776AB) 11 | ![License](https://img.shields.io/badge/License-BSD3-orange) 12 | [![Test](https://github.com/automl/Mighty/actions/workflows/test.yaml/badge.svg)](https://github.com/automl/Mighty/actions/workflows/test.yaml) 13 | [![Doc Status](https://github.com/automl/Mighty/actions/workflows/docs_test.yaml/badge.svg)](https://github.com/automl/Mighty/actions/workflows/docs_test.yaml) 14 | 15 |
16 | 17 |
18 |

19 | Installation | 20 | Documentation | 21 | Run a Mighty Agent | 22 | Cite Us 23 |

24 |
25 | 26 | --- 27 | 28 | # Mighty 29 | 30 | **Warning: Mighty is still in development without an official release! Use at your own peril and check back frequently for updates!** 31 | 32 | Welcome to Mighty, hopefully your future one-stop shop for everything cRL. 33 | Currently Mighty is still in its early stages with support for normal gym envs, DACBench and CARL. 34 | The interface is controlled through hydra and we provide DQN, PPO and SAC algorithms. 35 | We log training and regular evaluations to file and optionally also to tensorboard or wandb. 36 | If you have any questions or feedback, please tell us, ideally via the GitHub issues! 37 | 38 | Mighty features: 39 | - Modular structure for easy (Meta-)RL tinkering 40 | - PPO, SAC and DQN as base algorithms 41 | - Environment integrations via Gymnasium, Pufferlib, CARL & DACBench 42 | - Implementations of some important baselines: MAML, PLR, Cosine LR Schedule and more! 43 | 44 | ## Installation 45 | We recommend to using uv to install and run Mighty in a virtual environment. 46 | The code has been tested with python 3.10. 47 | 48 | First create a clean python environment: 49 | 50 | ```bash 51 | uv venv --python=3.10 52 | source .venv/bin/activate 53 | ``` 54 | 55 | Then install Mighty: 56 | 57 | ```bash 58 | make install 59 | ``` 60 | 61 | Optionally you can install the dev requirements directly: 62 | ```bash 63 | make install-dev 64 | ``` 65 | 66 | Alternatively, you can install Mighty from PyPI: 67 | ```bash 68 | pip install mighty-rl 69 | ``` 70 | 71 | ## Run a Mighty Agent 72 | In order to run a Mighty Agent, use the run_mighty.py script and provide any training options as keywords. 73 | If you want to know more about the configuration options, call: 74 | ```bash 75 | python mighty/run_mighty.py --help 76 | ``` 77 | 78 | An example for running the PPO agent on the Pendulum gym environment looks like this: 79 | ```bash 80 | python mighty/run_mighty.py 'algorithm=ppo' 'environment=gymnasium/pendulum' 81 | ``` 82 | 83 | ## Learning a Configuration Policy via DAC 84 | 85 | In order to use Mighty with DACBench, you need to install DACBench first. 86 | We recommend following the instructions in the [DACBench repo](https://github.com/automl/DACBench). 87 | 88 | Afterwards, select the benchmark you want to run, for example the SigmoidBenchmark, and providing it as the "env" keyword: 89 | ```bash 90 | python mighty/run_mighty.py 'algorithm=dqn' 'env=SigmoidBenchmark' 'env_wrappers=[dacbench.wrappers.MultiDiscreteActionWrapper]' 91 | ``` 92 | 93 | ## Train your Agent on a CARL Environment 94 | Mighty is designed with contextual RL in mind and therefore fully compatible with CARL. 95 | Before you start training, however, please follow the installation instructions in the [CARL repo](https://github.com/automl/CARL). 96 | 97 | Then use the same command as before, but provide the CARL environment, in this example CARLCartPoleEnv, 98 | and information about the context distribution as keywords: 99 | ```bash 100 | python mighty/run_mighty.py 'algorithm=dqn' 'env=CARLCartPoleEnv' '+env_kwargs.num_contexts=10' '+env_kwargs.context_feature_args=[gravity]' 101 | ``` 102 | 103 | ## Optimize Hyperparameters 104 | You can optimize the hyperparameters of your algorithm with the [Hypersweeper](https://github.com/automl/hypersweeper) package, e.g. using [SMAC3](https://github.com/automl/SMAC3). Mighty is directly compatible with Hypersweeper and thus smart and distributed HPO! 
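A minimal sketch of what such a sweep can look like is below. Treat it as an untested illustration: it assumes the example config shipped in `examples/hypersweeper_smac_example_config.yaml` is a complete primary config that selects the Hypersweeper SMAC sweeper and a search space, and the exact keys depend on your Hypersweeper version.

```bash
# Illustrative only: the config name/location are assumptions, adjust them to your setup.
python mighty/run_mighty.py --config-dir examples --config-name hypersweeper_smac_example_config --multirun
```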
105 | 106 | ## Further Examples 107 | We provide further examples, such as how to plot the logged evaluation data, in the [examples](examples) folder. 108 | 109 | ## Cite Us 110 | 111 | If you use Mighty in your work, please cite us: 112 | 113 | ```bibtex 114 | @misc{mohaneimer24, 115 | author = {A. Mohan and T. Eimer and C. Benjamins and F. Hutter and M. Lindauer and A. Biedenkapp}, 116 | title = {Mighty}, 117 | year = {2024}, 118 | url = {https://github.com/automl/mighty} 119 | } 120 | ``` 121 | -------------------------------------------------------------------------------- /docs/api_generator.py: -------------------------------------------------------------------------------- 1 | """Generate the code reference pages and navigation. 2 | 3 | # https://mkdocstrings.github.io/recipes/ 4 | """ 5 | from __future__ import annotations 6 | 7 | import logging 8 | from pathlib import Path 9 | 10 | import mkdocs_gen_files 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | source_path = "mighty" 15 | 16 | # Modules whose members should not include inherited attributes or methods 17 | # NOTE: Given the current setup, we can only operate at a module level. 18 | # Ideally we specify options (at least at a module level) and we render 19 | # them into strings using a yaml parser. For now this is fine though 20 | NO_INHERITS = ("torch.nn",) 21 | TAB = " " 22 | 23 | for path in sorted(Path(source_path).rglob("*.py")): 24 | module_path = path.relative_to(source_path).with_suffix("") 25 | doc_path = path.relative_to(source_path).with_suffix(".md") 26 | full_doc_path = Path("api/mighty", doc_path) 27 | 28 | parts = tuple(module_path.parts) 29 | 30 | if parts[-1] in ("__main__", "__version__", "__init__"): 31 | continue 32 | 33 | if any(part.startswith("_") for part in parts): 34 | continue 35 | 36 | if any(part.startswith("mighty_utils") for part in parts): 37 | continue 38 | 39 | with mkdocs_gen_files.open(full_doc_path, "w") as fd: 40 | if parts[0] != source_path: 41 | parts = (source_path,) + parts 42 | ident = ".".join(parts) 43 | fd.write(f"::: {ident}") 44 | 45 | if ident.endswith(NO_INHERITS): 46 | fd.write(f"\n{TAB}options:") 47 | fd.write(f"\n{TAB}{TAB}inherited_members: false") 48 | 49 | mkdocs_gen_files.set_edit_path(full_doc_path, path) -------------------------------------------------------------------------------- /docs/basic_usage.md: -------------------------------------------------------------------------------- 1 | There are a few different ways you can use Mighty: 2 | 3 | ### Running Meta-Methods 4 | This is the easiest part. We have several algorithms and meta-methods implemented in Mighty and you should be able to run them directly on any environment of your choosing. The most difficult part will likely be the configuration of each method since they might require specific keywords or are only compatible with a given base algorithm. So you will likely want to read up on whatever method you choose. Then you also need to know if your method is of the runner or meta module type. Each have their own configuration keyword. An example for using a specific runner is: 5 | 6 | ```bash 7 | python mighty/run_mighty runner=es popsize=5 iterations=100 es=evosax.CMA_ES search_targets=["learning_rate", "_batch_size"] rl_train_agent=true 8 | ``` 9 | This will use the evosax CMA-ES implementation with population size 5 to optimize the learning rate and batch size in 100 iterations. 
Meta modules, on the other hand, use a different keyword: 10 | ```bash 11 | python mighty/run_mighty.py +algorithm_kwargs.meta_methods=[mighty.mighty_meta.PrioritizedLevelReplay] 12 | ``` 13 | This meta methods list collects all meta modules in the order they should be used. So while you can't use multiple runners, you can use layers of meta modules. 14 | 15 | ### Implementing New Components 16 | Of course Mighty currently only supports a limited amount of methods. This is where you come in! It should be fairly easy for you to add your own. We recommend following these steps: 17 | 18 | 1. What are you adding? A runner, meta module, exploration policy, buffer, update variation or model? Make sure you choose the best level to implement your idea in. 19 | 2. Implement your method using the abstract class and existing methods as templates. 20 | 3. Plug your class into your Mighty config file. This works by replacing the default value with the import path of your custom class. 21 | 4. Run the algorithm. 22 | 23 | Since you are passing the place from which to import your new class, you do not need to work within the Mighty codebase directly, but keep your changes separate. This way you can add several new methods to Mighty without copying the code. 24 | 25 | ### Combining Different Ideas 26 | You can combine different approaches with Mighty by varying the runner, exploration, buffer, update class and network architecture and combining them with an arbitrary number of meta modules. 27 | At this point, configuration might become very difficult. We recommend that you take a close look at how to use different hydra configuration files to separately configure each of your methods so that you can keep track of everything. 28 | Depending on what exactly you want to do, it can make sense to keep separate configuration files for each variation you make. This can be confusing, especially if you haven't worked with hydra before, so we recommed you take the time to focus on configurations when attempting combinations of several methods. -------------------------------------------------------------------------------- /docs/hooks/cleanup_log_output.py: -------------------------------------------------------------------------------- 1 | """The module is a hook which disables warnings and log messages which pollute the 2 | doc build output. 3 | 4 | One possible downside is if one of these modules ends up giving an actual 5 | error, such as OpenML failing to retrieve a dataset. I tried to make sure ERROR 6 | log message are still allowed through. 7 | """ 8 | import logging 9 | import warnings 10 | from typing import Any 11 | 12 | import mkdocs 13 | import mkdocs.plugins 14 | import mkdocs.structure.pages 15 | 16 | log = logging.getLogger("mkdocs") 17 | 18 | 19 | @mkdocs.plugins.event_priority(-50) 20 | def on_startup(**kwargs: Any): 21 | # We can probably safely disregard these 22 | warnings.filterwarnings("ignore", category=DeprecationWarning) 23 | 24 | 25 | def on_pre_page( 26 | page: mkdocs.structure.pages.Page, 27 | config: Any, 28 | files: Any, 29 | ) -> mkdocs.structure.pages.Page | None: 30 | # NOTE: mkdocs says they're always normalized to be '/' seperated 31 | # which means this should work on windows as well. 
32 | 33 | logging.getLogger("mighty").setLevel(logging.ERROR) 34 | return page -------------------------------------------------------------------------------- /docs/hooks/debug_which_page_is_being_rendered.py: -------------------------------------------------------------------------------- 1 | """This module is a hook that when any code is being rendered, it will 2 | print the path to the file being rendered. 3 | 4 | This makes it easier to identify which file is being rendered when an error happens. 5 | """ 6 | from __future__ import annotations 7 | 8 | import logging 9 | import os 10 | from typing import TYPE_CHECKING, Any 11 | 12 | import mkdocs 13 | import mkdocs.plugins 14 | 15 | if TYPE_CHECKING: 16 | import mkdocs.structure.pages 17 | 18 | log = logging.getLogger("mkdocs") 19 | 20 | RENDER_EXAMPLES_ENV_VAR = "MIGHTY_DOC_RENDER_EXAMPLES" 21 | EXEC_DOCS_ENV_VAR = "MIGHTY_EXEC_DOCS" 22 | 23 | truthy_values = {"yes", "on", "true", "1", "all"} 24 | 25 | 26 | def on_pre_page( 27 | page: mkdocs.structure.pages.Page, 28 | config: Any, 29 | files: Any, 30 | ) -> mkdocs.structure.pages.Page | None: 31 | render_examples = os.environ.get(RENDER_EXAMPLES_ENV_VAR, "true") 32 | render_code = os.environ.get(EXEC_DOCS_ENV_VAR, "true") 33 | if render_examples.lower() in truthy_values or render_code.lower() in truthy_values: 34 | log.info(f"{page.file.src_path}") -------------------------------------------------------------------------------- /docs/hooks/disable_markdown_exec.py: -------------------------------------------------------------------------------- 1 | """This disable markdown_exec based on an environment variable. 2 | This speeds up the build of the docs for faster iteration. 3 | 4 | This is done by overwriting the module responsible for compiling and executing the code 5 | by overriding the `exec(...)` global variable that is used to run the code. 6 | We hijack it and print a helpful message about how to run the code cell instead. 7 | 8 | https://github.com/pawamoy/markdown-exec/blob/adff40b2928dbb2d22f27684e085f02d39a07291/src/markdown_exec/formatters/python.py#L42-L70 9 | """ 10 | from __future__ import annotations 11 | 12 | import logging 13 | import os 14 | from typing import Any 15 | 16 | import mkdocs 17 | import mkdocs.plugins 18 | import mkdocs.structure.pages 19 | 20 | RUN_CODE_BLOCKS_ENV_VAR = "MIGHTY_EXEC_DOCS" 21 | 22 | logger = logging.getLogger("mkdocs") 23 | 24 | 25 | def _print_msg(compiled_code: Any, code_block_id: int, exec_globals: dict) -> None: 26 | _print = exec_globals["print"] 27 | _print( 28 | f"Env variable {RUN_CODE_BLOCKS_ENV_VAR}=0 - No code to display." 29 | "\nUse `just docs-code` (or `just docs-full` for examples) to run" 30 | " the code block and display output." 
31 | ) 32 | 33 | truthy_values = {"yes", "on", "true", "1"} 34 | 35 | @mkdocs.plugins.event_priority(100) 36 | def on_startup(**kwargs: Any): 37 | run_code_blocks = os.environ.get(RUN_CODE_BLOCKS_ENV_VAR, "true") 38 | if run_code_blocks.lower() not in truthy_values: 39 | logger.warning( 40 | f"Disabling markdown-exec due to {RUN_CODE_BLOCKS_ENV_VAR}={run_code_blocks}" 41 | "\n.Use `just docs-full` to run and render examples.", 42 | ) 43 | from markdown_exec.formatters import python 44 | 45 | setattr(python, "exec_python", _print_msg) -------------------------------------------------------------------------------- /docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/favicon.ico -------------------------------------------------------------------------------- /docs/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/logo.png -------------------------------------------------------------------------------- /docs/img/logo_no_font.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/docs/img/logo_no_font.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | Mighty is a Reinforcement Learning (RL) library that aims to make training general agents easy. 3 | We natively support context in RL, i.e. train and test distributions that can be easily configured, as well 4 | as Meta- and AutoRL methods on all levels. 5 | That means if you're interested in general RL, you can start with well-known simulation environments and scale up 6 | to actually applications using Mighty! 7 | 8 | ### What Can I Do With Mighty? 9 | Mighty offers a lot of flexibility for training general agents with online RL: 10 | 11 | - train on standard and contextual RL environments 12 | - apply outer-loop methods like Bayesian Optimization or Evolutionary Strategies for Meta-Learning, Hyperparameter Optimization and more 13 | - use in-the-loop ideas like curriculum learning to enhance training 14 | - plug in modules for exploration, buffers or architectures without touching the full pipeline 15 | - combine different methods for Meta- and AutoRL to form full RL pipelines 16 | 17 | We currently do not support other learning paradigms, but might extend to e.g. include offline data as an option. 18 | 19 | ### Where Is Mighty Going? 20 | 21 | Currently Mighty is in early development and includes only standard RL algorithms compatible with cRL benchmarks and 22 | evaluation mechanisms. In the future, we hope to extend mighty with Meta-Learning methods as well as AutoRL, so stay tuned. 23 | 24 | ### Contact & Citation 25 | Mighty is developed at [LUHAI Hannover]() by members of [AutoRL.org](). Your first contact is lead maintainer [Aditya Mohan](). If you found issues or want to contribute new features, it's best to visit our [GitHub page](https://github.com/automl/Mighty) page and start a discussion. 26 | 27 | If you use Mighty for your research, please cite us: 28 | 29 | ```bibtex 30 | @misc{mohaneimer24, 31 | author = {A. Mohan and T. Eimer and C. Benjamins and F. Hutter and M. Lindauer and A. 
Biedenkapp}, 32 | title = {Mighty}, 33 | year = {2024}, 34 | url = {https://github.com/automl/mighty} 35 | } 36 | ``` -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | We recommend using uv to install and run Mighty in a virtual environment. 2 | The code has been tested with python 3.10. 3 | 4 | First create a clean python environment: 5 | 6 | ```bash 7 | uv venv --python=3.10 8 | source .venv/bin/activate 9 | ``` 10 | 11 | Then install Mighty via pip: 12 | 13 | ```bash 14 | uv pip install mighty-rl 15 | ``` 16 | 17 | For a custom setup or if you want to hack Mighty, please see the [GitHub repo](https://github.com/automl/Mighty) instead. -------------------------------------------------------------------------------- /docs/methods/architectures.md: -------------------------------------------------------------------------------- 1 | # Mighty Architectures 2 | Mighty is made for deep RL, meaning we rely on neural networks for function approximation. You'll find them under the 'mighty_models' keyword in the code. This page should give you a rough overview of their intended use and how to handle them for your experiments. 3 | 4 | ## Network Structures in Mighty 5 | Mighty networks are based on Torch. 6 | We implement some basic network architecture building blocks which can then be combined. 7 | You will usually choose a feature extractor architecture and a head architecture, which can be the same or different. 8 | Furthermore, you can combine two different architectures in the feature extractor. 9 | You can choose between: 10 | 11 | - MLP: standard fully connected networks (flexible structure) 12 | - CNN: 1D or 2D convolutional networks (flexible structure) 13 | - ResNet: a 2D convolutional layer with two residual blocks 14 | - TorchHub model (experimental): loading models from TorchHub 15 | 16 | This should cover many standard combinations like a CNN feature extractor with an MLP head. 17 | 18 | ## Implemented Models 19 | The implemented 'mighty_models' define the prediction patterns for different algorithm classes. 20 | The DQN model, for example, is initialized to predict Q-values while the PPO model forwards through the policy head when called. 21 | Both can be based upon the same feature extraction and head structures, of course.
22 | If we look at the DQN model, we can see it primarily combines different elements to achieve this instead of implementing all of them: 23 | 24 | ```python 25 | class DQN(nn.Module): 26 | """DQN network.""" 27 | 28 | def __init__(self, num_actions, obs_size, dueling=False, **kwargs): 29 | """Initialize the network.""" 30 | super().__init__() 31 | head_kwargs = {"hidden_sizes": [32, 32]} 32 | feature_extractor_kwargs = {"obs_shape": obs_size} 33 | if "head_kwargs" in kwargs: 34 | head_kwargs.update(kwargs["head_kwargs"]) 35 | if "feature_extractor_kwargs" in kwargs: 36 | feature_extractor_kwargs.update(kwargs["feature_extractor_kwargs"]) 37 | 38 | # Make feature extractor 39 | self.feature_extractor, self.output_size = make_feature_extractor( 40 | **feature_extractor_kwargs 41 | ) 42 | self.dueling = dueling 43 | self.num_actions = int(num_actions) 44 | self.obs_size = obs_size 45 | self.hidden_sizes = head_kwargs["hidden_sizes"] 46 | 47 | # Make policy head 48 | self.head, self.value, self.advantage = make_q_head( 49 | self.output_size, 50 | self.num_actions, 51 | **head_kwargs, 52 | ) 53 | 54 | def forward(self, x): 55 | """Forward pass.""" 56 | x = self.feature_extractor(x) 57 | x = self.head(x) 58 | advantage = self.advantage(x) 59 | if self.dueling: 60 | value = self.value(x) 61 | x = value + advantage - advantage.mean(dim=1, keepdim=True) 62 | else: 63 | x = advantage 64 | return x 65 | 66 | def reset_head(self, hidden_sizes=None): 67 | """Reset the head of the network.""" 68 | if hidden_sizes is None: 69 | hidden_sizes = self.hidden_sizes 70 | self.head, self.value, self.advantage = make_q_head( 71 | self.output_size, 72 | self.num_actions, 73 | hidden_sizes, 74 | ) 75 | self.hidden_sizes = hidden_sizes 76 | 77 | def shrink_weights(self, shrinkage, noise_weight): 78 | """Shrink weights of the network.""" 79 | params_old = deepcopy(list(self.head.parameters())) 80 | value_params_old = deepcopy(list(self.value.parameters())) 81 | adv_params_old = deepcopy(list(self.advantage.parameters())) 82 | self.reset_head(hidden_sizes=self.hidden_sizes) 83 | for p_old, p_rand in zip(*[params_old, self.head.parameters()], strict=False): 84 | p_rand.data = deepcopy(shrinkage * p_old.data + noise_weight * p_rand.data) 85 | for p_old, p_rand in zip( 86 | *[adv_params_old, self.advantage.parameters()], strict=False 87 | ): 88 | p_rand.data = deepcopy(shrinkage * p_old.data + noise_weight * p_rand.data) 89 | if self.dueling: 90 | for p_old, p_rand in zip( 91 | *[value_params_old, self.value.parameters()], strict=False 92 | ): 93 | p_rand.data = deepcopy( 94 | shrinkage * p_old.data + noise_weight * p_rand.data 95 | ) 96 | 97 | def __getstate__(self): 98 | return ( 99 | self.feature_extractor, 100 | self.head, 101 | self.advantage, 102 | self.value, 103 | self.dueling, 104 | self.num_actions, 105 | ) 106 | 107 | def __setstate__(self, state): 108 | self.feature_extractor = state[0] 109 | self.head = state[1] 110 | self.advantage = state[2] 111 | self.value = state[3] 112 | self.dueling = state[4] 113 | self.num_actions = state[5] 114 | ``` 115 | This allows us to have the actual architectures and network structures in central network classes and keeping the model classes quite short. 116 | As you can see, the DQN class also has additional utility functions like parameter shrinking that can be used in different updates or meta components. 117 | These are fully optional and can be added as you need them for other components. 
118 | Depending on how you structure your model class, you should also revisit the corresponding update to ensure compatibility. 119 | 120 | ## Changing Network Structure 121 | The MLP and CNN networks have a semi-configurable structure. 122 | Via the algorithm_kwargs, you can specify activations as well as number of layers and units for MLPs and number and kind of convolutions, channels, strides and paddings for CNN. 123 | Hidden sizes, number of channels, stride and padding can be configured per layer for more variation. 124 | Activations, on the other hand, are currently set for the full network. 125 | 126 | ## When Should I Implement A New Network Class? 127 | Current network classes cover standard cases with some flexibility for MLPs and CNNs. 128 | The TorchHub option is still being tested and also limited since it's not focused on RL models. 129 | Therefore several relevant options like Transformers still need their own class. 130 | If you want to use a different architecture than listed here, you should simply make a new class for it and enable its creation via 'make_feature_extractor'. -------------------------------------------------------------------------------- /docs/methods/inner_loops.md: -------------------------------------------------------------------------------- 1 | # Mighty Inner Loops 2 | A key motivation for Mighty is to make it easy to create systems that interact with the RL loop. If these systems work during an algorithm's runtime, they are inner loop components. In Mighty, we call them Meta Components. This page documents their structure and why they're so useful. 3 | 4 | ## What Are Meta Components? 5 | Meta components are elements interacting with the main loop at various points. 6 | Within this interaction, they have access to virtually all current internal information and can adapt it. 7 | This means everything from hyperparameter scheduling to learning a separate dynamics models and more is possible within this structure. 8 | Meta components can be stacked on top of one another to combine different approaches in a single run. 9 | This enables complex inner loop setups without entangling methods in code. 10 | 11 | ## The Metrics Dictionary 12 | The most important part when adding components is the 'metrics' dictionary. This is Mighty's central information hub. 13 | Here you can find transitions, losses, predictions, batches and parameters - everything you need to build methods that actively work with the RL loop. 14 | If you want examples of how it is used, you can check out our RND implementation: 15 | ```python 16 | def get_reward(self, metrics): 17 | """Adapt LR on step. 18 | 19 | :param metrics: Dict of current metrics 20 | :return: 21 | """ 22 | if self.rnd_net is None: 23 | self.initialize_networks(metrics["transition"]["next_state"].shape[1:]) 24 | 25 | rnd_error = self.rnd_net.get_error(metrics["transition"]["next_state"]) 26 | metrics["transition"]["intrinsic_reward"] = ( 27 | self.internal_reward_weight * rnd_error 28 | ) 29 | metrics["transition"]["reward"] = ( 30 | metrics["transition"]["reward"] + self.internal_reward_weight * rnd_error 31 | ) 32 | return metrics 33 | ``` 34 | Here we read the next state from the metrics dictionary, predict state novelty from it and update the transition reward. 35 | We also add a new intrinsic reward key to enable logging. 36 | You can assume that most if not all relevant information is contained in the metrics dictionary at any given time. 
37 | It is also transmitted to many different Mighty components like the exploration policy, the buffer, the update function or to any meta-components. 38 | 39 | ## Interactions With The Main Loop 40 | Meta-components are classes with methods that can be called at different points in the learning loop. There are several different call positions and they are specified by the component itself: 41 | ```python 42 | def __init__(self) -> None: 43 | """Meta module init. 44 | 45 | :return: 46 | """ 47 | self.pre_step_methods = [] 48 | self.post_step_methods = [] 49 | self.pre_update_methods = [] 50 | self.post_update_methods = [] 51 | self.pre_episode_methods = [] 52 | self.post_episode_methods = [] 53 | ``` 54 | Each of these calls will receive the metrics dictionary, resulting in a very flexible type. 55 | Right now Mighty contains a few meta-components doing very different things, e.g.: 56 | 57 | - task scheduling/curriculum learning 58 | - hyperparameter scheduling 59 | - intrinsic rewards 60 | 61 | Meta-components are also stackable, i.e. you can run multiple ones per training run. 62 | In principle, you can do almost anything in a meta-component, including training additional networks or calling the policy directly. 63 | Before you default to using this class, however, we recommend double checking if your idea isn't better suited to a more specific class. 64 | 65 | ## Combining Components 66 | When combining different modules, they are stacked on top of one another. 67 | This means they are executed in order for each method. 68 | For meta components interacting with each other or the same parts of the base loop, this order can be important! 69 | If you, for example, use a curriculum based on training reward and intrinsic reward, you should likely configure the curriculum to be called first to avoid basing the difficulty on the reward bonus. -------------------------------------------------------------------------------- /docs/methods/outer_loops.md: -------------------------------------------------------------------------------- 1 | # Mighty Outer Loops 2 | Methods that interact with repeated runs of RL algorithms are our Mighty runners. These function a level above the standard RL training to modify the inner loop. On this page, you'll find information on their structure and what kind of usecases they cover. 3 | 4 | ## Runners 5 | Runners are a wrapper class around the agent and can interact with the full task spectrum, i.e. adapt agent and environment and run this combination for an arbitrary amount of steps. 
6 | The very basic online runner simply executes a task and evaluates the resulting policy: 7 | ```python 8 | class MightyOnlineRunner(MightyRunner): 9 | def run(self) -> Tuple[Dict, Dict]: 10 | train_results = self.train(self.num_steps) 11 | eval_results = self.evaluate() 12 | return train_results, eval_results 13 | ``` 14 | The ES runner, on the other hand, has a considerably longer 'run' function including multiple calls to versions of the agent: 15 | ```python 16 | def run(self) -> Tuple[Dict, Dict]: 17 | es_state = self.es.initialize(self.rng) 18 | for _ in range(self.iterations): 19 | rng_ask, _ = jax.random.split(self.rng, 2) 20 | x, es_state = self.es.ask(rng_ask, es_state) 21 | eval_rewards = [] 22 | for individual in x: 23 | if self.search_params: 24 | self.apply_parameters(individual[: self.total_n_params]) 25 | individual = individual[self.total_n_params :] 26 | for i, target in enumerate(self.search_targets): 27 | if target == "parameters": 28 | continue 29 | new_value = np.asarray(individual[i]).item() 30 | if target in ["_batch_size", "n_units"]: 31 | new_value = max(0, int(new_value)) 32 | setattr(self.agent, target, new_value) 33 | if self.train_agent: 34 | self.train(self.num_steps_per_iteration) 35 | eval_results = self.evaluate() 36 | eval_rewards.append(eval_results["mean_eval_reward"]) 37 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 38 | es_state = self.es.tell(x, fitness, es_state) 39 | eval_results = self.evaluate() 40 | return {"step": self.iterations}, eval_results 41 | ``` 42 | Conceptually, you should think of runners creating new RL tasks, that is combinations of environment and agent, to achieve some goal. 43 | This can be meta-learning, hyperparameter optimization and more. 44 | 45 | ## Information Flow 46 | Runners don't interact with the inner loop directly, but primarily via the agent class interface. 47 | Running and evaluation the agent are the two most important function calls, but runners can also utilize the update and access buffers, environments, parameters and more. 48 | Thus, the information can be performance as well as much of the algorithm state after execution. 49 | Notably, runners can also access meta components, enabling hybrid approaches inner loops that span multiple outer loops. -------------------------------------------------------------------------------- /docs/package_structure.md: -------------------------------------------------------------------------------- 1 | Mighty is desined to be highly modular, enabling access to the RL loop on different levels. This means it's not designed to be the absolute fastest way to run RL, but the most convenient one to apply different sorts of RL, MetaRL and AutoRL methods. As such, there are a few things you should know about the structure of Mighty. 2 | 3 | ### For Multiple Inner Runs: Mighty Runners 4 | Mighty uses runner classes to control the outer training loop. In the simplest case, a runner will just directly call the agent's train and evaluation functions without any changes: 5 | 6 | ```python 7 | def run(self) -> Tuple[Dict, Dict]: 8 | train_results = self.train(self.num_steps) 9 | eval_results = self.evaluate() 10 | return train_results, eval_results 11 | ``` 12 | This will result in a standard RL agent training run. Of course, we can at this point also run agents multiple times, make changes to their setup (hyperparameters, weights, environments) and integrate learning on this meta-level. 
13 | A still fairly simple example is our ESRunner for outer loops with Evolutionary Strategies: 14 | 15 | ```python 16 | def run(self) -> Tuple[Dict, Dict]: 17 | es_state = self.es.initialize(self.rng) 18 | for _ in range(self.iterations): 19 | rng_ask, _ = jax.random.split(self.rng, 2) 20 | x, es_state = self.es.ask(rng_ask, es_state) 21 | eval_rewards = [] 22 | 23 | for individual in x: 24 | if self.search_params: 25 | self.apply_parameters(individual[: self.total_n_params]) 26 | individual = individual[self.total_n_params :] 27 | 28 | for i, target in enumerate(self.search_targets): 29 | if target == "parameters": 30 | continue 31 | new_value = np.asarray(individual[i]).item() 32 | if target in ["_batch_size", "n_units"]: 33 | new_value = max(0, int(new_value)) 34 | setattr(self.agent, target, new_value) 35 | 36 | if self.train_agent: 37 | self.train(self.num_steps_per_iteration) 38 | 39 | eval_results = self.evaluate() 40 | eval_rewards.append(eval_results["mean_eval_reward"]) 41 | 42 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 43 | es_state = self.es.tell(x, fitness, es_state) 44 | 45 | eval_results = self.evaluate() 46 | return {"step": self.iterations}, eval_results 47 | ``` 48 | Here we can change all sorts of things about the agent, train in between or only evaluate and use the ES to get fresh inputs. Runner classes are defined with these multiple evaluations of RL tasks in mind, i.e. these classes will usually train multiple agents, reset their policies completely or otherwise start over at some point. 49 | 50 | ### For In-The-Loop Methods: Mighty Meta Modules 51 | 52 | Not all Meta- or AutoRL methods operate in an outer loop, however. For the ones that configure training while it is still ongoing, we use the Mighty Meta Modules. 53 | These are classes that maintain lists of function calls to make at different points in training: 54 | 55 | ```python 56 | def __init__(self) -> None: 57 | """Meta module init. 58 | 59 | :return: 60 | """ 61 | self.pre_step_methods = [] 62 | self.post_step_methods = [] 63 | self.pre_update_methods = [] 64 | self.post_update_methods = [] 65 | self.pre_episode_methods = [] 66 | self.post_episode_methods = [] 67 | ``` 68 | This gives meta modules a lot of flexibility of when to act upon training. Additionally, each of these function calls is given a "metrics" dictionary. This dictionary contains most, if not all, relevant information about training progress, e.g.: 69 | 70 | - the last transitions 71 | - the last losses, errors and predictions 72 | - policy, Q- and value-networks 73 | - hyperparameters 74 | 75 | This means meta modules can use everything from the current timestep to agent predictions. 76 | 77 | 78 | ### Algorithm Components: Mighty Exploration, Buffers and Updates 79 | 80 | The Mighty algorithms themselves also have modules which can be easily switched. These are exploration policies, buffers and update classes. 81 | Exploration policies and buffers furthermore have access to the same metrics dictionary as meta modules, meaning you can get creative as to what they do with this information. 82 | The way they are used in the RL loop is fixed, however, such that these are a bit more streamlined than the completely free meta-modules. 83 | 84 | 85 | ### Inside the Agent: Mighty Models 86 | 87 | Agent loops outside of exploration, buffers and updates are harder to alter in Mighty, since Mighty is primarily focused on meta-methods. 88 | You can control the network architecture of your agent fairly easily, however. 
89 | There are two principal avenues for this:
90 | 
91 | 1. You can use one of the pre-defined Mighty Models and configure it to use a different network architecture in the config. We use torch internally, which means you can arrange torch.nn layers and activations in different parts of these networks to form a custom architecture.
92 | 2. If you also want to customize what exactly the network predicts or add things like frozen weights, you probably want to implement your own Mighty Model. These always contain a 'feature_extractor' as a base and can vary beyond that.
--------------------------------------------------------------------------------
/docs/usecases/Contextual_RL.md:
--------------------------------------------------------------------------------
1 | ### What Is Contextual RL?
2 | 
3 | Most RL environments are either not concerned with generalization at all or test generalization performance without providing much insight into what agents are tested on,
4 | e.g. by using procedurally generated levels that are hard to understand as a structured training or test distribution.
5 | Contextual RL (or cRL) [[Hallak et al., CoRR 2015](https://arxiv.org/pdf/1502.02259.pdf), [Benjamins et al., CoRR 2022](https://arxiv.org/pdf/2202.04500.pdf)] aims to make the task distributions agents are trained on as specific as possible in order to gain better insights into where agents perform well and what is currently missing in RL generalization.
6 | 
7 | ### Contextual RL With CARL
8 | 
9 | [CARL (context adaptive RL)](https://github.com/automl/CARL) (see [Benjamins et al., EcoRL 2021]() for more information) is a benchmark library specifically designed for contextual RL.
10 | It provides highly configurable contextual extensions to several well-known RL environments and is what we recommend to get started in cRL.
11 | Mighty is designed with contextual RL in mind and is therefore fully compatible with CARL.
12 | 
13 | The training works similarly to a standard RL environment, but now you can specify the training and test distributions, for example 10 variations of gravity for CartPole from CARL's default distribution:
14 | 
15 | ```bash
16 | python mighty/run_mighty.py 'algorithm=dqn' 'env=CARLCartPoleEnv' '+env_kwargs.num_contexts=10' '+env_kwargs.context_feature_args=[gravity]'
17 | ```
18 | 
19 | Other CARL options can be passed similarly via env_kwargs, though we recommend checking out the CARL examples to get a better idea of how to define these distributions.
--------------------------------------------------------------------------------
/docs/usecases/DAC.md:
--------------------------------------------------------------------------------
1 | ### What Is Dynamic Algorithm Configuration?
2 | Dynamic Algorithm Configuration (DAC) [[Biedenkapp et al., ECAI 2020](https://ml.informatik.uni-freiburg.de/wp-content/uploads/papers/20-ECAI-DAC.pdf), [Adriaensen et al., JAIR 2022](https://arxiv.org/pdf/2205.13881.pdf)]
3 | is a hyperparameter optimization paradigm aiming to find the best possible hyperparameter configuration for a given *algorithm instance* at every *timestep* during runtime.
4 | DAC can easily be modelled as a contextual MDP and is thus a real-world application of RL.
5 | 
6 | 
7 | ### Dynamic Algorithm Configuration with Mighty
8 | In order to interface with configurable algorithms, we recommend [DACBench](https://github.com/automl/DACBench).
9 | It provides algorithms from different fields as well as artificial benchmarks, all with the OpenAI gym interface.
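As a rough illustration of that gym-style interface, a DACBench benchmark can be instantiated and stepped like any other environment. The snippet below is only a sketch: the benchmark class and `get_environment()` call follow the DACBench documentation, but the exact reset/step signatures depend on the installed DACBench and gym/gymnasium versions, so treat the details as assumptions.

```python
# Hypothetical usage sketch (not taken from the Mighty docs); consult the
# DACBench documentation for the exact API of your installed version.
from dacbench.benchmarks import FunctionApproximationBenchmark

bench = FunctionApproximationBenchmark()
env = bench.get_environment()  # gym-style DAC environment

state, info = env.reset()  # older releases may return only the state
action = env.action_space.sample()  # a hyperparameter choice for this step
state, reward, terminated, truncated, info = env.step(action)  # 4-tuple on older gym APIs
```

Mighty wraps this kind of environment for you when you pass the benchmark name on the command line, as shown below.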
10 | 
11 | Select the benchmark you want to run, for example the FunctionApproximationBenchmark, and provide it as the "env" keyword:
12 | 
13 | ```bash
14 | python mighty/run_mighty.py 'algorithm=dqn' 'env=FunctionApproximationBenchmark'
15 | ```
16 | The naming here will make Mighty autodetect it as a DACBench environment.
17 | 
18 | The benchmarks in DACBench have many configuration options. You can use your hydra configs to include your changes; simply add them to the env_kwargs like this:
19 | 
20 | ```bash
21 | python run_mighty.py 'algorithm=dqn' 'env=FunctionApproximationBenchmark' '+env_kwargs.dimension=3'
22 | ```
23 | 
24 | To see the full options for DACBench environments, refer to the [DACBench documentation](https://automl.github.io/DACBench/main/index.html).
--------------------------------------------------------------------------------
/docs/usecases/Standard_RL.md:
--------------------------------------------------------------------------------
1 | If you want to use Mighty on standard RL environments, you can choose between the [Gymnasium](https://gymnasium.farama.org/) interface and [Pufferlib](https://puffer.ai/) as a fast alternative.
2 | Generally we recommend you use Pufferlib where possible, but the choice is yours!
3 | 
4 | ### Mighty on Gymnasium Environments
5 | 
6 | Mighty can be used as a standard RL library for all environments that follow the Gymnasium interface.
7 | In order to run a Mighty Agent, use the run_mighty.py script and provide any training options as keywords. If you want to know more about the configuration options, call:
8 | 
9 | ```bash
10 | python mighty/run_mighty.py --help
11 | ```
12 | 
13 | An example for running the PPO agent on the Pendulum gym environment for 1000 steps looks like this:
14 | 
15 | ```bash
16 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=Pendulum-v1'
17 | ```
18 | 
19 | We assume that if you don't specify anything beyond the name, you want to use Gymnasium. This will also work for environments that are registered with Gymnasium upon installation, e.g. [Gymnasium Robotics](https://robotics.farama.org/) and others.
20 | You can assume that specifying the environment name like this works just like "gym.make()".
21 | 
22 | ### Mighty on Pufferlib Environments
23 | Pufferlib offers an efficient way to parallelize environment evaluations for a wide selection of tasks. Many well-known Gymnasium environments or benchmarks like [ProcGen]() are included in Pufferlib and we recommend it as a default for these.
24 | Running Pufferlib environments is very similar to running Gymnasium environments; you only need to add the pufferlib domain:
25 | 
26 | ```bash
27 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=pufferlib.environments.procgen.bigfish'
28 | ```
29 | 
30 | We have some example configs where the env domain is pre-configured and you can override the name only.
An example for minigrid would be:
31 | 
32 | ```yaml
33 | env_name: MiniGrid-DoorKey-8x8-v0 # Override with names, e.g. MiniGrid-LavaGapS5-v0, MiniGrid-DoorKey-8x8-v0, MiniGrid-ObstructedMaze-1Dl-v0, MiniGrid-KeyCorridorS3R2-v0, MiniGrid-UnlockPickup-v0
34 | env: pufferlib.environments.minigrid.${env_name}
35 | env_kwargs: {}
36 | ```
37 | 
38 | This means you can use this configuration (let's call it pufferlib_minigrid) with just the env name, similar to above:
39 | ```bash
40 | python mighty/run_mighty.py 'num_steps=1000' 'algorithm=ppo' 'env=pufferlib_minigrid' 'env_name=MiniGrid-LavaGapS5-v0'
41 | ```
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/__init__.py
--------------------------------------------------------------------------------
/examples/custom_exploration_scheduler.py:
--------------------------------------------------------------------------------
1 | """Epsilon Greedy Scheduler."""
2 | 
3 | from __future__ import annotations
4 | from mighty.mighty_meta.mighty_component import MightyMetaComponent
5 | 
6 | 
7 | class EpsilonSchedule(MightyMetaComponent):
8 |     """Linear epsilon decay schedule for epsilon-greedy exploration."""
9 | 
10 |     def __init__(
11 |         self,
12 |         initial_epsilon=1.0,
13 |         num_decay_steps=40000,
14 |         target_epsilon=0.01
15 |     ) -> None:
16 |         """Epsilon schedule initialization.
17 | 
18 |         :param initial_epsilon: Initial maximal epsilon
19 |         :param num_decay_steps: Length of schedule in steps
20 |         :param target_epsilon: Minimal epsilon
21 |         :return:
22 |         """
23 |         super().__init__()
24 |         self.initial_epsilon = initial_epsilon
25 |         self.target_epsilon = target_epsilon
26 |         self.num_decay_steps = num_decay_steps
27 |         self.pre_step_methods = [self.adapt_epsilon]
28 | 
29 |     def adapt_epsilon(self, metrics):
30 |         """Adapt epsilon on step.
31 | 
32 |         :param metrics: Dict of current metrics
33 |         :return:
34 |         """
35 |         current_epsilon = self.initial_epsilon - (
36 |             (self.initial_epsilon - self.target_epsilon)
37 |             * metrics["step"]
38 |             / self.num_decay_steps
39 |         )
40 |         # Clamp at the minimal epsilon once the decay is finished.
41 |         current_epsilon = max(self.target_epsilon, current_epsilon)
42 |         metrics["hp/pi_epsilon"] = current_epsilon
--------------------------------------------------------------------------------
/examples/custom_policy.py:
--------------------------------------------------------------------------------
1 | """UCB exploration for DQN."""
2 | 
3 | from __future__ import annotations
4 | 
5 | import numpy as np
6 | 
7 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy
8 | 
9 | 
10 | class QValueUCB(MightyExplorationPolicy):
11 |     """Exploration via UCB for DQN."""
12 | 
13 |     def __init__(
14 |         self,
15 |         algo,
16 |         model,
17 |         constant=2,
18 |     ):
19 |         """Initialize UCB.
20 | 
21 |         :param algo: algorithm name
22 |         :param model: model providing Q-values
23 |         :param constant: c constant for UCB
24 |         :return:
25 |         """
26 |         super().__init__(algo, model)
27 |         self.c = constant
28 |         self.action_selected_count = np.zeros(model.num_actions)
29 | 
30 |     def explore(self, s, return_logp, metrics):
31 |         """Explore.
32 | 33 | :param s: state 34 | :param return_logp: return logprobs 35 | :param metrics: metrics dictionary 36 | :return: action or (action, logprobs) 37 | """ 38 | # Get Q-values 39 | _, qvals = self.sample_action(s) 40 | # Calculate UCB bonus 41 | ucb_bonus = self.c*np.sqrt(np.log(metrics["step"] + 1)/(self.action_selected_count + 1e-4)) 42 | # Add bonus and selection actions 43 | ucb_actions = np.argmax(qvals.detach().numpy() + ucb_bonus, axis=1) 44 | # Update action counter 45 | if isinstance(ucb_actions, np.ndarray): 46 | for action in ucb_actions: 47 | self.action_selected_count[action] += 1 48 | else: 49 | self.action_selected_count[ucb_actions] += 1 50 | ucb_actions = np.array([ucb_actions]) 51 | return (ucb_actions, qvals) if return_logp else ucb_actions -------------------------------------------------------------------------------- /examples/hypersweeper_smac_example_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - override hydra/job_logging: colorlog 4 | - override hydra/hydra_logging: colorlog 5 | - override hydra/sweeper: HyperSMAC 6 | 7 | runner: standard 8 | debug: false 9 | seed: 0 10 | output_dir: examples/hypersweeper_example_output 11 | wandb_project: null 12 | tensorboard_file: null 13 | experiment_name: optuna_tuning_example 14 | num_steps: 50_000 15 | env: pufferlib.ocean.bandit 16 | env_kwargs: {} 17 | env_wrappers: [] 18 | num_envs: 64 19 | 20 | # @package _global_ 21 | algorithm: PPO 22 | 23 | algorithm_kwargs: 24 | # Hyperparameters 25 | n_policy_units: 128 26 | n_critic_units: 128 27 | soft_update_weight: 0.01 28 | 29 | rollout_buffer_class: 30 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 31 | rollout_buffer_kwargs: 32 | buffer_size: 4096 # Size of the rollout buffer. 33 | gamma: 0.99 # Discount factor for future rewards. 34 | gae_lambda: 0.95 # GAE lambda. 35 | obs_shape: ??? # Placeholder for observation shape 36 | act_dim: ??? # Placeholder for action dimension 37 | n_envs: ??? 38 | 39 | 40 | # Training 41 | learning_rate: 3e-4 42 | batch_size: 1024 # Batch size for training. 43 | gamma: 0.99 # The amount by which to discount future rewards. 44 | n_gradient_steps: 3 # Number of epochs for updating policy. 45 | ppo_clip: 0.2 # Clipping parameter for PPO. 46 | value_loss_coef: 0.5 # Coefficient for value loss. 47 | entropy_coef: 0.01 # Coefficient for entropy loss. 48 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 49 | 50 | 51 | hidden_sizes: [64, 64] 52 | activation: 'tanh' 53 | 54 | n_epochs: 10 55 | minibatch_size: 64 56 | kl_target: 0.01 57 | use_value_clip: True 58 | value_clip_eps: 0.2 59 | 60 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 61 | policy_kwargs: 62 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 63 | 64 | # Training 65 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
66 | n_episodes_eval: 10 67 | checkpoint: null # Path to load model checkpoint 68 | save_model_every_n_steps: 5e5 69 | 70 | hydra: 71 | run: 72 | dir: ${output_dir}/${experiment_name}_${seed} 73 | sweep: 74 | dir: ${output_dir}/${experiment_name}_${seed} 75 | sweeper: 76 | n_trials: 20 77 | budget_variable: num_steps 78 | sweeper_kwargs: 79 | optimizer_kwargs: 80 | smac_facade: 81 | _target_: smac.facade.multi_fidelity_facade.MultiFidelityFacade 82 | _partial_: true 83 | intensifier: 84 | _target_: smac.facade.multi_fidelity_facade.MultiFidelityFacade.get_intensifier 85 | _partial_: true 86 | eta: 3 87 | scenario: 88 | n_trials: ${hydra.sweeper.n_trials} 89 | seed: ${seed} 90 | min_budget: 5000 91 | max_budget: 50000 92 | deterministic: true 93 | n_workers: 1 94 | output_directory: ${hydra.sweep.dir} 95 | search_space: 96 | hyperparameters: 97 | algorithm_kwargs.learning_rate: 98 | type: uniform_float 99 | lower: 1e-5 100 | upper: 1e-3 101 | log: true 102 | algorithm_kwargs.batch_size: 103 | type: uniform_int 104 | lower: 8 105 | upper: 128 -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/0/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 0 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/0/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=0 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.59375 1.275 1.59375 0.49038462 1.59375 0.98076923 3 | 0.91071429 1.5 0.72857143 0.87931034]",25.5,None 4 | 1,20000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.14 1.62857143 0.46530612 0.87692308 2.07272727 1.03636364 5 | 1.52 0.81428571 0.84444444 1.425 ]",22.8,None 6 | 2,30000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.53157895 2.23846154 0.67674419 2.425 0.74615385 1.38571429 7 | 0.66136364 2.07857143 1.81875 0.41571429]",29.1,None 8 | 3,40000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.52380952 2.13333333 0.82051282 2.28571429 0.7804878 0.91428571 9 | 1.23076923 1.23076923 0.41558442 1.23076923]",32.0,None 10 | 4,50000,0,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[0.63815789 0.51595745 2.30952381 0.65540541 1.515625 0.76984127 11 | 1.94 1.03191489 1.515625 2.30952381]",48.5,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_0/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_0/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/1/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 1 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/1/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=1 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- 
/examples/multiple_runs/mighty_experiment_1/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.81071429 0.84074074 0.43653846 1.26111111 1.51333333 1.03181818 3 | 1.74615385 2.52222222 1.51333333 0.81071429]",22.7,None 4 | 1,20000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.07272727 2.36 1.38823529 1.12380952 0.944 1.07272727 5 | 0.69411765 1.24210526 0.48163265 1.38823529]",23.6,None 6 | 2,30000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.97777778 0.60338983 1.87368421 1.97777778 1.36923077 0.58360656 7 | 1.1483871 0.77391304 0.91282051 0.91282051]",35.6,None 8 | 3,40000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.45714286 1.85454545 1.02 1.13333333 1.2 1.7 9 | 1.36 0.41632653 1.36 0.61818182]",20.4,None 10 | 4,50000,1,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.34065041 1.82173913 0.87291667 3.49166667 2.61875 1.26969697 11 | 0.91086957 3.22307692 2.46470588 0.47613636]",41.9,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_1/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_1/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/2/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 2 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/2/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=2 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.256 0.95151515 2.85454545 0.92352941 1.36521739 1.01290323 3 | 0.45507246 0.95151515 0.73023256 2.61666667]",31.4,None 4 | 1,20000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.41538462 1.53333333 0.47179487 1.02222222 2.04444444 1.53333333 5 | 1.02222222 0.96842105 0.87619048 0.8 ]",18.4,None 6 | 2,30000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.26363636 1.46470588 0.996 1.46470588 0.59285714 2.26363636 7 | 0.46111111 1.77857143 1.38333333 0.6225 ]",24.9,None 8 | 3,40000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.37333333 0.54210526 1.37333333 1.14444444 1.47142857 2.28888889 9 | 0.64375 0.50243902 2.06 1.47142857]",20.6,None 10 | 4,50000,2,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[0.7 1.11363636 0.79032258 0.79032258 1.63333333 2.45 11 | 0.74242424 1.225 0.81666667 1.36111111]",24.5,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_2/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_2/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/3/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 3 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/3/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=3 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/eval_results.csv: 
-------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.82666667 2.25454545 1.55 1.12727273 1.77142857 0.992 3 | 1.37777778 0.44285714 0.52765957 2.75555556]",24.8,None 4 | 1,20000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.88421053 0.88421053 1.52727273 0.76363636 1.68 0.88421053 5 | 1.2 0.88421053 0.98823529 0.93333333]",16.8,None 6 | 2,30000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.92105263 1.94444444 1.09375 1.25 0.5 1.25 7 | 0.67307692 1.02941176 1.94444444 1.09375 ]",17.5,None 8 | 3,40000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.1625 2.06666667 1.28275862 1.77142857 0.64137931 0.88571429 9 | 0.6 1.24 0.55522388 2.86153846]",37.2,None 10 | 4,50000,3,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.07058824 1.03529412 0.66415094 0.8 1.46666667 0.95135135 11 | 0.54153846 1.1 2.51428571 1.1 ]",35.2,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_3/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_3/results.npz -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/4/.hydra/config.yaml: -------------------------------------------------------------------------------- 1 | runner: standard 2 | debug: false 3 | seed: 4 4 | output_dir: examples/multiple_runs 5 | wandb_project: null 6 | tensorboard_file: null 7 | experiment_name: mighty_experiment 8 | algorithm_kwargs: 9 | n_policy_units: 128 10 | n_critic_units: 128 11 | soft_update_weight: 0.01 12 | rollout_buffer_class: 13 | _target_: mighty.mighty_replay.MightyRolloutBuffer 14 | rollout_buffer_kwargs: 15 | buffer_size: 4096 16 | gamma: 0.99 17 | gae_lambda: 0.95 18 | obs_shape: ??? 19 | act_dim: ??? 20 | n_envs: ??? 
21 | learning_rate: 0.0003 22 | batch_size: 1024 23 | gamma: 0.99 24 | n_gradient_steps: 3 25 | ppo_clip: 0.2 26 | value_loss_coef: 0.5 27 | entropy_coef: 0.01 28 | max_grad_norm: 0.5 29 | hidden_sizes: 30 | - 64 31 | - 64 32 | activation: tanh 33 | n_epochs: 10 34 | minibatch_size: 64 35 | kl_target: 0.01 36 | use_value_clip: true 37 | value_clip_eps: 0.2 38 | policy_class: mighty.mighty_exploration.StochasticPolicy 39 | policy_kwargs: 40 | entropy_coefficient: 0.0 41 | eval_every_n_steps: 10000.0 42 | n_episodes_eval: 10 43 | checkpoint: null 44 | save_model_every_n_steps: 500000.0 45 | algorithm: PPO 46 | num_steps: 50000 47 | env: CartPole-v1 48 | env_kwargs: {} 49 | env_wrappers: [] 50 | num_envs: 10 51 | search_space: 52 | hyperparameters: 53 | algorithm_kwargs.learning_rate: 54 | type: uniform_float 55 | lower: 1.0e-06 56 | upper: 0.01 57 | log: true 58 | default_value: 0.005 59 | algorithm_kwargs.epsilon: 60 | type: uniform_float 61 | lower: 0.01 62 | upper: 0.25 63 | default_value: 0.1 64 | algorithm_kwargs.batch_size: 65 | type: categorical 66 | choices: 67 | - 32 68 | - 64 69 | - 128 70 | - 256 71 | default_value: 32 72 | algorithm_kwargs.soft_update_weight: 73 | type: uniform_float 74 | lower: 0.01 75 | upper: 1.0 76 | log: true 77 | default_value: 1.0 78 | algorithm_kwargs.td_update_class: 79 | type: categorical 80 | choices: 81 | - mighty.mighty_update.QLearning 82 | - mighty.mighty_update.DoubleQLearning 83 | default_value: mighty.mighty_update.DoubleQLearning 84 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/4/.hydra/overrides.yaml: -------------------------------------------------------------------------------- 1 | - env=CartPole-v1 2 | - num_steps=50000 3 | - num_envs=10 4 | - seed=4 5 | - output_dir=examples/multiple_runs 6 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/eval_results.csv: -------------------------------------------------------------------------------- 1 | ,step,seed,eval_episodes,mean_eval_step_reward,mean_eval_reward,instance 2 | 0,10000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.01 0.96190476 1.83636364 0.808 0.91818182 0.72142857 3 | 0.87826087 1.44285714 1.18823529 0.96190476]",20.2,None 4 | 1,20000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[1.98823529 1.46956522 1.20714286 1.98823529 2.25333333 0.6627451 5 | 2.6 0.76818182 2.81666667 0.28644068]",33.8,None 6 | 2,30000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[0.971875 0.7775 0.45735294 1.29583333 1.82941176 1.94375 7 | 1.29583333 0.60980392 2.39230769 1.19615385]",31.1,None 8 | 3,40000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.],"[2.05384615 1.02692308 1.57058824 0.98888889 1.90714286 0.98888889 9 | 0.70263158 0.45254237 0.78529412 2.225 ]",26.7,None 10 | 4,50000,4,[1. 1. 1. 1. 1. 1. 1. 1. 1. 
1.],"[1.14137931 2.06875 0.59107143 1.43913043 0.50923077 0.76976744 11 | 1.43913043 3.31 0.70425532 1.74210526]",33.1,None 12 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/hyperparameters.csv: -------------------------------------------------------------------------------- 1 | ,step,hp/lr,hp/pi_epsilon,hp/batch_size,hp/learning_starts,meta_modules 2 | 0,0,0.0003,0.1,1024,1,[] 3 | -------------------------------------------------------------------------------- /examples/multiple_runs/mighty_experiment_4/results.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/examples/multiple_runs/mighty_experiment_4/results.npz -------------------------------------------------------------------------------- /examples/optuna_example_config.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - override hydra/job_logging: colorlog 4 | - override hydra/hydra_logging: colorlog 5 | - override hydra/sweeper: optuna 6 | - override hydra/sweeper/sampler: tpe 7 | 8 | runner: standard 9 | debug: false 10 | seed: 0 11 | output_dir: examples/optuna_example_output 12 | wandb_project: null 13 | tensorboard_file: null 14 | experiment_name: optuna_tuning_example 15 | num_steps: 50_000 16 | env: pufferlib.ocean.bandit 17 | env_kwargs: {} 18 | env_wrappers: [] 19 | num_envs: 64 20 | 21 | # @package _global_ 22 | algorithm: PPO 23 | 24 | algorithm_kwargs: 25 | # Hyperparameters 26 | n_policy_units: 128 27 | n_critic_units: 128 28 | soft_update_weight: 0.01 29 | 30 | rollout_buffer_class: 31 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 32 | rollout_buffer_kwargs: 33 | buffer_size: 4096 # Size of the rollout buffer. 34 | gamma: 0.99 # Discount factor for future rewards. 35 | gae_lambda: 0.95 # GAE lambda. 36 | obs_shape: ??? # Placeholder for observation shape 37 | act_dim: ??? # Placeholder for action dimension 38 | n_envs: ??? 39 | 40 | 41 | # Training 42 | learning_rate: 3e-4 43 | batch_size: 1024 # Batch size for training. 44 | gamma: 0.99 # The amount by which to discount future rewards. 45 | n_gradient_steps: 3 # Number of epochs for updating policy. 46 | ppo_clip: 0.2 # Clipping parameter for PPO. 47 | value_loss_coef: 0.5 # Coefficient for value loss. 48 | entropy_coef: 0.01 # Coefficient for entropy loss. 49 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 50 | 51 | 52 | hidden_sizes: [64, 64] 53 | activation: 'tanh' 54 | 55 | n_epochs: 10 56 | minibatch_size: 64 57 | kl_target: 0.01 58 | use_value_clip: True 59 | value_clip_eps: 0.2 60 | 61 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 62 | policy_kwargs: 63 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 64 | 65 | # Training 66 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
67 | n_episodes_eval: 10 68 | checkpoint: null # Path to load model checkpoint 69 | save_model_every_n_steps: 5e5 70 | 71 | hydra: 72 | run: 73 | dir: ${output_dir}/${experiment_name}_${seed} 74 | sweep: 75 | dir: ${output_dir}/${experiment_name}_${seed} 76 | sweeper: 77 | sampler: 78 | seed: 123 79 | direction: maximize 80 | study_name: optuna_tuning_example 81 | storage: null 82 | n_trials: 20 83 | n_jobs: 1 84 | params: 85 | algorithm_kwargs.learning_rate: range(0.0001, 0.05) 86 | algorithm_kwargs.batch_size: range(8, 128) -------------------------------------------------------------------------------- /mighty/__init__.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | name = "Mighty" 4 | package_name = "mighty" 5 | author = "Todo" 6 | author_email = "Todo" 7 | description = "No description given" 8 | url = "https://www.automl.org" 9 | project_urls = { 10 | "Documentation": "https://automl.github.io/Mighty/main", 11 | "Source Code": "https://github.com/automl/mighty", 12 | } 13 | copyright = f"Copyright {datetime.date.today().strftime('%Y')}, AutoML" 14 | version = "0.0.1" 15 | -------------------------------------------------------------------------------- /mighty/configs/algorithm/atari_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | 9 | replay_buffer_class: 10 | _target_: mighty.mighty_replay.PrioritizedReplay 11 | replay_buffer_kwargs: 12 | capacity: 1000000 # Maximum size of replay buffer. 13 | 14 | gamma: 0.9 # The amount by which to discount future rewards. 15 | 16 | # Training 17 | learning_rate: 0.001 18 | batch_size: 64 # Batch size for training. 19 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 20 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 21 | q_kwargs: 22 | dueling: False 23 | feature_extractor_kwargs: 24 | architecture: [cnn, mlp] 25 | n_convolutions: 3 26 | out_channels: [16, 32, 64] 27 | sizes: [[2, 2], [2,2], [2,2]] 28 | conv_dim: 2 29 | flatten_cnn: True 30 | n_layers: 1 31 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/ddqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Core architecture / model 7 | n_units: 64 8 | q_kwargs: 9 | dueling: True 10 | feature_extractor_kwargs: 11 | architecture: mlp 12 | n_layers: 1 13 | hidden_sizes: [64] 14 | head_kwargs: 15 | hidden_sizes: [64] 16 | 17 | # Exploration (decaying ε‐greedy) 18 | policy_class: 19 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 20 | policy_kwargs: 21 | epsilon_start: 1.0 22 | epsilon_final: 0.05 23 | epsilon_decay_steps: 320000 24 | 25 | # Replay‐buffer settings 26 | replay_buffer_class: 27 | _target_: mighty.mighty_replay.PrioritizedReplay 28 | replay_buffer_kwargs: 29 | capacity: 250000 30 | alpha: 0.6 31 | beta: 0.4 32 | epsilon: 1e-6 33 | device: "cpu" 34 | obs_shape: ??? # ← will be auto-filled at runtime 35 | action_shape: ??? 
# ← will be auto-filled at runtime 36 | 37 | # Training hyperparameters 38 | learning_rate: 3e-4 39 | batch_size: 64 40 | gamma: 0.97 41 | learning_starts: 64000 # wait 1k transitions before training 42 | 43 | # Target‐network / updating (hard update every 1k ∇‐steps) 44 | use_target: True 45 | soft_update_weight: 0.1 46 | target_update_freq: null 47 | 48 | # Double DQN update 49 | td_update_class: mighty.mighty_update.DoubleQLearning 50 | 51 | td_update_kwargs: 52 | gamma: 0.97 53 | optimizer_class: 54 | _target_: torch.optim.Adam 55 | optimizer_kwargs: 56 | lr: 5e-5 57 | weight_decay: 1e-5 58 | eps: 1e-6 59 | max_grad_norm: 10.0 60 | 61 | # Checkpointing 62 | save_replay: False 63 | n_gradient_steps: 1 -------------------------------------------------------------------------------- /mighty/configs/algorithm/dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Core architecture / model 7 | n_units: 256 8 | q_kwargs: 9 | dueling: False 10 | feature_extractor_kwargs: 11 | architecture: mlp 12 | n_layers: 1 13 | hidden_sizes: [256] 14 | head_kwargs: 15 | hidden_sizes: [256] 16 | 17 | # Exploration (decaying ε‐greedy) 18 | policy_class: 19 | _target_: mighty.mighty_exploration.DecayingEpsilonGreedy 20 | policy_kwargs: 21 | epsilon_start: 1.0 22 | epsilon_final: 0.04 23 | epsilon_decay_steps: 8000 24 | 25 | # Replay‐buffer settings 26 | replay_buffer_class: 27 | _target_: mighty.mighty_replay.MightyReplay 28 | replay_buffer_kwargs: 29 | capacity: 100000 30 | 31 | # Training hyperparameters 32 | learning_rate: 2.3e-3 33 | batch_size: 128 34 | gamma: 0.99 35 | learning_starts: 1000 # wait 1k transitions before training 36 | 37 | # Target‐network / updating (hard update every 1k ∇‐steps) 38 | use_target: True 39 | soft_update_weight: 0.005 40 | target_update_freq: null 41 | 42 | # Double DQN update 43 | td_update_class: mighty.mighty_update.QLearning 44 | 45 | td_update_kwargs: 46 | gamma: 0.99 47 | optimizer_class: 48 | _target_: torch.optim.Adam 49 | optimizer_kwargs: 50 | lr: 2.3e-3 51 | weight_decay: 1e-5 52 | eps: 1e-6 53 | max_grad_norm: 10.0 54 | 55 | # Checkpointing 56 | save_replay: False 57 | n_gradient_steps: 128 -------------------------------------------------------------------------------- /mighty/configs/algorithm/minigrid_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | td_update_class: mighty.mighty_update.DoubleQLearning 9 | 10 | replay_buffer_class: 11 | _target_: mighty.mighty_replay.PrioritizedReplay 12 | replay_buffer_kwargs: 13 | capacity: 1000000 # Maximum size of replay buffer. 14 | 15 | gamma: 0.9 # The amount by which to discount future rewards. 16 | 17 | # Training 18 | learning_rate: 0.001 19 | batch_size: 64 # Batch size for training. 20 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 21 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 
22 | q_kwargs: 23 | dueling: False 24 | feature_extractor_kwargs: 25 | architecture: [cnn, mlp] 26 | n_convolutions: 3 27 | out_channels: [16, 32, 64] 28 | sizes: [[2, 2], [2,2], [2,2]] 29 | conv_dim: 2 30 | flatten_cnn: True 31 | n_layers: 1 32 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/ppo.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: PPO 3 | 4 | algorithm_kwargs: 5 | # Hyperparameters 6 | n_policy_units: 64 7 | n_critic_units: 64 8 | soft_update_weight: 0.01 9 | 10 | rollout_buffer_class: 11 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 12 | rollout_buffer_kwargs: 13 | buffer_size: 256 # Size of the rollout buffer. 14 | gamma: 0.98 # Discount factor for future rewards. 15 | gae_lambda: 0.8 # GAE lambda. 16 | obs_shape: ??? # Placeholder for observation shape 17 | act_dim: ??? # Placeholder for action dimension 18 | n_envs: ??? 19 | discrete_action: ??? # Placeholder for discrete action flag 20 | 21 | 22 | # Training 23 | learning_rate: 3e-4 24 | batch_size: 32 # Batch size for training. 25 | gamma: 0.99 # The amount by which to discount future rewards. 26 | ppo_clip: 0.2 # Clipping parameter for PPO. 27 | value_loss_coef: 0.5 # Coefficient for value loss. 28 | entropy_coef: 0.0 # Coefficient for entropy loss. 29 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 30 | 31 | 32 | hidden_sizes: [64] 33 | activation: 'tanh' 34 | 35 | n_epochs: 20 36 | minibatch_size: 256 37 | kl_target: 0.01 38 | use_value_clip: True 39 | value_clip_eps: 0.2 40 | 41 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 42 | policy_kwargs: 43 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. -------------------------------------------------------------------------------- /mighty/configs/algorithm/ppo_mountaincar.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: PPO 3 | 4 | algorithm_kwargs: 5 | # Hyperparameters 6 | n_policy_units: 64 7 | n_critic_units: 64 8 | soft_update_weight: 0.01 9 | 10 | rollout_buffer_class: 11 | _target_: mighty.mighty_replay.MightyRolloutBuffer # Using rollout buffer 12 | rollout_buffer_kwargs: 13 | buffer_size: 256 # Size of the rollout buffer. 14 | gamma: 0.99 # Discount factor for future rewards. 15 | gae_lambda: 0.98 # GAE lambda. 16 | obs_shape: ??? # Placeholder for observation shape 17 | act_dim: ??? # Placeholder for action dimension 18 | n_envs: ??? 19 | discrete_action: ??? # Placeholder for discrete action flag 20 | 21 | 22 | # Training 23 | learning_rate: 1e-3 24 | batch_size: 1024 # Batch size for training. 25 | gamma: 0.99 # The amount by which to discount future rewards. 26 | ppo_clip: 0.2 # Clipping parameter for PPO. 27 | value_loss_coef: 0.5 # Coefficient for value loss. 28 | entropy_coef: 0.0 # Coefficient for entropy loss. 29 | max_grad_norm: 0.5 # Maximum value for gradient clipping. 30 | 31 | 32 | hidden_sizes: [64] 33 | activation: 'tanh' 34 | 35 | n_epochs: 4 36 | minibatch_size: 256 37 | kl_target: 0.01 38 | use_value_clip: True 39 | value_clip_eps: 0.2 40 | 41 | policy_class: mighty.mighty_exploration.StochasticPolicy # Policy class for exploration 42 | policy_kwargs: 43 | entropy_coefficient: 0.0 # Coefficient for entropy-based exploration. 
44 | 45 | normalize_obs: True -------------------------------------------------------------------------------- /mighty/configs/algorithm/procgen_dqn.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: DQN 3 | q_func: ??? 4 | 5 | algorithm_kwargs: 6 | # Hyperparameters 7 | epsilon: 0.2 # Controls epsilon-greedy action selection in policy. 8 | td_update_class: mighty.mighty_update.DoubleQLearning 9 | 10 | replay_buffer_class: 11 | _target_: mighty.mighty_replay.PrioritizedReplay 12 | replay_buffer_kwargs: 13 | capacity: 1000000 # Maximum size of replay buffer. 14 | 15 | gamma: 0.9 # The amount by which to discount future rewards. 16 | 17 | # Training 18 | learning_rate: 0.001 19 | batch_size: 64 # Batch size for training. 20 | # begin_updating_weights: 1 # Begin updating policy weights after this many observed transitions. 21 | soft_update_weight: 0.01 # If we set :math:`\tau=1` we do a hard update. If we pick a smaller value, we do a smooth update. 22 | q_kwargs: 23 | dueling: False 24 | feature_extractor_kwargs: 25 | architecture: resnet 26 | head_kwargs: 27 | hidden_sizes: [512] -------------------------------------------------------------------------------- /mighty/configs/algorithm/sac.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: SAC 3 | 4 | algorithm_kwargs: 5 | # network sizes (PPO-style) 6 | n_policy_units: 256 # will become hidden_sizes=[8,8] 7 | n_critic_units: 256 # same for both Q-nets 8 | soft_update_weight: 0.01 # maps to tau 9 | 10 | # Replay buffer 11 | replay_buffer_class: 12 | _target_: mighty.mighty_replay.MightyReplay 13 | replay_buffer_kwargs: 14 | capacity: 1e6 15 | 16 | # Scheduling & batch-updates 17 | batch_size: 256 18 | learning_starts: 5000 19 | update_every: 2 20 | n_gradient_steps: 1 21 | 22 | # Learning rates 23 | policy_lr: 3e-4 24 | q_lr: 3e-4 25 | 26 | # SAC hyperparameters 27 | gamma: 0.99 28 | alpha: 0.2 29 | auto_alpha: True 30 | target_entropy: null 31 | alpha_lr: 3e-4 32 | 33 | # Exploration wrapper 34 | policy_class: mighty.mighty_exploration.StochasticPolicy 35 | policy_kwargs: 36 | entropy_coefficient: 0.2 37 | discrete: False 38 | -------------------------------------------------------------------------------- /mighty/configs/algorithm/sac_mujoco.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm: SAC 3 | 4 | algorithm_kwargs: 5 | # network sizes (PPO-style) 6 | n_policy_units: 256 # will become hidden_sizes=[8,8] 7 | n_critic_units: 256 # same for both Q-nets 8 | soft_update_weight: 0.005 # maps to tau 9 | 10 | # Replay buffer 11 | replay_buffer_class: 12 | _target_: mighty.mighty_replay.MightyReplay 13 | replay_buffer_kwargs: 14 | capacity: 1e6 15 | 16 | # Scheduling & batch-updates 17 | batch_size: 256 18 | learning_starts: 10000 19 | update_every: 1 20 | n_gradient_steps: 1 21 | 22 | # Learning rates 23 | policy_lr: 3e-4 24 | q_lr: 3e-4 25 | 26 | # SAC hyperparameters 27 | gamma: 0.99 28 | alpha: 0.2 29 | auto_alpha: True 30 | target_entropy: null 31 | alpha_lr: 3e-4 32 | 33 | # Exploration wrapper 34 | policy_class: mighty.mighty_exploration.StochasticPolicy 35 | policy_kwargs: 36 | entropy_coefficient: 0.0 37 | discrete: False 38 | 39 | normalize_obs: True -------------------------------------------------------------------------------- /mighty/configs/base.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: standard 11 | debug: false 12 | seed: 0 13 | output_dir: runs 14 | wandb_project: null 15 | tensorboard_file: null 16 | experiment_name: mighty_experiment 17 | 18 | algorithm_kwargs: {} 19 | 20 | # Training 21 | eval_every_n_steps: 5e3 # After how many steps to evaluate. 22 | n_episodes_eval: 10 23 | checkpoint: null # Path to load model checkpoint 24 | save_model_every_n_steps: 5e5 25 | 26 | hydra: 27 | run: 28 | dir: ${output_dir}/${experiment_name}_${seed} 29 | sweep: 30 | dir: ${output_dir}/${experiment_name}_${seed} 31 | 32 | 33 | -------------------------------------------------------------------------------- /mighty/configs/cluster/local.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # defaults: 3 | # - override /hydra/launcher: joblib 4 | 5 | # hydra: 6 | # launcher: 7 | # n_jobs: 16 8 | 9 | cluster: 10 | _target_: distributed.deploy.local.LocalCluster 11 | n_workers: ${hydra.sweeper.scenario.n_workers} 12 | processes: false 13 | threads_per_worker: 1 -------------------------------------------------------------------------------- /mighty/configs/cluster/luis.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/launcher: submitit_slurm 4 | 5 | cluster: 6 | queue: ai,tnt # partition 7 | 8 | hydra: 9 | launcher: 10 | partition: ai 11 | cpus_per_task: 1 12 | name: expl2 13 | timeout_min: 20 14 | mem_gb: 4 15 | setup: 16 | - module load Miniconda3 17 | - conda activate /bigwork/nhwpbenc/conda/envs/mighty -------------------------------------------------------------------------------- /mighty/configs/cluster/noctua.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - override /hydra/launcher: submitit_slurm 4 | 5 | hydra: 6 | launcher: 7 | partition: normal 8 | cpus_per_task: 1 9 | name: expl2 10 | timeout_min: 20 11 | mem_gb: 4 12 | setup: 13 | - micromamba activate /scratch/hpc-prf-intexml/cbenjamins/envs/mighty 14 | 15 | cluster: 16 | _target_: dask_jobqueue.SLURMCluster 17 | queue: normal # set in cluster config 18 | # account: myaccount 19 | cores: 16 20 | memory: 32 GB 21 | walltime: 01:00:00 22 | processes: 1 23 | log_directory: tmp/mighty_smac 24 | n_workers: 16 25 | death_timeout: 30 26 | -------------------------------------------------------------------------------- /mighty/configs/cluster/tnt.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - override hydra/launcher: submitit_slurm 3 | 4 | cluster: 5 | queue: cpu_short # partition 6 | 7 | hydra: 8 | launcher: 9 | partition: cpu_short # change this to your partition name 10 | #gres: gpu:1 # use this option when running on GPUs 11 | mem_gb: 12 # memory requirements 12 | cpus_per_task: 20 # number of cpus per run 13 | timeout_min: 720 # timeout in minutes 14 | setup: 15 | - export XLA_PYTHON_CLIENT_PREALLOCATE=false 16 | -------------------------------------------------------------------------------- /mighty/configs/cmaes_hpo.yaml: 
-------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: dqn 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: es 11 | popsize: 5 12 | iterations: 100 13 | es: evosax.CMA_ES 14 | search_targets: ["learning_rate", "_batch_size"] 15 | rl_train_agent: true 16 | num_steps_per_iteration: 1000 17 | 18 | debug: false 19 | seed: 0 20 | output_dir: runs 21 | wandb_project: null 22 | tensorboard_file: null 23 | experiment_name: mighty_experiment 24 | 25 | algorithm_kwargs: {} 26 | 27 | # Training 28 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 29 | n_episodes_eval: 10 30 | checkpoint: null # Path to load model checkpoint 31 | save_model_every_n_steps: 5e5 -------------------------------------------------------------------------------- /mighty/configs/environment/carl_walkers/ant_goals.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: CARLBraxAnt 5 | env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs] 6 | # For CARL, batch size should be one and num_envs should control parallel envs 7 | num_envs: 256 8 | 9 | env_kwargs: 10 | context_sample_seed: 0 11 | evaluation_context_sample_seed: 1 12 | num_contexts: 10 13 | num_evaluation_contexts: 10 14 | context_feature_args: {"target_distance": [normal, 9.8, 1.0, -100.0, 100.0], "target_direction": [categorical, [1, 2, 3, 4]]} 15 | batch_size: 1 16 | -------------------------------------------------------------------------------- /mighty/configs/environment/dacbench/function_approximation.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e5 4 | env: FunctionApproximationBenchmark 5 | env_kwargs: {} 6 | env_wrappers: [mighty.mighty_utils.wrappers.DictToVecActions] 7 | num_envs: 16 -------------------------------------------------------------------------------- /mighty/configs/environment/dacbench/function_approximation_benchmark.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e5 4 | env: FunctionApproximationBenchmark 5 | env_kwargs: {benchmark: true, dimension: 1} 6 | env_wrappers: [] 7 | num_envs: 16 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/atari_pong.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1e6 4 | env: ALE/Pong-v5 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/cartpole.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: CartPole-v1 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/mountaincar.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 100_000 4 | env: MountainCar-v0 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 32 
-------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/mountaincarcontinuous.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: MountainCarContinuous-v0 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/gymnasium/pendulum.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 1_000_000 4 | env: Pendulum-v1 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/procgen_bigfish.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: procgen:bigfish 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_minigrid/minigrid_env.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 2e5 4 | env_name: MiniGrid-DoorKey-8x8-v0 # Override with other names, e.g. MiniGrid-LavaGapS5-v0, MiniGrid-DoorKey-8x8-v0, MiniGrid-ObstructedMaze-1Dl-v0, MiniGrid-KeyCorridorS3R2-v0, MiniGrid-UnlockPickup-v0 5 | env: pufferlib.environments.minigrid.${env_name} 6 | env_kwargs: {} 7 | env_wrappers: [] 8 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/bandit.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.bandit 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 64 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/memory.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.memory 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/password.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.password 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/squared.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.squared 5 | env_kwargs: {} 6 | env_wrappers: [mighty.mighty_utils.wrappers.FlattenVecObs] 7 | num_envs: 1 -------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_ocean/stochastic.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 50_000 4 | env: pufferlib.ocean.stochastic 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 1
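The env_wrappers entries above are dotted paths to wrapper classes that are applied to the (vectorized) environment. As an illustration only, and not the library's actual FlattenVecObs implementation, such a wrapper could look roughly like this for a gymnasium-style environment:

import gymnasium as gym
import numpy as np

class FlattenObsWrapper(gym.ObservationWrapper):
    """Illustrative sketch: flatten each observation to a 1-D float32 vector."""

    def __init__(self, env):
        super().__init__(env)
        flat_dim = int(np.prod(env.observation_space.shape))
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(flat_dim,), dtype=np.float32
        )

    def observation(self, obs):
        return np.asarray(obs, dtype=np.float32).reshape(-1)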
-------------------------------------------------------------------------------- /mighty/configs/environment/pufferlib_procgen/bigfish.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | 3 | num_steps: 25e6 4 | env: pufferlib.environments.procgen.bigfish 5 | env_kwargs: {} 6 | env_wrappers: [] 7 | num_envs: 256 -------------------------------------------------------------------------------- /mighty/configs/exploration/epsilon_decay.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | policy_class: mighty.mighty_exploration.DecayingEpsilonGreedy 4 | policy_kwargs: 5 | # start at ε=1.0, linearly decay down to ε=0.01 over 5000 actions 6 | epsilon_start: 1.0 7 | epsilon_final: 0.01 8 | epsilon_decay_steps: 5000 -------------------------------------------------------------------------------- /mighty/configs/exploration/ez_greedy.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | policy_class: mighty.mighty_exploration.EZGreedy -------------------------------------------------------------------------------- /mighty/configs/exploration/noveld.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | meta_methods: 4 | - mighty.mighty_meta.NovelD 5 | meta_kwargs: 6 | - rnd_output_dim: 16 -------------------------------------------------------------------------------- /mighty/configs/exploration/rnd.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | algorithm_kwargs: 3 | meta_methods: 4 | - mighty.mighty_meta.RND 5 | meta_kwargs: 6 | - rnd_output_dim: 16 7 | rnd_network_config: 8 | - type: Linear 9 | kwargs: 10 | out_features: 64 11 | - type: ReLU 12 | kwargs: {} 13 | - type: Linear 14 | kwargs: 15 | out_features: 16 -------------------------------------------------------------------------------- /mighty/configs/hydra/help/mighty_help.yaml: -------------------------------------------------------------------------------- 1 | # App name, override to match the name your app is known by 2 | app_name: Mighty-DACs 3 | 4 | # Help header, customize to describe your app to your users 5 | header: |- 6 | == ${hydra.help.app_name} == 7 | The Mighty cRL library you've been looking for! 8 | 9 | footer: |- 10 | Powered by Hydra (https://hydra.cc) 11 | Use --hydra-help to view Hydra specific help 12 | 13 | template: |- 14 | ${hydra.help.header} 15 | 16 | == Configuration groups == 17 | Compose your configuration from these config groups (e.g. algorithm=dqn) 18 | 19 | $APP_CONFIG_GROUPS 20 | 21 | == Common Hyperparameters == 22 | * debug: flag to toggle debug output (default: false) 23 | * seed: Which seed to use (default: 0) 24 | * output_dir: Where to store result data (default: /tmp) 25 | hydra specific information will be in "output_dir/year-month-day/timestamp/.hydra" 26 | 27 | * wandb_project: For wandb integration (default: null) 28 | * tensorboard_file: For tensorboard integration (default: null) 29 | * experiment_name: The folder in which the specific experiment data is to be stored. 30 | I.e. the path will be "output_dir/experiment_name" 31 | 32 | * algorithm_kwargs: A dictionary of hyperparameter settings for the algorithm. 33 | Will be overwritten/populated with the choice of algorithm.
34 | * num_steps: Maximum number of environment steps to train for (default: 1000000) 35 | * env: The environment string name to use, e.g., MountainCarContinuous (default: CartPole-v1) 36 | For gym environments please see https://www.gymlibrary.ml/ (simple control environments are by 37 | default supported) 38 | For DACBench environments please see https://github.com/automl/DACBench 39 | For CARL environments please see https://github.com/automl/CARL 40 | * env_kwargs: Dict to modify environment parameters. Note: Currently only supported for CARL envs 41 | * env_wrappers: List of wrapper classes to apply to the environment. (default: []) 42 | 43 | * eval_every_n_steps: Training steps interval after which the agent is evaluated on a separate eval_env, i.e., a 44 | second copy of the training env (default: 1000) 45 | * n_episodes_eval: Number of episodes run per evaluation on a separate eval_env, i.e., a 46 | second copy of the training environment (default: null) 47 | * checkpoint: Path to load a checkpointed model from. This allows continuing training. If unset, a new model is 48 | trained from scratch (default: null) 49 | 50 | == Config == 51 | Any key=value argument can be overridden (use dots for nested overrides), for example: 52 | python mighty/run_mighty.py 'algorithm=ppo' 'env=MountainCarContinuous' 'num_steps=1000' 'algorithm_kwargs.learning_rate=0.1' 53 | or 54 | python mighty/run_mighty.py 'algorithm=dqn' 'env=SigmoidBenchmark' 'num_steps=100000' 55 | 56 | This is the configuration that was generated for this run: 57 | ------- 58 | $CONFIG 59 | ------- 60 | 61 | ${hydra.help.footer} 62 | -------------------------------------------------------------------------------- /mighty/configs/nes.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: dqn 4 | - environment: pufferlib_ocean/bandit 5 | - search_space: dqn_gym_classic 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | 10 | runner: es 11 | popsize: 5 12 | iterations: 100 13 | es: evosax.xNES 14 | search_targets: ["parameters"] 15 | rl_train_agent: false 16 | 17 | debug: false 18 | seed: 0 19 | output_dir: runs 20 | wandb_project: null 21 | tensorboard_file: null 22 | experiment_name: mighty_experiment 23 | 24 | algorithm_kwargs: {} 25 | 26 | # Training 27 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 28 | n_episodes_eval: 10 29 | checkpoint: null # Path to load model checkpoint 30 | save_model_every_n_steps: 5e5 -------------------------------------------------------------------------------- /mighty/configs/ppo_smac.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo_mujoco 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperSMAC # use Hypersweeper’s SMAC backend 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_smac 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: ppo_smac 18 | 19 | budget: 200000 # Budget for the hyperparameter search 20 | 21 | algorithm_kwargs: {} 22 | 23 | # Training 24 | eval_every_n_steps: 1e4 # After how many steps to evaluate.
25 | n_episodes_eval: 10 26 | checkpoint: null # Path to load model checkpoint 27 | save_model_every_n_steps: 5e5 28 | 29 | hydra: 30 | sweeper: 31 | n_trials: 10 32 | budget_variable: budget 33 | sweeper_kwargs: 34 | seeds: [0] 35 | optimizer_kwargs: 36 | smac_facade: 37 | _target_: smac.facade.blackbox_facade.BlackBoxFacade 38 | _partial_: true 39 | logging_level: 20 # 10 DEBUG, 20 INFO 40 | scenario: 41 | seed: 42 42 | n_trials: ${hydra.sweeper.n_trials} 43 | deterministic: true 44 | n_workers: 4 45 | output_directory: ${hydra.sweep.dir} 46 | search_space: ${search_space} 47 | run: 48 | dir: ./tmp/branin_smac/ 49 | sweep: 50 | dir: ./tmp/branin_smac/ -------------------------------------------------------------------------------- /mighty/configs/sac_smac.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: sac_mujoco 4 | - environment: gymnasium/pendulum 5 | - search_space: sac_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperSMAC # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_smac 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: ppo_smac 18 | 19 | budget: 200000 # Budget for the hyperparameter search 20 | 21 | algorithm_kwargs: {} 22 | 23 | # Training 24 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 25 | n_episodes_eval: 10 26 | checkpoint: null # Path to load model checkpoint 27 | save_model_every_n_steps: 5e5 28 | 29 | hydra: 30 | sweeper: 31 | n_trials: 10 32 | budget_variable: budget 33 | sweeper_kwargs: 34 | seeds: [0] 35 | optimizer_kwargs: 36 | smac_facade: 37 | _target_: smac.facade.blackbox_facade.BlackBoxFacade 38 | _partial_: true 39 | logging_level: 20 # 10 DEBUG, 20 INFO 40 | scenario: 41 | seed: 42 42 | n_trials: ${hydra.sweeper.n_trials} 43 | deterministic: true 44 | n_workers: 4 45 | output_directory: ${hydra.sweep.dir} 46 | search_space: ${search_space} 47 | run: 48 | dir: ./tmp/branin_smac/ 49 | sweep: 50 | dir: ./tmp/branin_smac/ -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_gym_classic.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | lower: 1e-6 5 | upper: 1e-2 6 | log: true 7 | default_value: 5e-3 8 | algorithm_kwargs.epsilon: 9 | type: uniform_float 10 | lower: 0.01 11 | upper: 0.25 12 | default_value: 0.1 13 | algorithm_kwargs.batch_size: 14 | type: categorical 15 | choices: [32, 64, 128, 256] 16 | default_value: 32 17 | algorithm_kwargs.soft_update_weight: 18 | type: uniform_float 19 | lower: 0.01 20 | upper: 1.0 21 | log: true 22 | default_value: 1. 
23 | algorithm_kwargs.td_update_class: 24 | type: categorical 25 | choices: [mighty.mighty_update.QLearning, mighty.mighty_update.DoubleQLearning] #, coax.td_learning.ClippedDoubleQLearning, coax.td_learning.SoftClippedDoubleQLearning] 26 | default_value: mighty.mighty_update.DoubleQLearning 27 | 28 | -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_rs.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | upper: 0.1 5 | lower: 1.0e-06 6 | default: 0.0003 7 | log: true 8 | algorithm_kwargs.gamma: 9 | type: uniform_float 10 | lower: 0.9 11 | upper: 0.9999 12 | log: false 13 | algorithm_kwargs.batch_size: 14 | type: categorical 15 | choices: [32, 64, 128, 256] -------------------------------------------------------------------------------- /mighty/configs/search_space/dqn_template.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.sweeper.search_space 2 | hyperparameters: 3 | algorithm_kwargs.n_units: 4 | type: ordinal 5 | sequence: [4,8,16,32,64,128,256,512] 6 | algorithm_kwargs.soft_update_weight: 7 | type: uniform_float 8 | lower: 0 9 | upper: 1 10 | default_value: 1 11 | 12 | -------------------------------------------------------------------------------- /mighty/configs/search_space/mighty_template.yaml: -------------------------------------------------------------------------------- 1 | # @package hydra.sweeper.search_space 2 | 3 | # Possible HP types: 4 | # constant, unparametrized, uniform_float, normal_float, beta_float 5 | # uniform_int, normal_int, beta_int, categorical, ordinal 6 | hyperparameters: 7 | algorithm_kwargs.learning_rate: 8 | type: uniform_float 9 | lower: 1e-6 10 | upper: 1e-2 11 | log: true 12 | default_value: 1e-3 13 | algorithm_kwargs.epsilon: 14 | type: uniform_float 15 | lower: 0 16 | upper: 1 17 | log: false 18 | default_value: 0.1 19 | algorithm_kwargs.batch_size: 20 | type: ordinal 21 | sequence: [4,8,16,32,64,128,256,512,1024] 22 | default: 64 -------------------------------------------------------------------------------- /mighty/configs/search_space/ppo_rs.yaml: -------------------------------------------------------------------------------- 1 | # configs/search_space/ppo_rs.yaml 2 | hyperparameters: 3 | # match the keys under algorithm_kwargs in your PPO config 4 | algorithm_kwargs.learning_rate: 5 | type: uniform_float 6 | lower: 1e-5 7 | upper: 1e-3 8 | log: true 9 | algorithm_kwargs.batch_size: 10 | type: categorical 11 | choices: [8192, 16384, 32768] 12 | algorithm_kwargs.n_gradient_steps: 13 | type: uniform_int 14 | lower: 1 15 | upper: 20 16 | log: false 17 | algorithm_kwargs.gamma: 18 | type: uniform_float 19 | lower: 0.9 20 | upper: 0.9999 21 | log: false 22 | algorithm_kwargs.ppo_clip: 23 | type: uniform_float 24 | lower: 0.1 25 | upper: 0.3 26 | log: false 27 | algorithm_kwargs.value_loss_coef: 28 | type: uniform_float 29 | lower: 0.1 30 | upper: 1.0 31 | log: false 32 | algorithm_kwargs.entropy_coef: 33 | type: uniform_float 34 | lower: 0.0 35 | upper: 0.1 36 | log: false 37 | algorithm_kwargs.max_grad_norm: 38 | type: uniform_float 39 | lower: 0.1 40 | upper: 1.0 41 | log: false 42 | -------------------------------------------------------------------------------- /mighty/configs/search_space/sac_rs.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | 
algorithm_kwargs.learning_rate: 3 | type: uniform_float 4 | lower: 0.000001 5 | upper: 0.01 6 | log: true 7 | algorithm_kwargs.batch_size: 8 | type: categorical 9 | choices: [32, 64, 128, 256] -------------------------------------------------------------------------------- /mighty/configs/sweep_ppo_pbt.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperPBT # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_pbt 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: mighty_experiment 18 | 19 | algorithm_kwargs: {} 20 | 21 | # Training 22 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 23 | n_episodes_eval: 10 24 | checkpoint: null # Path to load model checkpoint 25 | save_model_every_n_steps: 5e5 26 | 27 | hydra: 28 | sweeper: 29 | budget: 100000 30 | budget_variable: 100000 31 | loading_variable: load 32 | saving_variable: save 33 | sweeper_kwargs: 34 | optimizer_kwargs: 35 | population_size: 10 36 | config_interval: 1e4 37 | checkpoint_tf: true 38 | load_tf: true 39 | search_space: ${search_space} 40 | run: 41 | dir: ${output_dir}/${experiment_name}_${seed} 42 | sweep: 43 | dir: ${output_dir}/${experiment_name}_${seed} -------------------------------------------------------------------------------- /mighty/configs/sweep_rs.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _self_ 3 | - algorithm: ppo 4 | - environment: gymnasium/pendulum 5 | - search_space: ppo_rs 6 | - override hydra/job_logging: colorlog 7 | - override hydra/hydra_logging: colorlog 8 | - override hydra/help: mighty_help 9 | - override hydra/sweeper: HyperRS # use Hypersweeper’s RandomSearch 10 | 11 | runner: standard 12 | debug: false 13 | seed: 0 14 | output_dir: sweep_rs 15 | wandb_project: null 16 | tensorboard_file: null 17 | experiment_name: dqn_sweep 18 | 19 | algorithm_kwargs: {} 20 | 21 | # Training 22 | eval_every_n_steps: 1e4 # After how many steps to evaluate. 
23 | n_episodes_eval: 10 24 | checkpoint: null # Path to load model checkpoint 25 | save_model_every_n_steps: 5e5 26 | 27 | hydra: 28 | sweeper: 29 | n_trials: 10 30 | sweeper_kwargs: 31 | max_parallelization: 0.8 32 | max_budget: 100000 33 | search_space: ${search_space} 34 | run: 35 | dir: ${output_dir}/${experiment_name}_${seed} 36 | sweep: 37 | dir: ${output_dir}/${experiment_name}_${seed} -------------------------------------------------------------------------------- /mighty/configs/target_function.yaml: -------------------------------------------------------------------------------- 1 | # configs/target_function.yaml 2 | _target_: run_mighty # or fully‑qualified: mighty.run_mighty.run_mighty -------------------------------------------------------------------------------- /mighty/mighty_agents/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/mighty/mighty_agents/.gitkeep -------------------------------------------------------------------------------- /mighty/mighty_agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_agent import MightyAgent 2 | from .dqn import MightyDQNAgent 3 | from .ppo import MightyPPOAgent 4 | from .sac import MightySACAgent 5 | 6 | # FIXME: does it make sense to also split them in on- and off-policy agents? I mean for ifs in the base class? 7 | # Then we wouldn't have to test for PPO, just for on-policy 8 | VALID_AGENT_TYPES = ["DQN", "PPO", "SAC", "DDQN"] 9 | AGENT_CLASSES = { 10 | "DQN": MightyDQNAgent, 11 | "PPO": MightyPPOAgent, 12 | "SAC": MightySACAgent, 13 | "DDQN": MightyDQNAgent, 14 | } 15 | 16 | from .factory import get_agent_class # noqa: E402 17 | 18 | __all__ = [ 19 | "MightyAgent", 20 | "get_agent_class", 21 | "MightyDQNAgent", 22 | "MightyPPOAgent", 23 | "MightySACAgent", 24 | ] 25 | -------------------------------------------------------------------------------- /mighty/mighty_agents/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating agents based on config.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from mighty.mighty_agents import AGENT_CLASSES, VALID_AGENT_TYPES 8 | 9 | if TYPE_CHECKING: 10 | from mighty.mighty_agents.base_agent import MightyAgent 11 | 12 | 13 | def get_agent_class(agent_type: str) -> MightyAgent: 14 | """Transforms config keyword for agents to class.""" 15 | agent_class = None 16 | if agent_type in VALID_AGENT_TYPES: 17 | agent_class = AGENT_CLASSES[agent_type] 18 | else: 19 | raise ValueError(f"Unknown agent_type {agent_type}.") 20 | 21 | return agent_class # type: ignore 22 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_exploration.decaying_epsilon_greedy import DecayingEpsilonGreedy 2 | from mighty.mighty_exploration.epsilon_greedy import EpsilonGreedy 3 | from mighty.mighty_exploration.ez_greedy import EZGreedy 4 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 5 | from mighty.mighty_exploration.stochastic_policy import StochasticPolicy 6 | 7 | __all__ = [ 8 | "MightyExplorationPolicy", 9 | "EpsilonGreedy", 10 | "EZGreedy", 11 | "StochasticPolicy", 12 | "DecayingEpsilonGreedy", 13 | ] 14 | 
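A short usage sketch (not from the repository) of the agent factory defined above: valid config keywords resolve to agent classes, and anything else raises a ValueError.

from mighty.mighty_agents import get_agent_class

agent_cls = get_agent_class("DQN")  # MightyDQNAgent; "DDQN" maps to the same class
print(agent_cls.__name__)

try:
    get_agent_class("A2C")  # not in VALID_AGENT_TYPES
except ValueError as err:
    print(err)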
-------------------------------------------------------------------------------- /mighty/mighty_exploration/decaying_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | """Decaying Epsilon‐Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_exploration.epsilon_greedy import EpsilonGreedy 8 | 9 | 10 | class DecayingEpsilonGreedy(EpsilonGreedy): 11 | """Epsilon-Greedy Exploration with linear decay schedule.""" 12 | 13 | def __init__( 14 | self, 15 | algo, 16 | model, 17 | epsilon: float | None = None, 18 | epsilon_start: float = 1.0, 19 | epsilon_final: float = 0.01, 20 | epsilon_decay_steps: int = 10000, 21 | ): 22 | """ 23 | :param algo: algorithm name 24 | :param model: policy model (e.g. Q-network) 25 | :param epsilon_start: Initial ε (at time step 0) 26 | :param epsilon_final: Final ε (after decay_steps) 27 | :param epsilon_decay_steps: Number of steps over which to linearly 28 | decay ε from epsilon_start → epsilon_final. 29 | """ 30 | super().__init__(algo=algo, model=model, epsilon=epsilon_start) 31 | self.epsilon_start = epsilon_start 32 | self.epsilon_final = epsilon_final 33 | self.epsilon_decay_steps = epsilon_decay_steps 34 | self.total_steps = 0 35 | 36 | def _compute_epsilon(self) -> float: 37 | """Linearly interpolate between epsilon_start and epsilon_final.""" 38 | if self.total_steps >= self.epsilon_decay_steps: 39 | return self.epsilon_final 40 | fraction = self.total_steps / self.epsilon_decay_steps 41 | return float( 42 | self.epsilon_start + fraction * (self.epsilon_final - self.epsilon_start) 43 | ) 44 | 45 | def get_random_actions(self, n_actions, action_length): 46 | """ 47 | Override to recompute ε at each call, then delegate to EpsilonGreedy's logic. 48 | """ 49 | # 1) Update ε based on total_steps 50 | current_epsilon = self._compute_epsilon() 51 | self.epsilon = current_epsilon 52 | 53 | # 2) Call parent method to build exploration flags & random actions 54 | exploration_flags, random_actions = super().get_random_actions( 55 | n_actions, action_length 56 | ) 57 | 58 | # 3) Advance the step counter (so subsequent calls see a smaller ε) 59 | self.total_steps += n_actions 60 | 61 | return exploration_flags, random_actions 62 | 63 | def explore_func(self, s): 64 | """Same as EpsilonGreedy, except uses decayed ε each time.""" 65 | greedy_actions, qvals = self.sample_action(s) 66 | exploration_flags, random_actions = self.get_random_actions( 67 | len(greedy_actions), len(qvals[0]) 68 | ) 69 | actions = np.where(exploration_flags, random_actions, greedy_actions) 70 | return actions.astype(int), qvals 71 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | """Epsilon Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 8 | 9 | 10 | class EpsilonGreedy(MightyExplorationPolicy): 11 | """Epsilon Greedy Exploration.""" 12 | 13 | def __init__( 14 | self, 15 | algo, 16 | model, 17 | epsilon=0.1, 18 | ): 19 | """Initialize Epsilon Greedy. 
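Epsilon may be given either as a single float or as a sequence of per-environment values; in the latter case, get_random_actions cycles through the sequence when drawing exploration flags.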
20 | 21 | :param algo: algorithm name 22 | :param func: policy function 23 | :param epsilon: exploration epsilon 24 | :param env: environment 25 | :return: 26 | """ 27 | super().__init__(algo, model) 28 | self.epsilon = epsilon 29 | 30 | def get_random_actions(self, n_actions, action_length): 31 | if isinstance(self.epsilon, float): 32 | exploration_flags = [ 33 | self.rng.random() < self.epsilon for _ in range(n_actions) 34 | ] 35 | else: 36 | index = 0 37 | exploration_flags = [] 38 | while len(exploration_flags) < n_actions: 39 | exploration_flags.append(self.rng.random() < self.epsilon[index]) 40 | index += 1 41 | if index >= len(self.epsilon): 42 | index = 0 43 | exploration_flags = np.array(exploration_flags) 44 | random_actions = self.rng.integers(action_length, size=n_actions) 45 | return exploration_flags, random_actions 46 | 47 | def explore_func(self, s): 48 | greedy_actions, qvals = self.sample_action(s) 49 | exploration_flags, random_actions = self.get_random_actions( 50 | len(greedy_actions), len(qvals[0]) 51 | ) 52 | actions = np.where(exploration_flags, random_actions, greedy_actions) 53 | return actions.astype(int), qvals 54 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/ez_greedy.py: -------------------------------------------------------------------------------- 1 | """Epsilon Greedy Exploration.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING, Tuple 6 | 7 | import numpy as np 8 | 9 | from mighty.mighty_exploration import EpsilonGreedy 10 | 11 | if TYPE_CHECKING: 12 | import torch 13 | 14 | 15 | class EZGreedy(EpsilonGreedy): 16 | """Epsilon Greedy Exploration.""" 17 | 18 | def __init__( 19 | self, 20 | algo: str, 21 | model: torch.nn.Module, 22 | epsilon: float = 0.1, 23 | zipf_param: int = 2, 24 | ): 25 | """Initialize EZ Greedy. 
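Compared to plain epsilon-greedy, EZ-greedy performs temporally-extended exploration: once an exploratory action is chosen, it is frozen and repeated for a duration sampled from a Zipf distribution (see explore_func below).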
26 | 27 | :param algo: algorithm name 28 | :param model: model 29 | :param epsilon: exploration epsilon 30 | :param zipf_param: parametrizes the Zipf distribution for skipping 31 | :return: 32 | """ 33 | super().__init__(algo, model) 34 | self.epsilon = epsilon 35 | self.zipf_param = zipf_param 36 | self.skip = max(1, np.random.default_rng().zipf(self.zipf_param)) 37 | self.skipped = None 38 | self.frozen_actions = None 39 | 40 | def explore_func(self, s: torch.Tensor) -> Tuple: 41 | # Epsilon Greedy Step 42 | greedy_actions, qvals = self.sample_action(s) 43 | 44 | # Initialize Skips 45 | if self.skipped is None: 46 | self.skipped = np.zeros(len(greedy_actions)) # type: ignore 47 | self.frozen_actions = np.zeros(greedy_actions.shape) # type: ignore 48 | 49 | # Do epsilon greedy exploration 50 | exploration_flags, random_actions = self.get_random_actions( 51 | len(greedy_actions), len(qvals[0]) 52 | ) 53 | actions = np.where(exploration_flags, random_actions, greedy_actions) 54 | 55 | # Decay Skips 56 | self.skipped = np.maximum(0, self.skipped - 1) # type: ignore 57 | 58 | # Sample skip lengths for new exploration steps 59 | new_skips = np.where( 60 | exploration_flags, 61 | [self.rng.zipf(self.zipf_param) for _ in range(len(exploration_flags))], 62 | [0] * len(exploration_flags), 63 | ) 64 | for i in range(len(self.skipped)): # type: ignore 65 | if self.skipped[i] == 0: # type: ignore 66 | self.frozen_actions[i] = actions[i] # type: ignore 67 | 68 | if exploration_flags[i] and self.skipped[i] == 0: # type: ignore 69 | self.skipped[i] = new_skips[i] # type: ignore 70 | 71 | # Apply skip 72 | skips = [self.skipped[i] > 0 for i in range(len(self.skipped))] # type: ignore 73 | actions = np.where(skips, self.frozen_actions, actions) # type: ignore 74 | return actions.astype(int), qvals 75 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/mighty_exploration_policy.py: -------------------------------------------------------------------------------- 1 | """Mighty Exploration Policy.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import torch 7 | from torch.distributions import Categorical, Normal 8 | 9 | 10 | class MightyExplorationPolicy: 11 | """Generic Exploration Policy Interface. 12 | 13 | Now supports: 14 | - Discrete: `model(state)` → logits → Categorical 15 | - Continuous (squashed-Gaussian): `model(state)` → (action, z, mean, log_std) 16 | - Continuous (legacy): `model(state)` → (mean, std) 17 | """ 18 | 19 | def __init__( 20 | self, 21 | algo, 22 | model, 23 | discrete=False, 24 | ) -> None: 25 | """ 26 | :param algo: Algorithm name (e.g. "ppo", "sac", etc.) 27 | :param model: The policy network (any nn.Module) 28 | :param discrete: True if action-space is discrete 29 | """ 30 | self.rng = np.random.default_rng() 31 | self.algo = algo 32 | self.model = model 33 | self.discrete = discrete 34 | 35 | # Undistorted action sampling 36 | if self.algo == "q": 37 | 38 | def sample_func(state_np): 39 | """ 40 | Q-learning branch: 41 | • state_np: np.ndarray of shape [batch, obs_dim] 42 | • model(state) returns Q-values: tensor [batch, n_actions] 43 | We choose action = argmax(Q), and also return the full Q‐vector. 
44 | """ 45 | state = torch.as_tensor(state_np, dtype=torch.float32) 46 | qs = self.model(state) # [batch, n_actions] 47 | # Choose greedy action 48 | action = torch.argmax(qs, dim=1) # [batch] 49 | return action.detach().cpu().numpy(), qs # action_np, Q‐vector 50 | 51 | self.sample_action = sample_func 52 | 53 | else: 54 | 55 | def sample_func(state_np): 56 | """ 57 | state_np: np.ndarray of shape [batch, obs_dim] 58 | Returns: (action_tensor, log_prob_tensor) 59 | """ 60 | state = torch.as_tensor(state_np, dtype=torch.float32) 61 | 62 | # ─── Discrete action branch ───────────────────────────────────────── 63 | if self.discrete: 64 | logits = self.model(state) # [batch, n_actions] 65 | dist = Categorical(logits=logits) 66 | action = dist.sample() # [batch] 67 | log_prob = dist.log_prob(action) # [batch] 68 | return action.detach().cpu().numpy(), log_prob 69 | 70 | # ─── Continuous squashed‐Gaussian (4‐tuple) ────────────────────────── 71 | out = self.model(state) 72 | if isinstance(out, tuple) and len(out) == 4: 73 | # Unpack exactly (action, z, mean, log_std) 74 | action, z, mean, log_std = out # each [batch, action_dim] 75 | std = torch.exp(log_std) # [batch, action_dim] 76 | dist = Normal(mean, std) 77 | 78 | # 2a) log_pz = ∑ᵢ log N(zᵢ; μᵢ, σᵢ) 79 | log_pz = dist.log_prob(z).sum(dim=-1) # [batch] 80 | 81 | # 2b) tanh‐correction = ∑ᵢ log(1 − tanh(zᵢ)² + ε) 82 | eps = 1e-6 83 | log_correction = torch.log(1.0 - torch.tanh(z).pow(2) + eps).sum( 84 | dim=-1 85 | ) # [batch] 86 | 87 | # 2c) final log_prob of a = tanh(z) 88 | log_prob = log_pz - log_correction # [batch] 89 | return action.detach().cpu().numpy(), log_prob 90 | 91 | # ─── Legacy continuous branch (model returns (mean, std)) ──────────── 92 | if isinstance(out, tuple) and len(out) == 2: 93 | mean, std = out # both [batch, action_dim] 94 | dist = Normal(mean, std) 95 | z = dist.rsample() # [batch, action_dim] 96 | action = torch.tanh(z) # [batch, action_dim] 97 | 98 | # 3a) log_pz = ∑ᵢ log N(zᵢ; μᵢ, σᵢ) 99 | log_pz = dist.log_prob(z).sum(dim=-1) # [batch] 100 | 101 | # 3b) tanh‐correction 102 | eps = 1e-6 103 | log_correction = torch.log(1.0 - action.pow(2) + eps).sum( 104 | dim=-1 105 | ) # [batch] 106 | 107 | log_prob = log_pz - log_correction # [batch] 108 | return action.detach().cpu().numpy(), log_prob 109 | 110 | # ─── Fallback: if model(state) returns a Distribution ──────────────── 111 | if isinstance(out, torch.distributions.Distribution): 112 | dist = out # user returned a Distribution 113 | action = dist.sample() # [batch] 114 | log_prob = dist.log_prob(action) # [batch] 115 | return action.detach().cpu().numpy(), log_prob 116 | 117 | # ─── Otherwise, we don’t know how to sample ───────────────────────── 118 | raise RuntimeError( 119 | "MightyExplorationPolicy: cannot interpret model(state) output of type " 120 | f"{type(out)}" 121 | ) 122 | 123 | self.sample_action = sample_func 124 | 125 | def __call__(self, s, return_logp=False, metrics=None, evaluate=False): 126 | """Get action. 127 | 128 | :param s: state 129 | :param return_logp: return logprobs 130 | :param metrics: current metric dict 131 | :param eval: eval mode 132 | :return: action or (action, logprobs) 133 | """ 134 | if metrics is None: 135 | metrics = {} 136 | if evaluate: 137 | action, logprobs = self.sample_action(s) 138 | output = (action, logprobs) if return_logp else action 139 | else: 140 | output = self.explore(s, return_logp, metrics) 141 | 142 | return output 143 | 144 | def explore(self, s, return_logp, metrics=None): 145 | """Explore. 
146 | 147 | :param s: state 148 | :param return_logp: return logprobs 149 | :param _: not used 150 | :return: action or (action, logprobs) 151 | """ 152 | action, logprobs = self.explore_func(s) 153 | return (action, logprobs) if return_logp else action 154 | 155 | def explore_func(self, s): 156 | """Explore function.""" 157 | raise NotImplementedError 158 | -------------------------------------------------------------------------------- /mighty/mighty_exploration/stochastic_policy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Tuple 4 | 5 | import numpy as np 6 | import torch 7 | from torch.distributions import Categorical, Normal 8 | 9 | from mighty.mighty_exploration.mighty_exploration_policy import MightyExplorationPolicy 10 | from mighty.mighty_models import SACModel 11 | 12 | 13 | class StochasticPolicy(MightyExplorationPolicy): 14 | """Entropy-Based Exploration for discrete and continuous action spaces.""" 15 | 16 | def __init__( 17 | self, algo, model, entropy_coefficient: float = 0.2, discrete: bool = True 18 | ): 19 | """ 20 | :param algo: the RL algorithm instance 21 | :param model: the policy model 22 | :param entropy_coefficient: weight on entropy term 23 | :param discrete: whether the action space is discrete 24 | """ 25 | super().__init__(algo, model, discrete) 26 | self.entropy_coefficient = entropy_coefficient 27 | self.discrete = discrete 28 | 29 | # --- override sample_action only for continuous SAC --- 30 | if not discrete and isinstance(model, SACModel): 31 | # for evaluation use deterministic=True; training will go through .explore() 32 | def _sac_sample(state_np): 33 | state = torch.as_tensor(state_np, dtype=torch.float32) 34 | # forward returns (action, z, mean, log_std) 35 | action, z, mean, log_std = model(state, deterministic=True) 36 | logp = model.policy_log_prob(z, mean, log_std) 37 | 38 | return action.detach().cpu().numpy(), logp 39 | 40 | self.sample_action = _sac_sample 41 | 42 | def explore(self, s, return_logp, metrics=None) -> Tuple[np.ndarray, torch.Tensor]: 43 | """ 44 | Given observations `s`, sample an exploratory action and compute a weighted log-prob. 
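The returned log-probability is weighted by the entropy coefficient (or, in the legacy mean/std branch, by the distribution's entropy) before being handed back to the caller.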
45 | 46 | Returns: 47 | action: numpy array of actions 48 | weighted_log_prob: Tensor of shape [batch, 1] 49 | """ 50 | state = torch.as_tensor(s, dtype=torch.float32) 51 | if self.discrete: 52 | logits = self.model(state) 53 | dist = Categorical(logits=logits) 54 | action = dist.sample() 55 | log_prob = dist.log_prob(action).unsqueeze(-1) 56 | return action.detach().cpu().numpy(), log_prob * self.entropy_coefficient 57 | else: 58 | # If model has attribute continuous_action=True, we know: 59 | # model(state) → (action, z, mean, log_std) 60 | if hasattr(self.model, "continuous_action") and getattr( 61 | self.model, "continuous_action" 62 | ): 63 | # 1) Forward pass: get (action, z, mean, log_std) 64 | action, z, mean, log_std = self.model( 65 | state 66 | ) # each: [batch, action_dim] 67 | std = torch.exp(log_std) # [batch, action_dim] 68 | dist = Normal(mean, std) 69 | 70 | # 2) Compute log_prob of "z" under N(mean, std) 71 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) # [batch, 1] 72 | 73 | # 3) Tanh Jacobian‐correction: sum_i log(1 − tanh(z_i)^2 + ε) 74 | eps = 1e-6 75 | log_correction = torch.log(1.0 - torch.tanh(z).pow(2) + eps).sum( 76 | dim=-1, keepdim=True 77 | ) # [batch, 1] 78 | 79 | # 4) Final log_prob of a = tanh(z) 80 | log_prob = log_pz - log_correction # [batch, 1] 81 | 82 | # 5) (Optional) multiply by entropy_coeff to get “weighted log_prob” 83 | weighted_log_prob = log_prob * self.entropy_coefficient 84 | 85 | return action.detach().cpu().numpy(), weighted_log_prob 86 | 87 | # If it’s actually a SACModel, fallback (should only happen in training if model∈SACModel) 88 | elif isinstance(self.model, SACModel): 89 | action, z, mean, log_std = self.model(state, deterministic=False) 90 | std = torch.exp(log_std) 91 | dist = Normal(mean, std) 92 | 93 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 94 | weighted_log_prob = log_pz * self.entropy_coefficient 95 | return action.detach().cpu().numpy(), weighted_log_prob 96 | 97 | # If it’s “mean, std”‐style continuous (rare in our code), handle that case 98 | else: 99 | mean, std = self.model(state) 100 | dist = Normal(mean, std) 101 | z = dist.rsample() # [batch, action_dim] 102 | action = torch.tanh(z) # [batch, action_dim] 103 | 104 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 105 | eps = 1e-6 106 | log_correction = torch.log(1.0 - action.pow(2) + eps).sum( 107 | dim=-1, keepdim=True 108 | ) 109 | log_prob = log_pz - log_correction # [batch, 1] 110 | entropy = dist.entropy().sum(dim=-1, keepdim=True) # [batch, 1] 111 | weighted_log_prob = log_prob * entropy 112 | 113 | return action.detach().cpu().numpy(), weighted_log_prob 114 | 115 | def forward(self, s): 116 | """ 117 | Alias for explore, so policy(s) returns (action, weighted_log_prob). 
118 | """ 119 | return self.explore(s) 120 | -------------------------------------------------------------------------------- /mighty/mighty_meta/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_meta.cosine_lr_schedule import CosineLRSchedule 2 | from mighty.mighty_meta.plr import PrioritizedLevelReplay 3 | from mighty.mighty_meta.rnd import RND, NovelD 4 | from mighty.mighty_meta.space import SPaCE 5 | 6 | __all__ = ["PrioritizedLevelReplay", "SPaCE", "CosineLRSchedule", "RND", "NovelD"] 7 | -------------------------------------------------------------------------------- /mighty/mighty_meta/cosine_lr_schedule.py: -------------------------------------------------------------------------------- 1 | """Cosine LR Schedule with optional warm restarts.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | 7 | from mighty.mighty_meta.mighty_component import MightyMetaComponent 8 | 9 | 10 | class CosineLRSchedule(MightyMetaComponent): 11 | """Cosine LR Schedule with optional warm restarts.""" 12 | 13 | def __init__( 14 | self, 15 | initial_lr, 16 | num_decay_steps, 17 | min_lr=0, 18 | restart_every=10000, 19 | restart_multiplier=1.2, 20 | ) -> None: 21 | """Cosine schedule initialization. 22 | 23 | :param initial_lr: Initial maximal LR 24 | :param num_decay_steps: Length of schedule in steps 25 | :param min_lr: Minimal LR 26 | :param restart_every: Restart frequency 27 | :param restart multiplier: Multiplies current learning rate on restart. 28 | :return: 29 | """ 30 | super().__init__() 31 | self.restart_every = restart_every 32 | self.n_restarts = 0 33 | self.t_mult = restart_multiplier 34 | self.eta_max = initial_lr 35 | self.t_max = num_decay_steps 36 | self.eta_min = min_lr 37 | self.pre_step_methods = [self.adapt_lr] 38 | 39 | def adapt_lr(self, metrics): 40 | """Adapt LR on step. 41 | 42 | :param metrics: Dict of current metrics 43 | :return: 44 | """ 45 | reset = False 46 | if self.restart_every > 0: 47 | if self.n_restarts < np.floor(metrics["step"] / self.restart_every): 48 | self.n_restarts += 1 49 | self.eta_max = ( 50 | self.eta_min 51 | + 0.5 52 | * (self.eta_max - self.eta_min) 53 | * (1 + np.cos((metrics["step"] / self.t_max) * np.pi)) 54 | * self.t_mult 55 | ) 56 | metrics["hp/lr"] = self.eta_max 57 | reset = True 58 | 59 | if metrics["step"] < self.t_max and not reset: 60 | metrics["hp/lr"] = self.eta_min + 0.5 * (self.eta_max - self.eta_min) * ( 61 | 1 + np.cos((metrics["step"] / self.t_max) * np.pi) 62 | ) 63 | -------------------------------------------------------------------------------- /mighty/mighty_meta/mighty_component.py: -------------------------------------------------------------------------------- 1 | """Template for meta-learning components.""" 2 | 3 | from __future__ import annotations 4 | 5 | 6 | class MightyMetaComponent: 7 | """Component for registering meta-control methods.""" 8 | 9 | def __init__(self) -> None: 10 | """Meta module init. 11 | 12 | :return: 13 | """ 14 | self.pre_step_methods = [] 15 | self.post_step_methods = [] 16 | self.pre_update_methods = [] 17 | self.post_update_methods = [] 18 | self.pre_episode_methods = [] 19 | self.post_episode_methods = [] 20 | 21 | def pre_step(self, metrics): 22 | """Execute methods before a step. 23 | 24 | :param metrics: Current metrics dict 25 | :return: 26 | """ 27 | for m in self.pre_step_methods: 28 | m(metrics) 29 | 30 | def post_step(self, metrics): 31 | """Execute methods after a step. 
32 | 33 | :param metrics: Current metrics dict 34 | :return: 35 | """ 36 | for m in self.post_step_methods: 37 | m(metrics) 38 | 39 | def pre_update(self, metrics): 40 | """Execute methods before the update. 41 | 42 | :param metrics: Current metrics dict 43 | :return: 44 | """ 45 | for m in self.pre_update_methods: 46 | m(metrics) 47 | 48 | def post_update(self, metrics): 49 | """Execute methods after the update. 50 | 51 | :param metrics: Current metrics dict 52 | :return: 53 | """ 54 | for m in self.post_update_methods: 55 | m(metrics) 56 | 57 | def pre_episode(self, metrics): 58 | """Execute methods before an episode. 59 | 60 | :param metrics: Current metrics dict 61 | :return: 62 | """ 63 | for m in self.pre_episode_methods: 64 | m(metrics) 65 | 66 | def post_episode(self, metrics): 67 | """Execute methods at the end of an episode. 68 | 69 | :param metrics: Current metrics dict 70 | :return: 71 | """ 72 | for m in self.post_episode_methods: 73 | m(metrics) 74 | -------------------------------------------------------------------------------- /mighty/mighty_meta/space.py: -------------------------------------------------------------------------------- 1 | """Curriculum Learning via Self-Paced Context Evaluation.""" 2 | 3 | from __future__ import annotations 4 | 5 | import numpy as np 6 | import torch 7 | 8 | from mighty.mighty_meta.mighty_component import MightyMetaComponent 9 | 10 | 11 | class SPaCE(MightyMetaComponent): 12 | """Curriculum Learning via Self-Paced Context Evaluation.""" 13 | 14 | def __init__(self, criterion="relative_improvement", threshold=0.1, k=1) -> None: 15 | """SPaCE initialization. 16 | 17 | :param criterion: Ranking criterion 18 | :param threshold: Minimum average change needed to keep train set size 19 | :param k: Size of instance set increase 20 | :return: 21 | """ 22 | super().__init__() 23 | self.criterion = criterion 24 | self.threshold = threshold 25 | self.instance_set = [] 26 | self.increase_by_k_instances = k 27 | self.current_instance_set_size = k 28 | self.last_evals = None 29 | self.all_instances = None 30 | self.pre_episode_methods = [self.get_instances] 31 | 32 | def get_instances(self, metrics): 33 | """Get Training set on episode start. 
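The training instance set is ranked by (relative) value improvement and grown by k instances whenever the mean rollout value changes by no more than `threshold` relative to the previous evaluation.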
34 | 35 | :param metrics: Current metrics dict 36 | :return: 37 | """ 38 | env = metrics["env"] 39 | vf = metrics["vf"] 40 | rollout_values = None 41 | if "rollout_values" in metrics: 42 | rollout_values = metrics["rollout_values"] 43 | 44 | if self.all_instances is None: 45 | self.all_instances = np.array(env.instance_id_list.copy()) 46 | 47 | if self.last_evals is None and rollout_values is None: 48 | self.instance_set = np.random.default_rng().choice( 49 | self.all_instances, size=self.current_instance_set_size 50 | ) 51 | elif self.last_evals is None: 52 | self.instance_set = np.random.default_rng().choice( 53 | self.all_instances, size=self.current_instance_set_size 54 | ) 55 | self.last_evals = np.nanmean(rollout_values) 56 | else: 57 | if ( 58 | abs(np.mean(rollout_values) - self.last_evals) 59 | / (self.last_evals + 1e-6) 60 | <= self.threshold 61 | ): 62 | self.current_instance_set_size = min( 63 | self.current_instance_set_size + self.increase_by_k_instances, 64 | len(self.all_instances), 65 | ) 66 | self.last_evals = np.nanmean(rollout_values) 67 | evals = self.get_evals(env, vf) 68 | if self.criterion == "improvement": 69 | improvement = evals - self.last_evals 70 | elif self.criterion == "relative_improvement": 71 | improvement = (evals - self.last_evals) / self.last_evals 72 | else: 73 | raise NotImplementedError("This SpaCE criterion is not implemented.") 74 | self.instance_set = self.all_instances[np.argsort(improvement)[::-1]][ 75 | : self.current_instance_set_size 76 | ] 77 | env.set_instance_set(self.instance_set) 78 | 79 | def get_evals(self, env, vf): 80 | """Get values for s_0 of all instances. 81 | 82 | :param env: environment 83 | :param vf: value or q function 84 | :return: 85 | """ 86 | values = [] 87 | for i in self.all_instances: 88 | state, _ = env.reset() 89 | env.set_inst_id(i) 90 | v = vf(torch.tensor(state)).squeeze().detach().numpy() 91 | # If we're dealing with a q function, we transform to value here 92 | if isinstance(v[0], np.ndarray): 93 | v = v.sum(axis=1) 94 | values.append(v[0]) 95 | return values 96 | -------------------------------------------------------------------------------- /mighty/mighty_models/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_models.dqn import DQN 2 | from mighty.mighty_models.ppo import PPOModel 3 | from mighty.mighty_models.sac import SACModel 4 | 5 | __all__ = ["DQN", "SACModel", "PPOModel"] 6 | -------------------------------------------------------------------------------- /mighty/mighty_models/ppo.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from mighty.mighty_models.networks import make_feature_extractor 8 | 9 | 10 | class PPOModel(nn.Module): 11 | """PPO Model with policy and value networks.""" 12 | 13 | def __init__( 14 | self, 15 | obs_shape: int, 16 | action_size: int, 17 | hidden_sizes: list[int] = [64, 64], 18 | activation: str = "tanh", 19 | continuous_action: bool = False, 20 | log_std_min: float = -20.0, 21 | log_std_max: float = 2.0, 22 | ): 23 | """Initialize the PPO model.""" 24 | super().__init__() 25 | 26 | self.obs_size = int(obs_shape) 27 | self.action_size = int(action_size) 28 | self.hidden_sizes = hidden_sizes 29 | self.activation = activation 30 | self.continuous_action = continuous_action 31 | self.log_std_min = log_std_min 32 | self.log_std_max = log_std_max 33 | 34 | # Make feature 
extractor 35 | self.feature_extractor_policy, feat_dim = make_feature_extractor( 36 | architecture="mlp", 37 | obs_shape=obs_shape, 38 | n_layers=len(hidden_sizes), 39 | hidden_sizes=hidden_sizes, 40 | activation=activation, 41 | ) 42 | 43 | self.feature_extractor_value, _ = make_feature_extractor( 44 | architecture="mlp", 45 | obs_shape=obs_shape, 46 | n_layers=len(hidden_sizes), 47 | hidden_sizes=hidden_sizes, 48 | activation=activation, 49 | ) 50 | 51 | if self.continuous_action: 52 | # Output size must be 2 * action_size (mean + log_std) 53 | final_out_dim = action_size * 2 54 | else: 55 | # For discrete actions, output logits of size = action_size 56 | final_out_dim = action_size 57 | 58 | # (Architecture based on 59 | # https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py) 60 | 61 | # Policy network 62 | self.policy_head = nn.Sequential( 63 | self.feature_extractor_policy, # [batch, feat_dim] 64 | nn.Linear(feat_dim, hidden_sizes[0]), # [batch, hidden_sizes[0]] 65 | nn.LayerNorm(hidden_sizes[0]), # (optional normalization) 66 | getattr(nn, activation.capitalize())(), # e.g. tanh or ReLU 67 | nn.Linear(hidden_sizes[0], final_out_dim), # [batch, final_out_dim] 68 | ) 69 | 70 | # Value network 71 | self.value_head = nn.Sequential( 72 | self.feature_extractor_value, # [batch, feat_dim] 73 | nn.Linear(feat_dim, hidden_sizes[0]), # [batch, hidden_sizes[0]] 74 | nn.LayerNorm(hidden_sizes[0]), 75 | getattr(nn, activation.capitalize())(), 76 | nn.Linear(hidden_sizes[0], 1), # [batch, 1] 77 | ) 78 | 79 | # Orthogonal initialization 80 | def _init_weights(m: nn.Module): 81 | if isinstance(m, nn.Linear): 82 | out_dim = m.out_features 83 | if self.continuous_action and out_dim == final_out_dim: 84 | # This is the final policy‐output layer (mean & log_std): 85 | gain = 0.01 86 | elif (not self.continuous_action) and out_dim == action_size: 87 | # Final policy‐output layer (discrete‐logits): 88 | gain = 0.01 89 | elif out_dim == 1: 90 | # Final value‐output layer: 91 | gain = 1.0 92 | else: 93 | # Any intermediate hidden layer: 94 | gain = math.sqrt(2) 95 | nn.init.orthogonal_(m.weight, gain) 96 | nn.init.constant_(m.bias, 0.0) 97 | 98 | self.apply(_init_weights) 99 | 100 | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 101 | """Forward pass through the policy network.""" 102 | 103 | if self.continuous_action: 104 | raw = self.policy_head(x) # [batch, 2 * action_size] 105 | mean, log_std = raw.chunk(2, dim=-1) # each [batch, action_size] 106 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 107 | std = torch.exp(log_std) # [batch, action_size] 108 | 109 | # Sample a raw Gaussian z; during inference/training this is 'reparameterized' 110 | # (If you need a deterministic‐eval mode, you can add a flag argument here.) 
111 | eps = torch.randn_like(mean) 112 | z = mean + std * eps # [batch, action_size] 113 | action = torch.tanh(z) # squash to [−1, +1] 114 | 115 | return action, z, mean, log_std 116 | 117 | else: 118 | logits = self.policy_head(x) # [batch, action_size] 119 | return logits 120 | 121 | def forward_value(self, x: torch.Tensor) -> torch.Tensor: 122 | """Forward pass through the value network.""" 123 | return self.value_head(x) 124 | -------------------------------------------------------------------------------- /mighty/mighty_models/sac.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from mighty.mighty_models.networks import make_feature_extractor 7 | 8 | 9 | class SACModel(nn.Module): 10 | """SAC Model with squashed Gaussian policy and twin Q-networks.""" 11 | 12 | def __init__( 13 | self, 14 | obs_size: int, 15 | action_size: int, 16 | hidden_sizes: list[int] = [256, 256], 17 | activation: str = "relu", 18 | log_std_min: float = -20, 19 | log_std_max: float = 2, 20 | ): 21 | super().__init__() 22 | self.obs_size = obs_size 23 | self.action_size = action_size 24 | self.log_std_min = log_std_min 25 | self.log_std_max = log_std_max 26 | self.hidden_sizes = hidden_sizes 27 | self.activation = activation 28 | 29 | # Shared feature extractor for policy and Q-networks 30 | extractor, out_dim = make_feature_extractor( 31 | architecture="mlp", 32 | obs_shape=obs_size, 33 | n_layers=len(hidden_sizes), 34 | hidden_sizes=hidden_sizes, 35 | activation=activation, 36 | ) 37 | 38 | # Policy network outputs mean and log_std 39 | self.policy_net = nn.Sequential( 40 | extractor, 41 | nn.Linear(out_dim, action_size * 2), 42 | ) 43 | 44 | # Twin Q-networks 45 | # — live Q-nets — 46 | self.q_net1 = self._make_q_net() 47 | self.q_net2 = self._make_q_net() 48 | 49 | self.target_q_net1 = self._make_q_net() 50 | self.target_q_net1.load_state_dict(self.q_net1.state_dict()) 51 | self.target_q_net2 = self._make_q_net() 52 | self.target_q_net2.load_state_dict(self.q_net2.state_dict()) 53 | for p in self.target_q_net1.parameters(): 54 | p.requires_grad = False 55 | for p in self.target_q_net2.parameters(): 56 | p.requires_grad = False 57 | 58 | def _make_q_net(self) -> nn.Sequential: 59 | q_in = self.obs_size + self.action_size 60 | q_extractor, _ = make_feature_extractor( 61 | architecture="mlp", 62 | obs_shape=q_in, 63 | n_layers=len(self.hidden_sizes), 64 | hidden_sizes=self.hidden_sizes, 65 | activation=self.activation, 66 | ) 67 | return nn.Sequential(q_extractor, nn.Linear(self.hidden_sizes[-1], 1)) 68 | 69 | def forward( 70 | self, state: torch.Tensor, deterministic: bool = False 71 | ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: 72 | """ 73 | Forward pass for policy sampling. 74 | 75 | Returns: 76 | action: torch.Tensor in [-1,1] 77 | z: raw Gaussian sample before tanh 78 | mean: Gaussian mean 79 | log_std: Gaussian log std 80 | """ 81 | x = self.policy_net(state) 82 | mean, log_std = x.chunk(2, dim=-1) 83 | log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max) 84 | std = torch.exp(log_std) 85 | 86 | if deterministic: 87 | z = mean 88 | else: 89 | z = mean + std * torch.randn_like(mean) 90 | action = torch.tanh(z) 91 | return action, z, mean, log_std 92 | 93 | def policy_log_prob( 94 | self, z: torch.Tensor, mean: torch.Tensor, log_std: torch.Tensor 95 | ) -> torch.Tensor: 96 | """ 97 | Compute log-prob of action a = tanh(z), correcting for tanh transform. 
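Concretely: log p(a) = Σ_i [ log N(z_i; μ_i, σ_i) − log(1 − tanh(z_i)² + ε) ].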
98 | """ 99 | std = torch.exp(log_std) 100 | dist = torch.distributions.Normal(mean, std) 101 | log_pz = dist.log_prob(z).sum(dim=-1, keepdim=True) 102 | eps = 1e-6 # small constant to avoid numerical issues 103 | log_correction = (torch.log(1 - torch.tanh(z).pow(2) + eps)).sum( 104 | dim=-1, keepdim=True 105 | ) 106 | log_pa = log_pz - log_correction 107 | return log_pa 108 | 109 | def forward_q1(self, state_action: torch.Tensor) -> torch.Tensor: 110 | return self.q_net1(state_action) 111 | 112 | def forward_q2(self, state_action: torch.Tensor) -> torch.Tensor: 113 | return self.q_net2(state_action) 114 | -------------------------------------------------------------------------------- /mighty/mighty_replay/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_replay.buffer import MightyBuffer 2 | from mighty.mighty_replay.mighty_prioritized_replay import PrioritizedReplay 3 | from mighty.mighty_replay.mighty_replay_buffer import MightyReplay, TransitionBatch 4 | from mighty.mighty_replay.mighty_rollout_buffer import MightyRolloutBuffer, RolloutBatch 5 | 6 | __all__ = [ 7 | "MightyReplay", 8 | "PrioritizedReplay", 9 | "TransitionBatch", 10 | "MightyRolloutBuffer", 11 | "MightyBuffer", 12 | "RolloutBatch", 13 | ] 14 | -------------------------------------------------------------------------------- /mighty/mighty_replay/buffer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class MightyBuffer(ABC): 5 | @abstractmethod 6 | def add(self, *args, **kwargs): 7 | pass 8 | 9 | @abstractmethod 10 | def sample(self, batch_size): 11 | pass 12 | 13 | @abstractmethod 14 | def reset(self): 15 | pass 16 | 17 | @abstractmethod 18 | def __len__(self): 19 | pass 20 | 21 | @abstractmethod 22 | def __bool__(self): 23 | pass 24 | -------------------------------------------------------------------------------- /mighty/mighty_replay/mighty_replay_buffer.py: -------------------------------------------------------------------------------- 1 | """Mighty replay buffer.""" 2 | 3 | from __future__ import annotations 4 | 5 | from collections.abc import Iterable 6 | 7 | import dill as pickle 8 | import numpy as np 9 | import torch 10 | 11 | from mighty.mighty_replay.buffer import MightyBuffer 12 | 13 | 14 | def flatten_infos(xs): 15 | """Transform info dict to flat list. 
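For example, {"a": 1, "b": [2, 3]} is flattened to the values 1, 2, 3.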
16 | 17 | :param xs: info dict 18 | :return: flattened infos 19 | """ 20 | if isinstance(xs, dict): 21 | xs = list(xs.values()) 22 | for x in xs: 23 | if isinstance(x, Iterable) and not isinstance(x, str | bytes): 24 | yield from flatten_infos(x) 25 | else: 26 | yield x 27 | 28 | 29 | class TransitionBatch: 30 | """Transition batch.""" 31 | 32 | def __init__( 33 | self, 34 | observations, 35 | actions, 36 | rewards, 37 | next_observations, 38 | dones, 39 | device: torch.device | str = "cpu", 40 | ) -> None: 41 | """Initialize TransitionBatch.""" 42 | if isinstance(rewards, float | int): 43 | observations = np.array([observations], dtype=np.float32) 44 | actions = np.array([actions], dtype=np.float32) 45 | rewards = np.array([rewards], dtype=np.float32) 46 | next_observations = np.array([next_observations], dtype=np.float32) 47 | dones = np.array([dones], dtype=np.float32) 48 | if isinstance(rewards, np.ndarray): 49 | self.observations = torch.from_numpy(observations.astype(np.float32)).to( 50 | device 51 | ) 52 | self.actions = torch.from_numpy(actions.astype(np.float32)).to(device) 53 | self.rewards = torch.from_numpy(rewards.astype(np.float32)).to(device) 54 | self.next_obs = torch.from_numpy(next_observations.astype(np.float32)).to( 55 | device 56 | ) 57 | self.dones = torch.from_numpy(dones.astype(np.int64)).to(device) 58 | else: 59 | self.observations = observations.to(device) 60 | self.actions = actions.to(device) 61 | self.rewards = rewards.to(device) 62 | self.next_obs = next_observations.to(device) 63 | self.dones = dones.to(device) 64 | 65 | @property 66 | def size(self): 67 | """Current buffer size.""" 68 | return len(self.observations) 69 | 70 | def __len__(self): 71 | return self.size 72 | 73 | def __iter__(self): 74 | yield from zip( 75 | self.observations, 76 | self.actions, 77 | self.rewards, 78 | self.next_obs, 79 | self.dones, 80 | strict=False, 81 | ) 82 | 83 | 84 | class MightyReplay(MightyBuffer): 85 | """Simple replay buffer.""" 86 | 87 | def __init__( 88 | self, 89 | capacity, 90 | keep_infos=False, 91 | flatten_infos=False, 92 | device: torch.device | str = "cpu", 93 | ): 94 | """Initialize Buffer. 95 | 96 | :param capacity: Buffer size 97 | :param random_seed: Seed for sampling 98 | :param keep_infos: Keep the extra info dict. Required for some algorithms. 99 | :param flatten_infos: Make flat list from infos. 100 | Might be necessary, depending on info content. 101 | :return: 102 | """ 103 | self.capacity = capacity 104 | self.keep_infos = keep_infos 105 | self.flatten_infos = flatten_infos 106 | self.device = torch.device(device) 107 | self.rng = np.random.default_rng() 108 | self.reset() 109 | 110 | @property 111 | def full(self): 112 | """Check if the buffer is full.""" 113 | return self.index + 1 >= self.capacity 114 | 115 | def add(self, transition_batch, _): 116 | """Add transition(s). 
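Batches are appended in insertion order; once the capacity is exceeded, the oldest transitions are dropped (FIFO).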
117 | 118 | :param transition_batch: Transition(s) to add 119 | :param metrics: Current metrics dict 120 | :return: 121 | """ 122 | if not self.keep_infos: 123 | transition_batch.extra_info = [] 124 | elif self.flatten_infos: 125 | transition_batch.extra_info = [ 126 | list(flatten_infos(transition_batch.extra_info)) 127 | ] 128 | 129 | self.index += transition_batch.size 130 | if len(self.obs) == 0: 131 | self.obs = transition_batch.observations 132 | self.next_obs = transition_batch.next_obs 133 | self.actions = transition_batch.actions 134 | self.rewards = transition_batch.rewards 135 | self.dones = transition_batch.dones 136 | else: 137 | self.obs = torch.cat((self.obs, transition_batch.observations)) 138 | self.next_obs = torch.cat((self.next_obs, transition_batch.next_obs)) 139 | self.actions = torch.cat((self.actions, transition_batch.actions)) 140 | self.rewards = torch.cat((self.rewards, transition_batch.rewards)) 141 | self.dones = torch.cat((self.dones, transition_batch.dones)) 142 | if len(self) > self.capacity: 143 | self.obs = self.obs[len(self) - self.capacity :] 144 | self.next_obs = self.next_obs[len(self) - self.capacity :] 145 | self.actions = self.actions[len(self) - self.capacity :] 146 | self.rewards = self.rewards[len(self) - self.capacity :] 147 | self.dones = self.dones[len(self) - self.capacity :] 148 | self.index = self.capacity 149 | 150 | def sample(self, batch_size=32): 151 | """Sample transitions.""" 152 | batch_indices = self.rng.choice(np.arange(len(self)), size=batch_size) 153 | return TransitionBatch( 154 | self.obs[batch_indices], 155 | self.actions[batch_indices], 156 | self.rewards[batch_indices], 157 | self.next_obs[batch_indices], 158 | self.dones[batch_indices], 159 | device=self.device, 160 | ) 161 | 162 | def reset(self): 163 | """Reset the buffer.""" 164 | self.obs = [] 165 | self.next_obs = [] 166 | self.actions = [] 167 | self.rewards = [] 168 | self.dones = [] 169 | self.index = 0 170 | 171 | def __len__(self): 172 | return len(self.obs) 173 | 174 | def __bool__(self): 175 | return bool(len(self)) 176 | 177 | def save(self, filename="buffer.pkl"): 178 | """Save the buffer to a file.""" 179 | with open(filename, "wb") as f: 180 | pickle.dump(self, f) 181 | -------------------------------------------------------------------------------- /mighty/mighty_runners/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | from .mighty_maml_runner import MightyMAMLRunner, MightyTRPOMAMLRunner 4 | from .mighty_online_runner import MightyOnlineRunner 5 | from .mighty_runner import MightyRunner 6 | 7 | VALID_RUNNER_TYPES = ["standard", "default", "online"] 8 | RUNNER_CLASSES: Dict[str, type[MightyRunner]] = { 9 | "standard": MightyOnlineRunner, 10 | "default": MightyOnlineRunner, 11 | "online": MightyOnlineRunner, 12 | } 13 | 14 | try: 15 | import evosax # noqa: F401 16 | 17 | found = True 18 | except ImportError: 19 | print("evosax not found, to use ES runners please install mighty[es].") 20 | found = False 21 | 22 | if found: 23 | from .mighty_es_runner import MightyESRunner 24 | 25 | VALID_RUNNER_TYPES.append("es") 26 | RUNNER_CLASSES["es"] = MightyESRunner 27 | 28 | 29 | from .factory import get_runner_class # noqa: E402 30 | 31 | __all__ = [ 32 | "MightyRunner", 33 | "MightyOnlineRunner", 34 | "MightyMAMLRunner", 35 | "MightyTRPOMAMLRunner", 36 | "MightyESRunner", 37 | "get_runner_class", 38 | ] 39 | -------------------------------------------------------------------------------- 
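# Usage sketch (not part of the package): how TransitionBatch and MightyReplay from
# mighty/mighty_replay/mighty_replay_buffer.py above fit together. The transition data
# below is made up for illustration; only the interfaces come from the code above.
import numpy as np

from mighty.mighty_replay import MightyReplay, TransitionBatch

buffer = MightyReplay(capacity=1000)

# A batch of 8 transitions with 3-dimensional observations, passed as numpy arrays.
batch = TransitionBatch(
    observations=np.random.randn(8, 3).astype(np.float32),
    actions=np.random.randint(0, 4, size=8).astype(np.float32),
    rewards=np.zeros(8, dtype=np.float32),
    next_observations=np.random.randn(8, 3).astype(np.float32),
    dones=np.zeros(8, dtype=np.float32),
)
buffer.add(batch, {})  # the second argument (a metrics dict) is ignored by MightyReplay

# Sampling returns another TransitionBatch holding torch tensors on the buffer's device.
sample = buffer.sample(batch_size=4)
print(len(buffer), sample.observations.shape)  # 8, torch.Size([4, 3])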
/mighty/mighty_runners/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating runners based on config.""" 2 | 3 | from __future__ import annotations 4 | 5 | from typing import TYPE_CHECKING 6 | 7 | from mighty.mighty_runners import RUNNER_CLASSES, VALID_RUNNER_TYPES 8 | 9 | if TYPE_CHECKING: 10 | from mighty.mighty_runners.mighty_runner import MightyRunner 11 | 12 | 13 | def get_runner_class(agent_type: str) -> type[MightyRunner]: 14 | """Transforms config keyword for agents to class.""" 15 | agent_class = None 16 | if agent_type in VALID_RUNNER_TYPES: 17 | agent_class = RUNNER_CLASSES[agent_type] 18 | else: 19 | raise ValueError(f"Unknown agent_type {agent_type}.") 20 | 21 | return agent_class 22 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_es_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import importlib.util as iutil 4 | from typing import TYPE_CHECKING, Dict, Tuple 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from mighty.mighty_agents.base_agent import retrieve_class 10 | from mighty.mighty_runners.mighty_runner import MightyRunner 11 | 12 | spec = iutil.find_spec("evosax") 13 | found = spec is not None 14 | if found: 15 | import jax 16 | from evosax import FitnessShaper, xNES # type: ignore 17 | from jax import numpy as jnp 18 | else: 19 | import warnings 20 | 21 | warnings.warn("evosax not found, to use NES runners please install mighty[es].") 22 | 23 | if TYPE_CHECKING: 24 | from omegaconf import DictConfig 25 | 26 | 27 | class MightyESRunner(MightyRunner): 28 | def __init__(self, cfg: DictConfig) -> None: 29 | super().__init__(cfg) 30 | self.search_targets = cfg.search_targets 31 | num_dims = len(self.search_targets) 32 | self.search_params = False 33 | if "parameters" in self.search_targets: 34 | self.search_params = True 35 | self.total_n_params = sum([len(p.flatten()) for p in self.agent.parameters]) 36 | num_dims -= 1 37 | num_dims += self.total_n_params 38 | 39 | es_cls = retrieve_class(cfg.es, default_cls=xNES) 40 | es_kwargs = {} 41 | if "es_kwargs" in cfg.keys(): 42 | es_kwargs = cfg.es_kwargs 43 | 44 | self.es = es_cls(popsize=cfg.popsize, num_dims=num_dims, **es_kwargs) 45 | self.rng = jax.random.PRNGKey(0) 46 | self.fit_shaper = FitnessShaper(centered_rank=True, w_decay=0.0, maximize=True) 47 | self.iterations = cfg.iterations 48 | self.train_agent = cfg.rl_train_agent 49 | if self.train_agent: 50 | self.num_steps_per_iteration = cfg.num_steps_per_iteration 51 | 52 | def apply_parameters(self, individual) -> None: # type: ignore 53 | # 1. Make tensor from x 54 | individual = np.asarray(individual) 55 | individual = torch.tensor(individual, dtype=torch.float32) 56 | 57 | # 2. Shape it to match the model's parameters 58 | param_shapes = [p.shape for p in self.agent.parameters] 59 | reshaped_individual = [] 60 | for shape in param_shapes: 61 | new_individual = individual[: shape.numel()] 62 | new_individual = new_individual.reshape(shape) 63 | reshaped_individual.append(new_individual) 64 | individual = individual[shape.numel() :] 65 | # 3. 
Set the model's parameters to the shaped tensor 66 | for p, x_ in zip(self.agent.parameters, reshaped_individual): 67 | p.data = x_ 68 | 69 | def run(self) -> Tuple[Dict, Dict]: 70 | es_state = self.es.initialize(self.rng) 71 | for _ in range(self.iterations): 72 | rng_ask, _ = jax.random.split(self.rng, 2) 73 | x, es_state = self.es.ask(rng_ask, es_state) 74 | eval_rewards = [] 75 | for individual in x: 76 | if self.search_params: 77 | self.apply_parameters(individual[: self.total_n_params]) 78 | individual = individual[self.total_n_params :] 79 | for i, target in enumerate(self.search_targets): 80 | if target == "parameters": 81 | continue 82 | new_value = np.asarray(individual[i]).item() 83 | if target in ["_batch_size", "n_units"]: 84 | new_value = max(0, int(new_value)) 85 | setattr(self.agent, target, new_value) 86 | if self.train_agent: 87 | self.train(self.num_steps_per_iteration) 88 | eval_results = self.evaluate() 89 | eval_rewards.append(eval_results["mean_eval_reward"]) 90 | fitness = self.fit_shaper.apply(x, jnp.array(eval_rewards)) 91 | es_state = self.es.tell(x, fitness, es_state) 92 | eval_results = self.evaluate() 93 | return {"step": self.iterations}, eval_results 94 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_online_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Dict, Tuple 4 | 5 | from mighty.mighty_runners.mighty_runner import MightyRunner 6 | 7 | 8 | class MightyOnlineRunner(MightyRunner): 9 | def run(self) -> Tuple[Dict, Dict]: 10 | train_results = self.train(self.num_steps) 11 | eval_results = self.evaluate() 12 | return train_results, eval_results 13 | -------------------------------------------------------------------------------- /mighty/mighty_runners/mighty_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import warnings 5 | from abc import ABC 6 | from pathlib import Path 7 | from typing import TYPE_CHECKING, Any, Dict, Tuple 8 | 9 | from hydra.utils import get_class 10 | 11 | from mighty.mighty_agents.factory import get_agent_class 12 | from mighty.mighty_utils.envs import make_mighty_env 13 | 14 | warnings.filterwarnings("ignore") 15 | 16 | if TYPE_CHECKING: 17 | from omegaconf import DictConfig 18 | 19 | 20 | class MightyRunner(ABC): 21 | def __init__(self, cfg: DictConfig) -> None: 22 | """Parse config and run Mighty agent.""" 23 | output_dir = Path(cfg.output_dir) / f"{cfg.experiment_name}_{cfg.seed}" 24 | if not output_dir.exists(): 25 | output_dir.mkdir(parents=True) 26 | 27 | # Check whether env is from DACBench, CARL or gym 28 | # Make train and eval env 29 | env, base_eval_env, eval_default = make_mighty_env(cfg) 30 | 31 | wrapper_classes = [] 32 | for w in cfg.env_wrappers: 33 | wkwargs = cfg.wrapper_kwargs if "wrapper_kwargs" in cfg else {} 34 | cls = get_class(w) 35 | env = cls(env, **wkwargs) 36 | wrapper_classes.append((cls, wkwargs)) 37 | 38 | def wrap_eval(): # type: ignore 39 | wrapped_env = base_eval_env() 40 | for cls, wkwargs in wrapper_classes: 41 | wrapped_env = cls(wrapped_env, **wkwargs) 42 | return wrapped_env 43 | 44 | eval_env = wrap_eval() 45 | 46 | # Setup agent 47 | agent_class = get_agent_class(cfg.algorithm) 48 | args_agent = dict(cfg.algorithm_kwargs) 49 | self.agent = agent_class( # type: ignore 50 | env=env, 51 | eval_env=eval_env, 52 | 
output_dir=output_dir, 53 | seed=cfg.seed, 54 | **args_agent, 55 | ) 56 | 57 | self.eval_every_n_steps = cfg.eval_every_n_steps 58 | self.num_steps = cfg.num_steps 59 | 60 | # Load checkpoint if one is given 61 | if cfg.checkpoint is not None: 62 | self.agent.load(cfg.checkpoint) 63 | logging.info("#" * 80) 64 | logging.info(f"Loading checkpoint at {cfg.checkpoint}") 65 | 66 | # Train 67 | logging.info("#" * 80) 68 | logging.info(f'Using agent type "{self.agent}" to learn') 69 | logging.info("#" * 80) 70 | 71 | def train(self, num_steps: int, env=None) -> Any: # type: ignore 72 | return self.agent.run( 73 | n_steps=num_steps, env=env, eval_every_n_steps=self.eval_every_n_steps 74 | ) 75 | 76 | def evaluate(self, eval_env=None) -> Any: # type: ignore 77 | return self.agent.evaluate(eval_env) 78 | 79 | def run(self) -> Tuple[Dict, Dict]: 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /mighty/mighty_update/__init__.py: -------------------------------------------------------------------------------- 1 | from mighty.mighty_update.ppo_update import PPOUpdate 2 | from mighty.mighty_update.q_learning import ( 3 | ClippedDoubleQLearning, 4 | DoubleQLearning, 5 | QLearning, 6 | SPRQLearning, 7 | ) 8 | from mighty.mighty_update.sac_update import SACUpdate 9 | 10 | __all__ = [ 11 | "QLearning", 12 | "DoubleQLearning", 13 | "ClippedDoubleQLearning", 14 | "SPRQLearning", 15 | "SACUpdate", 16 | "PPOUpdate", 17 | ] 18 | -------------------------------------------------------------------------------- /mighty/mighty_utils/__init__ .py: -------------------------------------------------------------------------------- 1 | from types import MIGHTYENV, TypeKwargs, retrieve_class 2 | 3 | from envs import make_mighty_env 4 | from update_utils import polyak_update 5 | 6 | __all__ = [ 7 | "MIGHTYENV", 8 | "make_mighty_env", 9 | "TypeKwargs", 10 | "retrieve_class", 11 | "polyak_update", 12 | ] 13 | -------------------------------------------------------------------------------- /mighty/mighty_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .envs import make_mighty_env 2 | from .migthy_types import MIGHTYENV, TypeKwargs, retrieve_class 3 | from .update_utils import polyak_update 4 | 5 | __all__ = [ 6 | "MIGHTYENV", 7 | "make_mighty_env", 8 | "TypeKwargs", 9 | "retrieve_class", 10 | "polyak_update", 11 | ] 12 | -------------------------------------------------------------------------------- /mighty/mighty_utils/migthy_types.py: -------------------------------------------------------------------------------- 1 | """Type helpers for the mighty package.""" 2 | 3 | from __future__ import annotations 4 | 5 | import importlib 6 | from typing import Any, NewType 7 | 8 | import hydra 9 | from omegaconf import DictConfig 10 | 11 | TypeKwargs = NewType("TypeKwargs", dict[str, Any] | DictConfig) 12 | 13 | MIGHTYENV = None 14 | 15 | 16 | dacbench = importlib.util.find_spec("dacbench") 17 | dacbench_found = dacbench is not None 18 | if dacbench_found: 19 | import dacbench 20 | 21 | MIGHTYENV = dacbench.AbstractEnv 22 | DACENV = dacbench.AbstractEnv 23 | else: 24 | DACENV = int 25 | 26 | carl = importlib.util.find_spec("carl") 27 | carl_found = carl is not None 28 | if carl_found: 29 | from carl.envs.carl_env import CARLEnv 30 | 31 | if MIGHTYENV is None: 32 | MIGHTYENV = CARLEnv 33 | CARLENV = CARLEnv 34 | else: 35 | CARLENV = int 36 | 37 | if not carl_found and not dacbench_found: 38 | import gymnasium 
as gym 39 | 40 | MIGHTYENV = gym.Env 41 | 42 | 43 | def retrieve_class(cls: str | DictConfig | type, default_cls: type) -> type: 44 | """Get mighty class.""" 45 | if cls is None: 46 | cls = default_cls 47 | elif isinstance(cls, DictConfig): 48 | cls = hydra.utils.get_class(cls._target_) 49 | elif isinstance(cls, str): 50 | cls = hydra.utils.get_class(cls) 51 | return cls 52 | -------------------------------------------------------------------------------- /mighty/mighty_utils/test_helpers.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | import gymnasium as gym 4 | import numpy as np 5 | import torch 6 | 7 | 8 | class DummyEnv(gym.Env): 9 | def __init__(self): 10 | self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(3,)) 11 | self.action_space = gym.spaces.Discrete(4) 12 | self.inst_id = None 13 | self.instance_set = [42] 14 | 15 | @property 16 | def instance_id_list(self): 17 | return [self.inst_id] 18 | 19 | def set_inst_id(self, inst_id): 20 | self.inst_id = inst_id 21 | 22 | def set_instance_set(self, instance_set): 23 | self.instance_set = instance_set 24 | 25 | def reset(self, options={}, seed=None): 26 | if self.inst_id is None: 27 | self.inst_id = np.random.default_rng().integers(0, 100) 28 | return self.observation_space.sample(), {} 29 | 30 | def step(self, action): 31 | tr = np.random.default_rng().choice([0, 1], p=[0.9, 0.1]) 32 | return self.observation_space.sample(), 0, False, tr, {} 33 | 34 | 35 | class DummyModel: 36 | def __init__(self, action=1): 37 | self.action = action 38 | 39 | def __call__(self, s): 40 | fake_qs = np.zeros((len(s), 5)) 41 | fake_qs[:, self.action] = 1 42 | return torch.tensor(fake_qs) 43 | 44 | 45 | def clean(path): 46 | shutil.rmtree(path, ignore_errors=False, onerror=None) 47 | -------------------------------------------------------------------------------- /mighty/mighty_utils/update_utils.py: -------------------------------------------------------------------------------- 1 | def polyak_update(source_params, target_params, tau: float): 2 | """Polyak averaging for target network updates.""" 3 | for source, target in zip(source_params, target_params): 4 | target.data.copy_(tau * source.data + (1 - tau) * target.data) 5 | -------------------------------------------------------------------------------- /mighty/run_mighty.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import hydra 4 | import numpy as np 5 | from omegaconf import DictConfig 6 | 7 | from mighty.mighty_runners.factory import get_runner_class 8 | 9 | 10 | @hydra.main("./configs", "base", version_base=None) 11 | def run_mighty(cfg: DictConfig) -> None: 12 | # Make runner 13 | runner_cls = get_runner_class(cfg.runner) 14 | runner = runner_cls(cfg) 15 | 16 | # Execute run 17 | start = time.time() 18 | train_result, eval_result = runner.run() 19 | end = time.time() 20 | 21 | # Print stats 22 | print("Training finished!") 23 | print( 24 | f"Reached a reward of {np.round(eval_result['mean_eval_reward'], decimals=2)} in {train_result['step']} steps and {np.round(end - start, decimals=2)}s." 
25 | ) 26 | return eval_result["mean_eval_reward"] 27 | 28 | 29 | if __name__ == "__main__": 30 | run_mighty() 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "Mighty-RL" 7 | version = "0.0.1" 8 | description = "A modular, meta-learning-ready RL library." 9 | authors = [{ name = "AutoRL@LUHAI", email = "a.mohan@ai.uni-hannover.de" }] 10 | readme = "README.md" 11 | requires-python = ">=3.10,<3.12" 12 | license = { file = "LICENSE" } 13 | keywords = [ 14 | "Reinforcement Learning", 15 | "MetaRL", 16 | "Generalization in RL" 17 | ] 18 | classifiers = [ 19 | "Intended Audience :: Developers", 20 | "Programming Language :: Python :: 3 :: Only", 21 | "Development Status :: 3 - Alpha", 22 | "Topic :: Utilities", 23 | "Topic :: Scientific/Engineering", 24 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 25 | "License :: OSI Approved :: BSD License" 26 | ] 27 | 28 | dependencies = [ 29 | "numpy~=1.21", 30 | "gymnasium", 31 | "matplotlib~=3.4", 32 | "seaborn~=0.11", 33 | "tensorboard", 34 | "hydra-core~=1.2", 35 | "hydra-colorlog~=1.2", 36 | "hydra-submitit-launcher~=1.2", 37 | "pandas", 38 | "scipy", 39 | "rich~=12.4", 40 | "wandb~=0.12", 41 | "torch", 42 | "dill", 43 | "imageio", 44 | "evosax==0.1.6", 45 | "rliable", 46 | "seaborn", 47 | "uniplot" 48 | ] 49 | 50 | [project.optional-dependencies] 51 | dev = ["ruff", "mypy", "build", "pytest", "pytest-cov"] 52 | carl = ["carl_bench==1.1.0", "brax==0.9.3", "protobuf>=3.17.3", "jax==0.4.18", "jaxlib~=0.4.18"] 53 | dacbench = ["dacbench>=0.3.0", "torchvision", "ioh"] 54 | pufferlib = ["pufferlib==2.0.6"] 55 | docs = ["mkdocs", "mkdocs-material", "mkdocs-autorefs", 56 | "mkdocs-gen-files", "mkdocs-literate-nav", 57 | "mkdocs-glightbox", "mkdocs-glossary-plugin", 58 | "mkdocstrings[python]", "markdown-exec[ansi]", "mike"] 59 | examples = [] 60 | 61 | [tool.setuptools.packages.find] 62 | include = ["mighty*", "examples"] 63 | 64 | [tool.ruff] 65 | extend-exclude = [] 66 | 67 | [tool.ruff.lint] 68 | ignore = [ 69 | # Conflicts with the formatter 70 | "COM812", "ISC001" 71 | ] 72 | 73 | [tool.mypy] 74 | python_version = "3.10" 75 | disallow_untyped_defs = true 76 | show_error_codes = true 77 | no_implicit_optional = true 78 | warn_return_any = true 79 | warn_unused_ignores = true 80 | exclude = ["scripts", "docs", "test"] 81 | 82 | [[tool.uv.index]] 83 | name = "testpypi" 84 | url = "https://test.pypi.org/simple/" 85 | publish-url = "https://test.pypi.org/legacy/" 86 | explicit = true -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/Mighty/a46077c6814d02ec8d9b100db892a480bb4e05e7/test/__init__.py -------------------------------------------------------------------------------- /test/agents/test_agent_factory.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from mighty.mighty_agents import AGENT_CLASSES, VALID_AGENT_TYPES 6 | from mighty.mighty_agents.factory import get_agent_class 7 | 8 | 9 | class TestFactory: 10 | def test_create_agent(self): 11 | for agent_type in VALID_AGENT_TYPES: 12 | agent_class = 
get_agent_class(agent_type) 13 | assert agent_class == AGENT_CLASSES[agent_type] 14 | 15 | def test_create_agent_with_invalid_type(self): 16 | with pytest.raises(ValueError): 17 | get_agent_class("INVALID") 18 | -------------------------------------------------------------------------------- /test/agents/test_base_agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import gymnasium as gym 6 | import pytest 7 | 8 | from mighty.mighty_agents.dqn import MightyAgent, MightyDQNAgent 9 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 10 | 11 | 12 | class TestMightyAgent: 13 | def test_init(self): 14 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 15 | output_dir = Path("test_base_agent") 16 | output_dir.mkdir(parents=True, exist_ok=True) 17 | with pytest.raises(NotImplementedError): 18 | MightyAgent( 19 | output_dir, 20 | env, 21 | meta_kwargs=None, 22 | wandb_kwargs=None, 23 | meta_methods=None, 24 | ) 25 | clean(output_dir) 26 | 27 | def test_make_checkpoint_dir(self): 28 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 29 | output_dir = Path("test_base_agent") 30 | output_dir.mkdir(parents=True, exist_ok=True) 31 | agent = MightyDQNAgent(output_dir, env) 32 | 33 | agent.make_checkpoint_dir(1) 34 | assert Path(agent.checkpoint_dir).exists() 35 | clean(output_dir) 36 | 37 | def test_apply_config(self): 38 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 39 | output_dir = Path("test_base_agent") 40 | output_dir.mkdir(parents=True, exist_ok=True) 41 | agent = MightyDQNAgent(output_dir, env) 42 | config = { 43 | "learning_rate": -1, 44 | } 45 | agent.apply_config(config) 46 | assert agent.learning_rate == -1 47 | clean(output_dir) 48 | -------------------------------------------------------------------------------- /test/exploration/test_epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | import pytest 5 | import torch 6 | 7 | from mighty.mighty_exploration import EpsilonGreedy 8 | from mighty.mighty_utils.test_helpers import DummyModel 9 | 10 | 11 | class TestEpsilonGreedy: 12 | def get_policy(self, epsilon=0.1): 13 | return EpsilonGreedy(algo="q", model=DummyModel(), epsilon=epsilon) 14 | 15 | @pytest.mark.parametrize( 16 | "state", 17 | [ 18 | torch.tensor([[0, 1], [0, 1]]), 19 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 20 | torch.tensor( 21 | [[0, 235, 67], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]] 22 | ), 23 | ], 24 | ) 25 | def test_exploration_func(self, state): 26 | policy = self.get_policy(epsilon=0.0) 27 | actions, qvals = policy.explore_func(state) 28 | greedy_actions, greedy_qvals = policy.sample_action(state) 29 | assert len(actions) == len(state), "Action should be predicted per state." 30 | assert all(a == g for g in greedy_actions for a in actions), ( 31 | f"Actions should match greedy: {actions}///{greedy_actions}" 32 | ) 33 | assert torch.equal(qvals, greedy_qvals), ( 34 | f"Q-values should match greedy: {qvals}///{greedy_qvals}" 35 | ) 36 | 37 | policy = self.get_policy(epsilon=0.5) 38 | actions = np.array( 39 | [policy.explore_func(state)[0] for _ in range(100)] 40 | ).flatten() 41 | assert sum([a == 1 for a in actions]) / (100 * len(state)) > 0.5, ( 42 | "Actions should match greedy at least in half of cases." 
43 | ) 44 | assert sum([a == 1 for a in actions]) / (100 * len(state)) < 0.8, ( 45 | "Actions should match greedy in less than 4/5 of cases." 46 | ) 47 | 48 | policy = self.get_policy(epsilon=np.linspace(0, 1, len(state))) 49 | actions = np.array([policy.explore_func(state)[0] for _ in range(100)]) 50 | assert all(actions[:, 0] == 1), "Low index actions should match greedy." 51 | assert sum(actions[:, -1] == 1) / 100 < 0.33, ( 52 | "High index actions should not match greedy more than 1/3 of the time." 53 | ) 54 | 55 | @pytest.mark.parametrize( 56 | "state", 57 | [ 58 | torch.tensor([[0, 1], [0, 1]]), 59 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 60 | torch.tensor( 61 | [[0, 235, 67], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2], [0, 1, 2]] 62 | ), 63 | ], 64 | ) 65 | def test_multiple_epsilons(self, state): 66 | """Test multiple epsilon values.""" 67 | policy = self.get_policy(epsilon=[0.1, 0.5]) 68 | assert np.all(policy.epsilon == [0.1, 0.5]), "Epsilon should be [0.1, 0.5]." 69 | action, _ = policy.explore_func(state) 70 | assert len(action) == len(state.numpy()), ( 71 | f"Action should be predicted per state: len({action}) != len({state.numpy()})." 72 | ) 73 | -------------------------------------------------------------------------------- /test/exploration/test_exploration.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | import torch 5 | 6 | from mighty.mighty_exploration import MightyExplorationPolicy 7 | from mighty.mighty_utils.test_helpers import DummyModel 8 | 9 | 10 | class TestPolicy: 11 | def get_policy(self, action=1): 12 | return MightyExplorationPolicy(algo="q", model=DummyModel(action=action)) 13 | 14 | def test_exploration_func(self): 15 | with pytest.raises(NotImplementedError): 16 | self.get_policy().explore_func([0]) 17 | 18 | @pytest.mark.parametrize( 19 | "state", 20 | [ 21 | torch.tensor([0]), 22 | torch.tensor([0, 1]), 23 | torch.tensor([[0, 235, 67], [0, 1, 2]]), 24 | ], 25 | ) 26 | def test_call(self, state): 27 | policy = self.get_policy() 28 | with pytest.raises(NotImplementedError): 29 | policy(state) 30 | 31 | greedy_actions, qvals = policy(state, evaluate=True, return_logp=True) 32 | assert all(greedy_actions == 1), ( 33 | f"Greedy actions should be 1: {greedy_actions}///{qvals}" 34 | ) 35 | assert qvals.shape[-1] == 5, "Q-value shape should not be changed." 36 | assert len(qvals) == len(state), "Q-value length should not be changed." 37 | 38 | policy = self.get_policy(action=3) 39 | greedy_actions, qvals = policy(state, evaluate=True, return_logp=True) 40 | assert all(greedy_actions == 3), ( 41 | f"Greedy actions should be 3: {greedy_actions}///{qvals}" 42 | ) 43 | assert qvals.shape[-1] == 5, "Q-value shape should not be changed." 44 | assert len(qvals) == len(state), "Q-value length should not be changed." 
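# Usage sketch (not part of the test suite): the exploration policies exercised above wrap
# any callable model that maps a batch of states to action values. DummyModel is reused
# here purely for illustration; inside the DQN agent the Q-network plays that role.
import torch

from mighty.mighty_exploration import EpsilonGreedy
from mighty.mighty_utils.test_helpers import DummyModel

policy = EpsilonGreedy(algo="q", model=DummyModel(), epsilon=0.1)
states = torch.tensor([[0, 1, 2], [3, 4, 5]])

actions, qvals = policy.explore_func(states)  # epsilon-greedy: mostly argmax, sometimes random
greedy_actions, greedy_qvals = policy(states, evaluate=True, return_logp=True)  # pure greedy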
45 | -------------------------------------------------------------------------------- /test/exploration/test_ez_greedy.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import gymnasium as gym 4 | import numpy as np 5 | 6 | from mighty.mighty_agents.dqn import MightyDQNAgent 7 | from mighty.mighty_exploration.ez_greedy import EZGreedy 8 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 9 | 10 | 11 | class TestEZGreedy: 12 | def test_init(self) -> None: 13 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 14 | output_dir = Path("test_ez_greedy") 15 | output_dir.mkdir(parents=True, exist_ok=True) 16 | dqn = MightyDQNAgent( 17 | output_dir, 18 | env, 19 | use_target=False, 20 | policy_class="mighty.mighty_exploration.EZGreedy", 21 | ) 22 | assert isinstance(dqn.policy, EZGreedy), ( 23 | "Policy should be an instance of EZGreedy when creating with string." 24 | ) 25 | assert dqn.policy.epsilon == 0.1, "Default epsilon should be 0.1." 26 | assert dqn.policy.zipf_param == 2, "Default zipf_param should be 2." 27 | assert dqn.policy.skipped is None, "Skip should be initialized at None." 28 | assert dqn.policy.frozen_actions is None, ( 29 | "Frozen actions should be initialized at None." 30 | ) 31 | 32 | dqn = MightyDQNAgent( 33 | output_dir, 34 | env, 35 | use_target=False, 36 | policy_class=EZGreedy, 37 | policy_kwargs={"epsilon": [0.5, 0.3], "zipf_param": 3}, 38 | ) 39 | assert isinstance(dqn.policy, EZGreedy), ( 40 | "Policy should be an instance of EZGreedy when creating with class." 41 | ) 42 | assert np.all(dqn.policy.epsilon == [0.5, 0.3]), "Epsilon should be [0.5, 0.3]." 43 | assert dqn.policy.zipf_param == 3, "zipf_param should be 3." 44 | clean(output_dir) 45 | 46 | def test_skip_single(self) -> None: 47 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 48 | output_dir = Path("test_ez_greedy") 49 | output_dir.mkdir(parents=True, exist_ok=True) 50 | dqn = MightyDQNAgent( 51 | output_dir, 52 | env, 53 | use_target=False, 54 | policy_class="mighty.mighty_exploration.EZGreedy", 55 | policy_kwargs={"epsilon": 0.0, "zipf_param": 3}, 56 | ) 57 | 58 | state, _ = env.reset() 59 | action = dqn.policy([state]) 60 | assert np.all(action < env.single_action_space.n), ( 61 | "Action should be within the action space." 62 | ) 63 | assert len(action) == len(state), "Action should be predicted per state." 64 | 65 | dqn.policy.skipped = np.array([1]) 66 | next_action = dqn.policy([state]) 67 | assert np.all(action == next_action), ( 68 | "Action should be the same as the previous action when skip is active." 69 | ) 70 | assert dqn.policy.skipped[0] == 0, "Skip should be decayed by one." 71 | clean(output_dir) 72 | 73 | def test_skip_batch(self) -> None: 74 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(2)]) 75 | output_dir = Path("test_ez_greedy") 76 | output_dir.mkdir(parents=True, exist_ok=True) 77 | dqn = MightyDQNAgent( 78 | output_dir, 79 | env, 80 | use_target=False, 81 | policy_class=EZGreedy, 82 | policy_kwargs={"epsilon": [0.5, 1.0], "zipf_param": 3}, 83 | ) 84 | 85 | state, _ = env.reset() 86 | action = dqn.policy(state) 87 | assert all([a < env.single_action_space.n for a in action]), ( 88 | "Actions should be within the action space." 89 | ) 90 | assert len(action) == len(state), "Action should be predicted per state." 
91 | 92 | dqn.policy.skipped = np.array([3, 0]) 93 | next_action = dqn.policy(state + 2) 94 | assert action[0] == next_action[0], ( 95 | f"First action should be the same as the previous action when skip is active: {action[0]} != {next_action[0]}" 96 | ) 97 | assert dqn.policy.skipped[0] == 2, "Skip should be decayed by one." 98 | assert dqn.policy.skipped[1] >= 0, "Skip should not be decayed below one." 99 | clean(output_dir) 100 | -------------------------------------------------------------------------------- /test/meta_components/test_cosine_schedule.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import gymnasium as gym 6 | 7 | from mighty.mighty_agents.dqn import MightyDQNAgent 8 | from mighty.mighty_utils.test_helpers import DummyEnv, clean 9 | 10 | 11 | class TestCosineLR: 12 | def test_decay(self) -> None: 13 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 14 | output_dir = Path("test_cosine") 15 | output_dir.mkdir(parents=True, exist_ok=True) 16 | dqn = MightyDQNAgent( 17 | output_dir, 18 | env, 19 | meta_methods=["mighty.mighty_meta.CosineLRSchedule"], 20 | meta_kwargs=[ 21 | {"initial_lr": 0.2, "num_decay_steps": 100, "restart_every": 0} 22 | ], 23 | ) 24 | lr = 1.5 25 | dqn.learning_rate = lr 26 | for i in range(4): 27 | metrics = dqn.run(n_steps=10 * (i + 1)) 28 | assert metrics["hp/lr"] == dqn.learning_rate, ( 29 | f"Learning rate should be set to schedule value {metrics['hp/lr']} instead of {dqn.learning_rate}." 30 | ) 31 | assert dqn.learning_rate < lr, ( 32 | f"Learning rate should decrease: {dqn.learning_rate} is not less than {lr}." 33 | ) 34 | lr = dqn.learning_rate.copy() 35 | clean(output_dir) 36 | 37 | def test_restart(self) -> None: 38 | env = gym.vector.SyncVectorEnv([DummyEnv for _ in range(1)]) 39 | output_dir = Path("test_cosine") 40 | output_dir.mkdir(parents=True, exist_ok=True) 41 | dqn = MightyDQNAgent( 42 | output_dir, 43 | env, 44 | meta_methods=["mighty.mighty_meta.CosineLRSchedule"], 45 | meta_kwargs=[ 46 | {"initial_lr": 0.2, "num_decay_steps": 100, "restart_every": 5} 47 | ], 48 | ) 49 | dqn.run(6, 0) 50 | assert dqn.meta_modules["CosineLRSchedule"].n_restarts == 1, ( 51 | "Restart counter should increase." 52 | ) 53 | assert dqn.learning_rate >= dqn.meta_modules["CosineLRSchedule"].eta_max, ( 54 | f"Restart should increase learning rate: {dqn.learning_rate} is not {dqn.meta_modules['CosineLRSchedule'].eta_max}." 
55 | ) 56 | clean(output_dir) 57 | -------------------------------------------------------------------------------- /test/meta_components/test_space.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | from mighty.mighty_agents.dqn import MightyDQNAgent 6 | from mighty.mighty_meta import SPaCE 7 | from mighty.mighty_utils.test_helpers import DummyEnv, DummyModel, clean 8 | from mighty.mighty_utils.wrappers import ContextualVecEnv 9 | 10 | 11 | class TestSPaCE: 12 | def test_init(self) -> None: 13 | space = SPaCE(criterion="improvement", threshold=0.5, k=2) 14 | assert space.criterion == "improvement" 15 | assert space.threshold == 0.5 16 | assert space.increase_by_k_instances == 2 17 | assert space.current_instance_set_size == 2 18 | assert space.last_evals is None 19 | 20 | def test_get_instances(self) -> None: 21 | space = SPaCE() 22 | metrics = { 23 | "env": DummyEnv(), 24 | "vf": DummyModel(), 25 | "rollout_values": [[0.0, 0.6, 0.7]], 26 | } 27 | space.get_instances(metrics) 28 | assert len(space.all_instances) == 1, ( 29 | f"Expected 1, got {len(space.all_instances)}" 30 | ) 31 | assert len(space.instance_set) == 1, ( 32 | f"Expected 1, got {len(space.instance_set)}" 33 | ) 34 | assert space.last_evals is not None, "Evals should not be None." 35 | 36 | def test_get_evals(self) -> None: 37 | vf = DummyModel() 38 | env = DummyEnv() 39 | space = SPaCE() 40 | space.all_instances = env.instance_id_list 41 | values = space.get_evals(env, vf) 42 | assert len(values) == 1, f"Expected 1 value, got {len(values)}" 43 | 44 | def test_in_loop(self) -> None: 45 | env = ContextualVecEnv([DummyEnv for _ in range(2)]) 46 | output_dir = Path("test_space") 47 | output_dir.mkdir(parents=True, exist_ok=True) 48 | dqn = MightyDQNAgent( 49 | output_dir, 50 | env, 51 | use_target=False, 52 | meta_methods=["mighty.mighty_meta.SPaCE"], 53 | ) 54 | assert dqn.meta_modules["SPaCE"] is not None, "SPaCE should be initialized." 55 | dqn.run(100, 0) 56 | assert dqn.meta_modules["SPaCE"].all_instances is not None, ( 57 | "All instances should be initialized." 58 | ) 59 | assert env.inst_ids[0] in dqn.meta_modules["SPaCE"].all_instances, ( 60 | "Instance should be in all instances." 
61 | ) 62 | clean(output_dir) 63 | -------------------------------------------------------------------------------- /test/runners/test_es_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import shutil 4 | from copy import deepcopy 5 | 6 | from omegaconf import OmegaConf 7 | 8 | from mighty.mighty_agents import MightyAgent 9 | from mighty.mighty_runners import MightyESRunner, MightyRunner 10 | from mighty.mighty_utils.wrappers import PufferlibToGymAdapter 11 | 12 | 13 | class TestMightyNESRunner: 14 | runner_config = OmegaConf.create( 15 | { 16 | "runner": "es", 17 | "es": "evosax.xNES", 18 | "search_targets": ["parameters", "_batch_size", "learning_rate"], 19 | "rl_train_agent": True, 20 | "num_steps_per_iteration": 10, 21 | "iterations": 2, 22 | "popsize": 3, 23 | "debug": False, 24 | "seed": 0, 25 | "output_dir": "test_nes_runner", 26 | "wandb_project": None, 27 | "tensorboard_file": None, 28 | "experiment_name": "mighty_experiment", 29 | "eval_every_n_steps": 1e4, 30 | "n_episodes_eval": 10, 31 | "checkpoint": None, 32 | "save_model_every_n_steps": 5e5, 33 | "num_steps": 100, 34 | "env": "pufferlib.ocean.bandit", 35 | "env_kwargs": {}, 36 | "env_wrappers": [], 37 | "num_envs": 1, 38 | "algorithm": "DQN", 39 | "algorithm_kwargs": { 40 | "n_units": 8, 41 | "epsilon": 0.2, 42 | "replay_buffer_class": "mighty.mighty_replay.PrioritizedReplay", 43 | "replay_buffer_kwargs": {"capacity": 1000000, "alpha": 0.6}, 44 | "learning_rate": 0.001, 45 | "batch_size": 64, 46 | "gamma": 0.9, 47 | "soft_update_weight": 1.0, 48 | "td_update_class": "mighty.mighty_update.QLearning", 49 | "q_kwargs": { 50 | "dueling": False, 51 | "feature_extractor_kwargs": { 52 | "architecture": "mlp", 53 | "n_layers": 1, 54 | "hidden_sizes": [32], 55 | }, 56 | "head_kwargs": {"hidden_sizes": [32]}, 57 | }, 58 | }, 59 | } 60 | ) 61 | 62 | def test_init(self): 63 | runner = MightyESRunner(self.runner_config) 64 | assert isinstance(runner, MightyRunner), ( 65 | "MightyNESRunner should be an instance of MightyRunner" 66 | ) 67 | assert isinstance(runner.agent, MightyAgent), ( 68 | "MightyNESRunner should have a MightyAgent" 69 | ) 70 | assert isinstance(runner.agent.eval_env, PufferlibToGymAdapter), ( 71 | "Eval env should be a PufferlibToGymAdapter" 72 | ) 73 | assert runner.agent.env is not None, "Env should be set" 74 | assert runner.iterations is not None, "Iterations should be set" 75 | assert runner.es is not None, "ES should be set" 76 | assert runner.fit_shaper is not None, "Fit shaper should be set" 77 | assert runner.rng is not None, "RNG should be set" 78 | 79 | def test_run(self): 80 | runner = MightyESRunner(self.runner_config) 81 | old_params = deepcopy(runner.agent.parameters) 82 | old_lr = runner.agent.learning_rate 83 | old_batch_size = runner.agent._batch_size 84 | train_results, eval_results = runner.run() 85 | new_params = runner.agent.parameters 86 | assert isinstance(train_results, dict), "Train results should be a dictionary" 87 | assert isinstance(eval_results, dict), "Eval results should be a dictionary" 88 | assert "mean_eval_reward" in eval_results, ( 89 | "Mean eval reward should be in eval results" 90 | ) 91 | param_equals = [o == p for o, p in zip(old_params, new_params)] 92 | for params in param_equals: 93 | assert not all(params.flatten()), ( 94 | "Parameters should have changed in training" 95 | ) 96 | assert not old_lr == runner.agent.learning_rate, ( 97 | "Learning rate should have changed in training" 98 
| ) 99 | assert not old_batch_size == runner.agent._batch_size, ( 100 | "Batch size should have changed in training" 101 | ) 102 | shutil.rmtree("test_nes_runner") 103 | -------------------------------------------------------------------------------- /test/runners/test_runner.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import shutil 4 | 5 | import pytest 6 | from omegaconf import OmegaConf 7 | 8 | from mighty.mighty_agents import MightyAgent 9 | from mighty.mighty_runners import MightyOnlineRunner, MightyRunner 10 | from mighty.mighty_utils.wrappers import PufferlibToGymAdapter 11 | 12 | 13 | class TestMightyRunner: 14 | runner_config = OmegaConf.create( 15 | { 16 | "runner": "standard", 17 | "debug": False, 18 | "seed": 0, 19 | "output_dir": "test_runner", 20 | "wandb_project": None, 21 | "tensorboard_file": None, 22 | "experiment_name": "mighty_experiment", 23 | "eval_every_n_steps": 1e4, 24 | "n_episodes_eval": 10, 25 | "checkpoint": None, 26 | "save_model_every_n_steps": 5e5, 27 | "num_steps": 100, 28 | "env": "pufferlib.ocean.bandit", 29 | "env_kwargs": {}, 30 | "env_wrappers": [], 31 | "num_envs": 1, 32 | "algorithm": "DQN", 33 | "algorithm_kwargs": { 34 | "n_units": 8, 35 | "epsilon": 0.2, 36 | "replay_buffer_class": "mighty.mighty_replay.PrioritizedReplay", 37 | "replay_buffer_kwargs": {"capacity": 1000000, "alpha": 0.6}, 38 | "learning_rate": 0.001, 39 | "batch_size": 64, 40 | "gamma": 0.9, 41 | "soft_update_weight": 1.0, 42 | "td_update_class": "mighty.mighty_update.QLearning", 43 | "q_kwargs": { 44 | "dueling": False, 45 | "feature_extractor_kwargs": { 46 | "architecture": "mlp", 47 | "n_layers": 1, 48 | "hidden_sizes": [32], 49 | }, 50 | "head_kwargs": {"hidden_sizes": [32]}, 51 | }, 52 | }, 53 | } 54 | ) 55 | 56 | def test_init(self): 57 | runner = MightyOnlineRunner(self.runner_config) 58 | assert isinstance(runner, MightyRunner), ( 59 | "MightyOnlineRunner should be an instance of MightyRunner" 60 | ) 61 | assert isinstance(runner.agent, MightyAgent), ( 62 | "MightyOnlineRunner should have a MightyAgent" 63 | ) 64 | assert isinstance(runner.agent.eval_env, PufferlibToGymAdapter), ( 65 | "Eval env should be a PufferlibToGymAdapter" 66 | ) 67 | assert runner.agent.env is not None, "Env should not be None" 68 | assert runner.eval_every_n_steps == self.runner_config.eval_every_n_steps, ( 69 | "Eval every n steps should be set" 70 | ) 71 | assert runner.num_steps == self.runner_config.num_steps, ( 72 | "Num steps should be set" 73 | ) 74 | 75 | def test_train(self): 76 | runner = MightyOnlineRunner(self.runner_config) 77 | results = runner.train(100) 78 | assert isinstance(results, dict), "Results should be a dictionary" 79 | alternate_env = True 80 | with pytest.raises(AttributeError): 81 | runner.train(100, alternate_env) 82 | 83 | def test_evaluate(self): 84 | runner = MightyOnlineRunner(self.runner_config) 85 | results = runner.evaluate() 86 | assert isinstance(results, dict), "Results should be a dictionary" 87 | assert "mean_eval_reward" in results, "Results should have mean_eval_reward" 88 | alternate_env = True 89 | with pytest.raises(AttributeError): 90 | runner.evaluate(alternate_env) 91 | 92 | def test_run(self): 93 | runner = MightyOnlineRunner(self.runner_config) 94 | train_results, eval_results = runner.run() 95 | assert isinstance(train_results, dict), "Train results should be a dictionary" 96 | assert isinstance(eval_results, dict), "Eval results should be a dictionary" 97 | 
assert "mean_eval_reward" in eval_results, ( 98 | "Eval results should have mean_eval_reward" 99 | ) 100 | shutil.rmtree("test_runner") 101 | -------------------------------------------------------------------------------- /test/runners/test_runner_factory.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from mighty.mighty_runners.factory import get_runner_class 6 | from mighty.mighty_runners.mighty_online_runner import MightyOnlineRunner 7 | 8 | VALID_RUNNER_TYPES = ["standard", "default", "online"] 9 | RUNNER_CLASSES = { 10 | "standard": MightyOnlineRunner, 11 | "default": MightyOnlineRunner, 12 | "online": MightyOnlineRunner, 13 | } 14 | 15 | 16 | class TestFactory: 17 | def test_create_agent(self): 18 | for runner_type in VALID_RUNNER_TYPES: 19 | runner_class = get_runner_class(runner_type) 20 | assert runner_class == RUNNER_CLASSES[runner_type], ( 21 | f"Runner class should be {RUNNER_CLASSES[runner_type]}" 22 | ) 23 | 24 | def test_create_agent_with_invalid_type(self): 25 | with pytest.raises(ValueError): 26 | get_runner_class("INVALID") 27 | -------------------------------------------------------------------------------- /test/test_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | import pytest 5 | 6 | IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" 7 | 8 | 9 | @pytest.mark.skipif( 10 | IN_GITHUB_ACTIONS, 11 | reason="The python called here for some reason will lack some dependencies in GH actions. Test locally instead.", 12 | ) 13 | class TestMightCLI: 14 | def test_run_from_file(self): 15 | exit_status = os.system( 16 | "uv run python mighty/run_mighty.py num_steps=100 output_dir=test_cli" 17 | ) 18 | assert exit_status == 0 19 | shutil.rmtree("test_cli") 20 | --------------------------------------------------------------------------------