├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── LICENSE ├── README.md ├── bricksrl ├── Pybricks │ └── PybricksHubClass.py ├── __init__.py └── environments │ ├── __init__.py │ ├── base │ └── base_env.py │ ├── dummy │ ├── mixed_obs_dummy.py │ └── vec_obs_dummy.py │ ├── roboarm_mixed_v0 │ ├── RoboArmMixedEnv.py │ └── client.py │ ├── roboarm_v0 │ ├── RoboArmEnv.py │ ├── RoboArmSim.py │ └── client.py │ ├── runaway_v0 │ ├── RunAwayEnv.py │ └── client.py │ ├── spinning_v0 │ ├── SpinningEnv.py │ └── client.py │ └── walker_v0 │ ├── WalkerEnv.py │ ├── WalkerEnvSim.py │ └── client.py ├── conf ├── README.md ├── agent │ ├── bc.yaml │ ├── cql.yaml │ ├── droq.yaml │ ├── iql.yaml │ ├── random.yaml │ ├── sac.yaml │ └── td3.yaml ├── config.yaml └── env │ ├── roboarm-v0.yaml │ ├── roboarm_mixed-v0.yaml │ ├── roboarm_sim-v0.yaml │ ├── runaway-v0.yaml │ ├── spinning-v0.yaml │ ├── walker-v0.yaml │ └── walker_sim-v0.yaml ├── examples ├── README.md ├── custom_env.py ├── example_notebook.ipynb └── torchrl_sac │ ├── config.yaml │ ├── train.py │ └── utils.py ├── experiments ├── 2wheeler │ ├── eval.py │ ├── pretrain.py │ └── train.py ├── helper │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── base.py │ │ ├── behavior_cloning.py │ │ ├── cql.py │ │ ├── iql.py │ │ ├── random.py │ │ ├── sac.py │ │ └── td3.py │ ├── networks │ │ ├── __init__.py │ │ └── networks.py │ └── utils.py ├── roboarm │ ├── eval.py │ ├── pretrain.py │ └── train.py └── walker │ ├── eval.py │ ├── pretrain.py │ └── train.py ├── pyproject.toml ├── setup.py └── tests ├── __init__.py ├── test_agents.py └── test_env_sim.py /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Runs on both push and pull_request events to the main branch. 5 | push: 6 | branches: 7 | - '**' # or master, depending on your default branch 8 | pull_request: 9 | branches: 10 | - main # or master 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | python-version: [3.8.18, 3.9] 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -e .[dev] 34 | 35 | - name: List files # checking if the files are in the right place 36 | run: | 37 | ls 38 | 39 | - name: Run tests with pytest 40 | run: | 41 | pytest 42 | 43 | - name: Check code formatting with ufmt 44 | run: ufmt check . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | .DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # wandb 133 | wandb/ 134 | # hydra 135 | outputs/ 136 | # .pth files 137 | *.pth 138 | 139 | # vscode 140 | .vscode/ 141 | 142 | # dev tools 143 | pytest.ini 144 | .pre-commit-config.yaml 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 www.compscience.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BricksRL 2 | 3 | ![CI](https://github.com/BricksRL/bricksrl/actions/workflows/ci.yaml/badge.svg?branch=main) 4 | ![Python](https://img.shields.io/badge/python-3.8%20%7C%203.9-blue) 5 | [![arXiv](https://img.shields.io/badge/arXiv-2406.17490-b31b1b.svg)](https://arxiv.org/abs/2406.17490) 6 | [![Website](https://img.shields.io/badge/Website-Visit%20Now-blue)](https://bricksrl.github.io/ProjectPage/) 7 | [![Discord](https://img.shields.io/badge/Join_our_Discord-7289da?logo=discord&logoColor=ffffff&labelColor=7289da)](https://discord.gg/qdTsFaVfZm) 8 | 9 | 10 | BricksRL allows the training of custom LEGO robots using deep reinforcement learning. By integrating [Pybricks](https://pybricks.com/) and [TorchRL](https://pytorch.org/rl/stable/index.html), it facilitates efficient real-world training via Bluetooth communication between LEGO hubs and a local computing device. Check out our [paper](https://arxiv.org/abs/2406.17490)! 11 | 12 | For additional information and building instructions for the robots, view the project page [BricksRL](https://bricksrl.github.io/ProjectPage/). 13 | 14 | 15 | 16 | 17 | ## Prerequisites 18 |
19 | Click me 20 | 21 | ### Enable Web Bluetooth in Chrome 22 | 23 | 1. Go to "chrome://flags/" 24 | 2. Enable "Experimental Web Platform features" 25 | 3. Restart Chrome 26 | 4. Use beta.pybricks.com to edit and upload the client scripts for each environment 27 | 28 | ### Environment Setup 29 | 30 | 1. **Create a Conda environment:** 31 | ```bash 32 | conda create --name bricksrl python=3.8 33 | ``` 34 | 2. **Activate the environment:** 35 | ```bash 36 | conda activate bricksrl 37 | ``` 38 | 3. **Install PyTorch:** 39 | ```bash 40 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 41 | 42 | ``` 43 | 4. **Install bricksrl and additional packages:** 44 | For regular users, install the package and all required dependencies by running: 45 | ```bash 46 | pip install -e . 47 | ``` 48 | 49 | This will install the bricksrl package along with the dependencies listed in setup.py. 50 | 51 | 5. **(Optional) Install development tools:** 52 | 53 | If you are a developer and need development tools (e.g., pytest, ufmt, pre-commit), use the following command to install them as extras: 54 | 55 | ```bash 56 | pip install -e .[dev] 57 | ``` 58 | This will install the development dependencies defined in setup.py along with the package. 59 | 60 | 61 |
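To sanity-check the installation without a LEGO hub, you can roll out one of the bundled dummy environments. The snippet below is a minimal, illustrative sketch: the import path is inferred from the repository layout (`bricksrl/environments/dummy/vec_obs_dummy.py`), and the rollout length is arbitrary.

```python
# Smoke test: roll out a dummy environment that needs no Bluetooth connection.
from bricksrl.environments.dummy.vec_obs_dummy import VecObsDummyEnv

env = VecObsDummyEnv(max_episode_steps=10)
rollout = env.rollout(max_steps=5)  # random actions sampled from the action spec
print(rollout["observation"].shape)  # batched observations collected over 5 steps
```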
62 | 63 | ## Usage 64 | ### Client 65 | Update your client script on the [Pybricks Hub](https://beta.pybricks.com/) whenever you want to run a new environment with your robot. 66 | 67 | 68 | ### Config 69 | Before running experiments, please review and modify the configuration settings according to your needs. Each environment and agent setup has its own configuration file under the conf/ directory. For more information, check out the [config README](conf/README.md). 70 | 71 | 72 | ### Robots 73 | 74 | The robots used in our experiments are shown below; building instructions can be found [here](https://bricksrl.github.io/ProjectPage/). 75 | 76 | | ![2wheeler](https://drive.google.com/uc?export=view&id=1IxqQ1VZchPZMNXyZnTULuNy53-LMYT6W) | ![Walker](https://drive.google.com/uc?export=view&id=1ImR0f1UNjC4sUHXWWg_D06eukrh-doW9) | ![RoboArm](https://drive.google.com/uc?export=view&id=1IYCJrl5rZBvOb6xKwbSUZqYrVwKjCpJH) | 77 | |:--:|:--:|:--:| 78 | | **2Wheeler** | **Walker** | **RoboArm** | 79 | 80 | 81 | ## Run Experiments 82 | ### Train an Agent 83 | 84 | ```bash 85 | python experiments/walker/train.py 86 | ``` 87 | 88 | ### Evaluate an Agent 89 | ```bash 90 | python experiments/walker/eval.py 91 | ``` 92 | 93 | ## Results 94 |
95 | Click me 96 | 97 | Evaluation videos of the trained agents can be found [here](https://bricksrl.github.io/ProjectPage/). 98 | 99 | ### 2Wheeler Results: 100 | 101 | 2Wheeler Results 102 | 103 | ### Walker Results: 104 | 105 | Walker Results 106 | 107 | ### RoboArm Results: 108 | 109 | RoboArm Results 110 | 111 | RoboArm Mixed Results 112 | 113 |
114 | 115 | 116 | ### Offline RL 117 |
118 | Click me 119 | 120 | Using precollected [datasets](https://huggingface.co/datasets/compsciencelab/BricksRL-Datasets), we can pretrain agents with offline RL to perform a task without any real-world interaction. Such pretrained policies can be evaluated directly or fine-tuned later on the real robot. 121 | 122 | #### Datasets 123 | The datasets can be downloaded from Hugging Face and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0), and RoboArm (RoboArm-v0) robots. 124 | 125 | ```bash 126 | git lfs install 127 | git clone git@hf.co:datasets/compsciencelab/BricksRL-Datasets 128 | ``` 129 | 130 | The datasets consist of TensorDicts containing expert and random transitions, which can be loaded directly into the replay buffer (see the sketch at the end of this section). When initiating (pre-)training, simply provide the path to the desired TensorDict when prompted to load the replay buffer. 131 | 132 | 133 | #### Pretrain an Agent 134 | 135 | Running an offline-training experiment is similar to online training, except that you run the **pretrain.py** script: 136 | 137 | ```bash 138 | python experiments/walker/pretrain.py 139 | ``` 140 | 141 | Trained policies can then be evaluated as before with: 142 | 143 | ```bash 144 | python experiments/walker/eval.py 145 | ``` 146 | 147 | Or run training to fine-tune the policy on the real robot: 148 | 149 | ```bash 150 | python experiments/walker/train.py 151 | ``` 152 | 153 | 154 |
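For reference, here is a minimal sketch of how a downloaded TensorDict could be loaded into a TorchRL replay buffer for offline (pre-)training. The file path and name are placeholders for whatever you cloned from the dataset repository; the provided experiment scripts already handle this loading for you when prompted.

```python
import torch
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

# Placeholder path: point this at a TensorDict file from the cloned dataset repo.
data = torch.load("BricksRL-Datasets/walker-v0/expert_transitions.pt")

buffer = TensorDictReplayBuffer(storage=LazyMemmapStorage(max_size=len(data)))
buffer.extend(data)         # add all precollected transitions at once
batch = buffer.sample(256)  # sample a batch for the offline agent's update step
```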
155 | 156 | ## Examples 157 | 158 | ### TorchRL and Custom Environment Examples 159 | 160 | Examples to use BricksRL environments with typical training scripts from [TorchRL's sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/). 161 | 162 | 163 | We also provide a template to create your own [custom BricksRL enviornment](examples/custom_env.py) which subsequently can be used directly in the TorchRL examples. 164 | 165 | For more information see the examples [readme](examples/README.md). 166 | 167 | 168 | ### High-Level Examples 169 | In the [example notebook](examples/example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** enviornment. 170 | The examples are based on the experiments for our paper. Stand alone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/torchrl_sac). 171 | 172 | ## Citation 173 | If you use BricksRL in your work, please refer to this BibTeX entry to cite it: 174 | 175 | ``` 176 | @article{dittert2024bricksrl, 177 | title={BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO}, 178 | author={Sebastian Dittert and Vincent Moens and Gianni De Fabritiis}, 179 | journal={arXiv preprint arXiv:2406.17490}, 180 | year={2024} 181 | } 182 | ``` -------------------------------------------------------------------------------- /bricksrl/Pybricks/PybricksHubClass.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import struct 3 | 4 | from bleak import BleakClient, BleakScanner 5 | 6 | 7 | class PybricksHub: 8 | """Class for connecting to a Pybricks Hub.""" 9 | 10 | PYBRICKS_COMMAND_EVENT_CHAR_UUID = "c5f50002-8280-46da-89f4-6d8051e4aeef" 11 | HUB_NAME = "Pybricks Hub" 12 | 13 | def __init__(self, out_format_str: str, state_dim: int): 14 | self.device = None 15 | self.client = None 16 | self.rx_queue = asyncio.Queue(maxsize=8) # LifoQueue 17 | self.loop = asyncio.get_event_loop() 18 | 19 | self.exception_out_data = struct.pack(out_format_str, *([0.0] * state_dim)) 20 | self.disconnected = False 21 | self.payload_buffer = None 22 | 23 | def connect(self) -> None: 24 | """Connect to the hub.""" 25 | print("Connecting to the hub...") 26 | self.loop.run_until_complete(self._connect()) 27 | 28 | async def _connect(self) -> None: 29 | """Connect to the hub.""" 30 | try: 31 | # Find the device and initialize client. 32 | self.device = await BleakScanner.find_device_by_name(self.HUB_NAME) 33 | self.client = BleakClient( 34 | self.device, disconnected_callback=self._handle_disconnect 35 | ) 36 | 37 | # Connect and get services 38 | print("Switch on the hub", flush=True) 39 | await self.client.connect() 40 | await self.client.start_notify( 41 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID, self._handle_rx 42 | ) 43 | 44 | # Tell user to start program on the hub. 45 | print("Start the program on the hub now with the button.", flush=True) 46 | await asyncio.sleep(5) 47 | 48 | except Exception as e: 49 | # Handle exceptions. 50 | print(e) 51 | await self.disconnect() 52 | 53 | def send(self, data: bytes) -> None: 54 | """Send data to the hub as bytes.""" 55 | self.loop.run_until_complete(self._send(data)) 56 | 57 | async def _send(self, data: bytes) -> None: 58 | try: 59 | # Send some data to the hub. 
60 | await self.client.write_gatt_char( 61 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID, 62 | b"\x06" + data, # Prepend "write stdin" command 63 | response=False, 64 | ) 65 | except Exception as e: 66 | # Handle exceptions. 67 | print(e) 68 | await self.disconnect() 69 | 70 | def disconnect(self) -> None: 71 | """ 72 | Disconnect from the hub. 73 | This method disconnects the hub from the client. 74 | """ 75 | if self.client and not self.disconnected: 76 | asyncio.create_task(self._disconnect()) 77 | 78 | async def _disconnect(self) -> None: 79 | try: 80 | # Disconnect when we are done. 81 | if self.client: 82 | await self.client.disconnect() 83 | except Exception as e: 84 | # Handle exceptions. 85 | print(e) 86 | finally: 87 | self.disconnected = True 88 | 89 | self.client = None 90 | self.device = None 91 | self.rx_char = None 92 | 93 | def _handle_disconnect(self, _) -> None: 94 | print("Hub was disconnected.") 95 | self.disconnect() 96 | 97 | async def _handle_rx(self, _, data: bytes) -> None: 98 | # add received data to the queue 99 | if data[0] == 0x01: # "write stdout" event (0x01) 100 | payload = data[1:] 101 | # print("Received:", payload) 102 | if ( 103 | len(payload) != len(self.exception_out_data) 104 | and self.payload_buffer is None 105 | ): 106 | self.payload_buffer = payload 107 | elif ( 108 | len(payload) != len(self.exception_out_data) 109 | and self.payload_buffer is not None 110 | ): 111 | self.payload_buffer += payload 112 | if self.payload_buffer.__len__() == len(self.exception_out_data): 113 | await self.rx_queue.put(self.payload_buffer) 114 | self.payload_buffer = None 115 | else: 116 | await self.rx_queue.put(payload) 117 | 118 | async def _read_data(self) -> bytes: 119 | try: 120 | # get data from the queue 121 | return await self.rx_queue.get() 122 | except asyncio.QueueEmpty: 123 | print("Queue is empty, returning zeros") 124 | return self.exception_out_data 125 | 126 | def read(self) -> bytes: 127 | """Read data from the hub and return it as a bytearray.""" 128 | return self.loop.run_until_complete(self._read_data()) 129 | 130 | def close(self) -> None: 131 | if not self.loop.is_closed(): 132 | self.loop.run_until_complete(self._disconnect()) 133 | -------------------------------------------------------------------------------- /bricksrl/__init__.py: -------------------------------------------------------------------------------- 1 | from bricksrl.environments.base.base_env import BaseEnv 2 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub 3 | -------------------------------------------------------------------------------- /bricksrl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchrl.envs import ( 3 | CatFrames, 4 | Compose, 5 | ObservationNorm, 6 | ToTensorImage, 7 | TransformedEnv, 8 | ) 9 | 10 | from bricksrl.environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 11 | from bricksrl.environments.roboarm_v0.RoboArmEnv import RoboArmEnv_v0 12 | from bricksrl.environments.roboarm_v0.RoboArmSim import RoboArmSimEnv_v0 13 | from bricksrl.environments.runaway_v0.RunAwayEnv import RunAwayEnv_v0 14 | from bricksrl.environments.spinning_v0.SpinningEnv import SpinningEnv_v0 15 | from bricksrl.environments.walker_v0.WalkerEnv import WalkerEnv_v0 16 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0 17 | 18 | VIDEO_LOGGING_ENVS = ["roboarm_mixed-v0", "walker_mixed-v0"] 19 | ALL_2WHEELER_ENVS = ["spinning-v0", "runaway-v0"] 20 | 
ALL_WALKER_ENVS = [ 21 | "walker-v0", 22 | "walker_sim-v0", 23 | ] 24 | ALL_ROBOARM_ENVS = [ 25 | "roboarm-v0", 26 | "roboarm_mixed-v0", 27 | "roboarm_sim-v0", 28 | ] 29 | ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS 30 | 31 | 32 | # TODO: maybe outsorce this to experiments/helper and not in bricksrl 33 | def make_env(config, pretrain=False): 34 | """ 35 | Creates a new environment based on the provided configuration. 36 | 37 | Args: 38 | config: A configuration object containing the environment name and maximum episode steps. 39 | pretrain: A boolean indicating whether the environment is for pretraining. 40 | 41 | Returns: 42 | A tuple containing the new environment, its action spec, and its state spec. 43 | """ 44 | env = make(name=config.env.name, env_conf=config.env, pretrain=pretrain) 45 | observation_keys = [key for key in env.observation_spec.keys()] 46 | 47 | transforms = [] 48 | if config.env.frame_stack > 1: 49 | transforms.append( 50 | CatFrames( 51 | N=config.env.frame_stack, 52 | in_keys=observation_keys, 53 | out_key=observation_keys, 54 | ) 55 | ) 56 | if config.env.action_filter < 1: 57 | raise NotImplementedError("ActionFilterWrapper not implemented yet") 58 | # TODO: add this to torchrl 59 | # env = ActionFilterWrapper( 60 | # env, current_action_influence=config.env.action_filter 61 | # ) 62 | normalize_keys = [key for key in observation_keys if key != "pixels"] 63 | obs_ranges = np.array(list(env.observation_ranges.values())) 64 | obs_mean = obs_ranges.mean(axis=-1) # mean of min and max 65 | obs_std = obs_ranges.std(axis=-1) # std of min and max 66 | transforms.append( 67 | ObservationNorm( 68 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True 69 | ) 70 | ) 71 | if "pixels" in observation_keys: 72 | transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True)) 73 | 74 | env = TransformedEnv(env, Compose(*transforms)) 75 | 76 | action_spec = env.action_spec 77 | state_spec = env.observation_spec 78 | 79 | return env, action_spec, state_spec 80 | 81 | 82 | def make(name="RunAway", env_conf=None, pretrain=False): 83 | if name == "runaway-v0": 84 | return RunAwayEnv_v0( 85 | max_episode_steps=env_conf.max_episode_steps, 86 | min_distance=env_conf.min_distance, 87 | verbose=env_conf.verbose, 88 | pretrain=pretrain, 89 | ) 90 | elif name == "spinning-v0": 91 | return SpinningEnv_v0( 92 | max_episode_steps=env_conf.max_episode_steps, 93 | sleep_time=env_conf.sleep_time, 94 | verbose=env_conf.verbose, 95 | pretrain=pretrain, 96 | ) 97 | elif name == "walker-v0": 98 | return WalkerEnv_v0( 99 | max_episode_steps=env_conf.max_episode_steps, 100 | verbose=env_conf.verbose, 101 | sleep_time=env_conf.sleep_time, 102 | pretrain=pretrain, 103 | ) 104 | elif name == "walker_sim-v0": 105 | return WalkerEnvSim_v0( 106 | max_episode_steps=env_conf.max_episode_steps, 107 | noise=env_conf.noise, 108 | low_action_angle=env_conf.low_action_angle, 109 | high_action_angle=env_conf.high_action_angle, 110 | verbose=env_conf.verbose, 111 | ) 112 | elif name == "roboarm-v0": 113 | return RoboArmEnv_v0( 114 | max_episode_steps=env_conf.max_episode_steps, 115 | verbose=env_conf.verbose, 116 | sleep_time=env_conf.sleep_time, 117 | reward_signal=env_conf.reward_signal, 118 | pretrain=pretrain, 119 | ) 120 | elif name == "roboarm_sim-v0": 121 | return RoboArmSimEnv_v0( 122 | max_episode_steps=env_conf.max_episode_steps, 123 | verbose=env_conf.verbose, 124 | noise=env_conf.noise, 125 | reward_signal=env_conf.reward_signal, 126 | ) 127 | elif name == 
"roboarm_mixed-v0": 128 | return RoboArmMixedEnv_v0( 129 | max_episode_steps=env_conf.max_episode_steps, 130 | sleep_time=env_conf.sleep_time, 131 | verbose=env_conf.verbose, 132 | reward_signal=env_conf.reward_signal, 133 | camera_id=env_conf.camera_id, 134 | goal_radius=env_conf.goal_radius, 135 | pretrain=pretrain, 136 | ) 137 | else: 138 | print("Environment not found") 139 | -------------------------------------------------------------------------------- /bricksrl/environments/base/base_env.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | 4 | import numpy as np 5 | 6 | import torch 7 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class BaseEnv(EnvBase): 13 | """ 14 | The base class for reinforcement learning environments used with the Lego robots. 15 | 16 | Args: 17 | action_dim (int): The dimensionality of the action space. 18 | state_dim (int): The dimensionality of the state space. 19 | use_hub (bool): Whether to use the Pybricks hub for communication, if False, only 20 | the observation spec and action specs are created and can be used. 21 | Can be helpful for testing and debugging as you dont connect to the hub. 22 | verbose (bool): Whether to print verbose output. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | action_dim: int, 28 | state_dim: int, 29 | use_hub: bool = True, 30 | verbose: bool = False, 31 | ): 32 | self.verbose = verbose 33 | self.action_dim = action_dim 34 | self.state_dim = state_dim 35 | 36 | self.action_format_str = "!" + "f" * self.action_dim 37 | self.state_format_str = "!" + "f" * self.state_dim 38 | 39 | self.expected_bytesize = struct.calcsize(self.state_format_str) 40 | 41 | # buffer state in case of missing data 42 | self.buffered_state = np.zeros(self.state_dim, dtype=np.float32) 43 | 44 | if use_hub: 45 | self.hub = PybricksHub( 46 | state_dim=state_dim, out_format_str=self.state_format_str 47 | ) 48 | self.hub.connect() 49 | print("Connected to hub.") 50 | else: 51 | self.hub = None 52 | super().__init__(batch_size=torch.Size([1])) 53 | 54 | def send_to_hub(self, action: np.array) -> None: 55 | """ 56 | Sends the given action to the hub as bytes. 57 | 58 | Args: 59 | action (np.array): The action to send to the hub as a numpy array. 60 | 61 | Raises: 62 | AssertionError: If the shape of the action does not match the action dimension. 63 | """ 64 | assert ( 65 | action.shape[0] == self.action_dim 66 | ), "Action shape does not match action dimension." 67 | byte_action = struct.pack(self.action_format_str, *action) 68 | if self.verbose: 69 | print("Sending data size: ", len(byte_action)) 70 | print("Sending data: ", byte_action) 71 | self.hub.send(byte_action) 72 | 73 | def read_from_hub(self) -> np.array: 74 | """ 75 | Reads the current state of the environment from the hub and returns it as a numpy array. 76 | 77 | Returns: 78 | np.array: The current state of the environment as a numpy array. 
79 | """ 80 | byte_state = self.hub.read() 81 | if self.verbose: 82 | print("Reading data size: ", sys.getsizeof(byte_state)) 83 | print("Reading data: ", byte_state) 84 | print("len: ", len(byte_state)) 85 | 86 | if len(byte_state) != self.expected_bytesize: 87 | print( 88 | "State has size {} but should have size {}.".format( 89 | len(byte_state), struct.calcsize(self.state_format_str) 90 | ) 91 | ) 92 | print("Returning previous state.") 93 | state = self.buffered_state 94 | print("State: ", state) 95 | else: 96 | state = np.array([struct.unpack(self.state_format_str, byte_state)]) 97 | self.buffered_state = state 98 | assert ( 99 | state.shape[1] == self.state_dim 100 | ), f"State has shape {state.shape[0]} and does not match state dimension: {self.state_dim}." 101 | return state 102 | 103 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase: 104 | """ 105 | Sample a random action from the action space. 106 | 107 | Returns: 108 | TensorDictBase: A dictionary containing the sampled action. 109 | """ 110 | if tensordict is not None: 111 | tensordict.set("action", self.action_spec.rand()) 112 | return tensordict 113 | else: 114 | return TensorDict({"action": self.action_spec.rand()}, []) 115 | 116 | def close(self) -> None: 117 | if self.hub is not None: 118 | self.hub.close() 119 | 120 | def _step( 121 | self, 122 | ): 123 | raise NotImplementedError 124 | 125 | def _reset( 126 | self, 127 | ): 128 | raise NotImplementedError 129 | 130 | def _set_seed(self, seed: int): 131 | np.random.seed(seed) 132 | torch.manual_seed(seed) 133 | 134 | 135 | class BaseSimEnv(EnvBase): 136 | """ 137 | The base class for reinforcement learning environments used to simulate Lego robots. 138 | 139 | Args: 140 | action_dim (int): The dimensionality of the action space. 141 | state_dim (int): The dimensionality of the state space. 142 | verbose (bool): Whether to print verbose output. 143 | use_hub (bool): This argument is kept for compatibility but is not used in the simulation environment. 144 | """ 145 | 146 | def __init__( 147 | self, 148 | action_dim: int, 149 | state_dim: int, 150 | verbose: bool = False, 151 | use_hub: bool = False, 152 | ): 153 | self.verbose = verbose 154 | self.action_dim = action_dim 155 | self.state_dim = state_dim 156 | 157 | super().__init__(batch_size=torch.Size([1])) 158 | 159 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase: 160 | """ 161 | Sample a random action from the action space. 162 | 163 | Returns: 164 | TensorDictBase: A dictionary containing the sampled action. 165 | """ 166 | if tensordict is not None: 167 | tensordict.set("action", self.action_spec.rand()) 168 | return tensordict 169 | else: 170 | return TensorDict({"action": self.action_spec.rand()}, []) 171 | 172 | def _step( 173 | self, 174 | ): 175 | raise NotImplementedError 176 | 177 | def _reset( 178 | self, 179 | ): 180 | raise NotImplementedError 181 | 182 | def _set_seed(self, seed: int): 183 | """ 184 | Sets the seed for the environment's random number generator. 185 | 186 | Args: 187 | seed (int): The seed to set. 
188 | """ 189 | np.random.seed(seed) 190 | torch.manual_seed(seed) 191 | -------------------------------------------------------------------------------- /bricksrl/environments/dummy/mixed_obs_dummy.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class MixedObsDummyEnv(EnvBase): 13 | """ 14 | MixedObsDummyEnv is a dummy environment for testing purposes. 15 | It does not connec to Pybricks 16 | 17 | """ 18 | 19 | action_dim = 4 20 | state_dim = 7 21 | observation_key = "observation" 22 | pixel_observation_key = "pixels" 23 | 24 | def __init__(self, max_episode_steps=10, img_shape=(64, 64, 3)): 25 | self.max_episode_steps = max_episode_steps 26 | self._batch_size = torch.Size([1]) 27 | self.action_spec = BoundedTensorSpec( 28 | low=-torch.ones((1, self.action_dim)), 29 | high=torch.ones((1, self.action_dim)), 30 | shape=(1, self.action_dim), 31 | ) 32 | 33 | observation_spec = BoundedTensorSpec( 34 | low=-torch.ones((1, self.state_dim)), 35 | high=torch.ones((1, self.state_dim)), 36 | ) 37 | 38 | pixel_observation_spec = BoundedTensorSpec( 39 | low=torch.zeros((1,) + img_shape, dtype=torch.uint8), 40 | high=torch.ones((1,) + img_shape, dtype=torch.uint8) * 255, 41 | ) 42 | 43 | self.observation_spec = CompositeSpec(shape=(1,)) 44 | self.observation_spec.set(self.observation_key, observation_spec) 45 | self.observation_spec.set(self.pixel_observation_key, pixel_observation_spec) 46 | super().__init__(batch_size=self._batch_size) 47 | 48 | def _set_seed(self, seed: int): 49 | return super()._set_seed(seed) 50 | 51 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 52 | """ 53 | Reset the environment and return the initial state. 54 | 55 | Returns: 56 | TensorDictBase: The initial state of the environment. 
57 | """ 58 | # TODO solve this fake action sending before to receive first state 59 | self.episode_step_iter = 0 60 | observation = self.observation_spec[self.observation_key].rand() 61 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand() 62 | return TensorDict( 63 | { 64 | self.observation_key: observation.float(), 65 | self.pixel_observation_key: pixel_observation, 66 | }, 67 | batch_size=[1], 68 | ) 69 | 70 | def reward( 71 | self, 72 | action: np.ndarray, 73 | next_state: np.ndarray, 74 | ) -> Tuple[float, bool]: 75 | """ """ 76 | return 0.0, False 77 | 78 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 79 | """ """ 80 | action = tensordict.get("action").cpu().numpy() 81 | observation = self.observation_spec[self.observation_key].rand() 82 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand() 83 | 84 | reward, done = self.reward( 85 | action=action, 86 | next_state=observation, 87 | ) 88 | next_tensordict = TensorDict( 89 | { 90 | self.observation_key: observation.float(), 91 | self.pixel_observation_key: pixel_observation, 92 | "reward": torch.tensor([reward]).float(), 93 | "done": torch.tensor([done]).bool(), 94 | }, 95 | batch_size=[1], 96 | ) 97 | 98 | # increment episode step counter 99 | self.episode_step_iter += 1 100 | if self.episode_step_iter >= self.max_episode_steps: 101 | next_tensordict.set("done", torch.tensor([True])) 102 | return next_tensordict 103 | -------------------------------------------------------------------------------- /bricksrl/environments/dummy/vec_obs_dummy.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class VecObsDummyEnv(EnvBase): 13 | """ 14 | VecObsDummyEnv is a dummy environment for testing purposes. 15 | It does not connec to Pybricks 16 | 17 | """ 18 | 19 | action_dim = 4 20 | state_dim = 7 21 | observation_key = "observation" 22 | 23 | def __init__(self, max_episode_steps=10): 24 | self.max_episode_steps = max_episode_steps 25 | self._batch_size = torch.Size([1]) 26 | self.action_spec = BoundedTensorSpec( 27 | low=-torch.ones((1, self.action_dim)), 28 | high=torch.ones((1, self.action_dim)), 29 | shape=(1, self.action_dim), 30 | ) 31 | 32 | observation_spec = BoundedTensorSpec( 33 | low=-torch.ones((1, self.state_dim)), 34 | high=torch.ones((1, self.state_dim)), 35 | ) 36 | 37 | self.observation_spec = CompositeSpec(shape=(1,)) 38 | self.observation_spec.set(self.observation_key, observation_spec) 39 | super().__init__(batch_size=self._batch_size) 40 | 41 | def _set_seed(self, seed: int): 42 | return super()._set_seed(seed) 43 | 44 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 45 | """ 46 | Reset the environment and return the initial state. 47 | 48 | Returns: 49 | TensorDictBase: The initial state of the environment. 
50 | """ 51 | # TODO solve this fake action sending before to receive first state 52 | self.episode_step_iter = 0 53 | observation = self.observation_spec[self.observation_key].rand() 54 | return TensorDict( 55 | { 56 | self.observation_key: observation.float(), 57 | }, 58 | batch_size=[1], 59 | ) 60 | 61 | def reward( 62 | self, 63 | action: np.ndarray, 64 | next_state: np.ndarray, 65 | ) -> Tuple[float, bool]: 66 | """ """ 67 | return 0.0, False 68 | 69 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 70 | """ """ 71 | action = tensordict.get("action").cpu().numpy() 72 | next_observation = self.observation_spec[self.observation_key].rand() 73 | 74 | reward, done = self.reward( 75 | action=action, 76 | next_state=next_observation, 77 | ) 78 | next_tensordict = TensorDict( 79 | { 80 | self.observation_key: next_observation.float(), 81 | "reward": torch.tensor([reward]).float(), 82 | "done": torch.tensor([done]).bool(), 83 | }, 84 | batch_size=[1], 85 | ) 86 | 87 | # increment episode step counter 88 | self.episode_step_iter += 1 89 | if self.episode_step_iter >= self.max_episode_steps: 90 | next_tensordict.set("done", torch.tensor([True])) 91 | return next_tensordict 92 | 93 | 94 | class VecGoalObsDummyEnv(EnvBase): 95 | """ 96 | VecGoalObsDummyEnv is a dummy environment for testing purposes. 97 | It does not connec to Pybricks 98 | 99 | """ 100 | 101 | action_dim = 4 102 | state_dim = 7 103 | observation_key = "observation" 104 | goal_observation_key = "goal_observation" 105 | 106 | def __init__(self, max_episode_steps=10): 107 | self.max_episode_steps = max_episode_steps 108 | self._batch_size = torch.Size([1]) 109 | self.action_spec = BoundedTensorSpec( 110 | low=-torch.ones((1, self.action_dim)), 111 | high=torch.ones((1, self.action_dim)), 112 | shape=(1, self.action_dim), 113 | ) 114 | 115 | observation_spec = BoundedTensorSpec( 116 | low=-torch.ones((1, self.state_dim)), 117 | high=torch.ones((1, self.state_dim)), 118 | ) 119 | 120 | self.observation_spec = CompositeSpec(shape=(1,)) 121 | self.observation_spec.set(self.observation_key, observation_spec) 122 | self.observation_spec.set(self.goal_observation_key, observation_spec) 123 | super().__init__(batch_size=self._batch_size) 124 | 125 | def _set_seed(self, seed: int): 126 | return super()._set_seed(seed) 127 | 128 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 129 | """ 130 | Reset the environment and return the initial state. 131 | 132 | Returns: 133 | TensorDictBase: The initial state of the environment. 
134 | """ 135 | # TODO solve this fake action sending before to receive first state 136 | self.episode_step_iter = 0 137 | observation = self.observation_spec[self.observation_key].rand() 138 | goal_observation = self.observation_spec[self.goal_observation_key].rand() 139 | return TensorDict( 140 | { 141 | self.observation_key: observation.float(), 142 | self.goal_observation_key: goal_observation.float(), 143 | }, 144 | batch_size=[1], 145 | ) 146 | 147 | def reward( 148 | self, 149 | action: np.ndarray, 150 | next_state: np.ndarray, 151 | ) -> Tuple[float, bool]: 152 | """ """ 153 | return 0.0, False 154 | 155 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 156 | """ """ 157 | action = tensordict.get("action").cpu().numpy() 158 | next_observation = self.observation_spec[self.observation_key].rand() 159 | goal = tensordict.get(self.goal_observation_key) 160 | 161 | reward, done = self.reward( 162 | action=action, 163 | next_state=next_observation, 164 | ) 165 | next_tensordict = TensorDict( 166 | { 167 | self.observation_key: next_observation.float(), 168 | self.goal_observation_key: goal.float(), 169 | "reward": torch.tensor([reward]).float(), 170 | "done": torch.tensor([done]).bool(), 171 | }, 172 | batch_size=[1], 173 | ) 174 | 175 | # increment episode step counter 176 | self.episode_step_iter += 1 177 | if self.episode_step_iter >= self.max_episode_steps: 178 | next_tensordict.set("done", torch.tensor([True])) 179 | return next_tensordict 180 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_mixed_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | hub = InventorHub() 12 | 13 | # Initialize and set the motors 14 | high_motor_range = (-150, 10) 15 | high_motor = Motor(Port.A) 16 | high_motor.run_target(speed=400, target_angle=-70) 17 | 18 | low_motor_range = (10, 75) 19 | low_motor = Motor(Port.D) 20 | low_motor.control.limits(500, 1000, 900) 21 | low_motor.run_target(speed=200, target_angle=40) 22 | 23 | rotation_motor_range = (-140, 40) 24 | rotation_motor = Motor(Port.B, gears=[20, 60]) 25 | motors = {"HM": high_motor, "LM": low_motor, "RM": rotation_motor} 26 | 27 | 28 | def get_current_motor_angles(): 29 | angles = {} 30 | for k, v in motors.items(): 31 | angle = normalize_angle(get_angle(v)) 32 | angles.update({k: angle}) 33 | return angles 34 | 35 | 36 | def run_angle(motor, angle, speed=300): 37 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False) 38 | 39 | 40 | def get_angle(motor): 41 | return motor.angle() 42 | 43 | 44 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 45 | # Normalize angle to be within -179 to 179 degrees 46 | while angle <= low_angle: 47 | angle += original_one_round 48 | while angle > high_angle: 49 | angle -= original_one_round 50 | return angle 51 | 52 | 53 | def transform_range(value, old_min, old_max, new_min, new_max): 54 | """ 55 | Transform a value from one range to another. 56 | 57 | Parameters: 58 | value (float): The value to transform. 59 | old_min (float): The minimum value of the old range. 60 | old_max (float): The maximum value of the old range. 
61 | new_min (float): The minimum value of the new range. 62 | new_max (float): The maximum value of the new range. 63 | 64 | Returns: 65 | float: The transformed value. 66 | """ 67 | # Compute the scale factor between the old and new ranges 68 | scale = (new_max - new_min) / (old_max - old_min) 69 | # Apply the transformation 70 | return new_min + (value - old_min) * scale 71 | 72 | 73 | keyboard = poll() 74 | keyboard.register(stdin) 75 | motor_speed = 250 76 | 77 | while True: 78 | 79 | while not keyboard.poll(0): 80 | wait(1) 81 | 82 | # Read action values for the motors 83 | data = stdin.buffer.read(12) 84 | rotation_action, low_action, high_action = ustruct.unpack("!fff", data) 85 | 86 | # Transform action range for motors 87 | high_action = transform_range(high_action, -1, 1, -60, 60) 88 | low_action = transform_range(low_action, -1, 1, -30, 30) 89 | rotation_action = transform_range(rotation_action, -1, 1, -90, 90) 90 | 91 | angles = get_current_motor_angles() 92 | 93 | # Adjust high action to ensure it stays within range after being applied 94 | if angles["HM"] + high_action > max(high_motor_range): 95 | high_action = max(high_motor_range) - angles["HM"] 96 | elif angles["HM"] + high_action < min(high_motor_range): 97 | high_action = min(high_motor_range) - angles["HM"] 98 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False) 99 | 100 | # Adjust low action to ensure it stays within range after being applied 101 | if angles["LM"] + low_action > max(low_motor_range): 102 | low_action = max(low_motor_range) - angles["LM"] 103 | elif angles["LM"] + low_action < min(low_motor_range): 104 | low_action = min(low_motor_range) - angles["LM"] 105 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False) 106 | 107 | # Adjust rotation action to ensure it stays within range after being applied 108 | if angles["RM"] + rotation_action > max(rotation_motor_range): 109 | rotation_action = max(rotation_motor_range) - angles["RM"] 110 | elif angles["RM"] + rotation_action < min(rotation_motor_range): 111 | rotation_action = min(rotation_motor_range) - angles["RM"] 112 | rotation_motor.control.limits(250, 200, 500) 113 | rotation_motor.run_angle( 114 | speed=motor_speed, rotation_angle=rotation_action, wait=False 115 | ) 116 | 117 | # Small delay to let motors arrive target angle 118 | wait(250) 119 | 120 | # Sometimes low angle jumps out of range and cant move back this corrects those cases 121 | if low_angle < 10: 122 | low_motor.run_target(speed=200, target_angle=10) 123 | 124 | # Read sensors to get current state of the robot 125 | high_angle = high_motor.angle() 126 | low_angle = low_motor.angle() 127 | rotation_angle = rotation_motor.angle() 128 | 129 | # Send current state back to environment 130 | out_msg = ustruct.pack( 131 | "!fff", 132 | high_angle, 133 | low_angle, 134 | rotation_angle, 135 | ) 136 | stdout.buffer.write(out_msg) 137 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_v0/RoboArmEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env import BaseEnv 8 | from numpy import linalg 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 11 | 12 | 13 | class RoboArmEnv_v0(BaseEnv): 14 | """ """ 15 | 16 | action_dim = 4 # 
(Grab_motor_action, high_motor_action, low_motor_action, rotation_motor_action) 17 | 18 | state_dim = 4 # (GM, HM, LM, RM) 19 | 20 | observation_ranges = { 21 | "GM": (-148, -44), 22 | "HM": (-150, 10), 23 | "LM": (10, 70), 24 | "RM": (-180, 179), 25 | } 26 | 27 | observation_key = "observation" 28 | goal_observation_key = "goal_observation" 29 | 30 | def __init__( 31 | self, 32 | max_episode_steps: int = 50, 33 | sleep_time: float = 0.0, 34 | verbose: bool = False, 35 | pretrain: bool = False, 36 | reward_signal: str = "dense", 37 | ): 38 | self.sleep_time = sleep_time 39 | 40 | assert reward_signal in [ 41 | "dense", 42 | "sparse", 43 | ], "Reward signal must be dense or sparse." 44 | self.reward_signal = reward_signal 45 | self.max_episode_steps = max_episode_steps 46 | self._batch_size = torch.Size([1]) 47 | 48 | # Define action spec 49 | self.action_spec = BoundedTensorSpec( 50 | low=-1, 51 | high=1, 52 | shape=(1, self.action_dim), 53 | ) 54 | 55 | self.goal_thresholds = np.array( 56 | [50] 57 | ) # everythin below 20 is very good. 50 is good! 58 | # Observation 4 motors (GM, HM, LM, RM) + goal positions (GGM, GHM, GLM, GRM) 59 | # Define observation spec 60 | bounds = torch.tensor( 61 | [ 62 | self.observation_ranges["GM"], 63 | self.observation_ranges["HM"], 64 | self.observation_ranges["LM"], 65 | self.observation_ranges["RM"], 66 | ] 67 | ) 68 | 69 | low_bounds = bounds[:, 0].unsqueeze(0) 70 | high_bounds = bounds[:, 1].unsqueeze(0) 71 | 72 | observation_spec = BoundedTensorSpec( 73 | low=low_bounds, 74 | high=high_bounds, 75 | ) 76 | 77 | self.observation_spec = CompositeSpec(shape=(1,)) 78 | self.observation_spec.set(self.observation_key, observation_spec) 79 | self.observation_spec.set(self.goal_observation_key, observation_spec) 80 | super().__init__( 81 | action_dim=self.action_dim, 82 | state_dim=self.state_dim, 83 | verbose=verbose, 84 | use_hub=1 - pretrain, 85 | ) 86 | 87 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 88 | """ 89 | Reset the environment and return the initial state. 90 | 91 | Returns: 92 | TensorDictBase: The initial state of the environment. 93 | """ 94 | # TODO solve this fake action sending before to receive first state 95 | self.episode_step_iter = 0 96 | if tensordict is not None: 97 | action = tensordict.get("action").cpu().numpy().squeeze() 98 | else: 99 | action = np.zeros(self.action_dim) 100 | self.send_to_hub(action) 101 | time.sleep(self.sleep_time) 102 | observation = self.read_from_hub() 103 | # sample random goal state 104 | self.goal_observation = ( 105 | self.observation_spec[self.goal_observation_key].rand().numpy() 106 | ) 107 | 108 | return TensorDict( 109 | { 110 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 111 | self.goal_observation_key: torch.tensor( 112 | self.goal_observation, dtype=torch.float32 113 | ), 114 | "error": torch.tensor([0]).float(), 115 | }, 116 | batch_size=[1], 117 | ) 118 | 119 | @staticmethod 120 | def shortest_angular_distance_vectorized( 121 | theta_goal: np.array, theta_current: np.array 122 | ) -> float: 123 | """ 124 | Calculate the shortest angular distance between two arrays of angles. 125 | 126 | Parameters: 127 | - theta_goal: Array of goal angles in degrees. 128 | - theta_current: Array of current angles in degrees. 129 | 130 | Returns: 131 | - Array of the shortest angular distances in degrees. 
132 | """ 133 | 134 | # Convert angles from degrees to radians 135 | theta_goal_rad = np.radians(theta_goal) 136 | theta_current_rad = np.radians(theta_current) 137 | 138 | # Calculate difference in radians using np.arctan2 for vectorized operation 139 | delta_theta_rad = np.arctan2( 140 | np.sin(theta_goal_rad - theta_current_rad), 141 | np.cos(theta_goal_rad - theta_current_rad), 142 | ) 143 | 144 | # Convert result back to degrees 145 | delta_theta_deg = np.degrees(delta_theta_rad) 146 | 147 | return delta_theta_deg 148 | 149 | def reward( 150 | self, 151 | achieved_state: np.array, 152 | ) -> Tuple[float, bool]: 153 | """Reward function of roboarm. 154 | 155 | Args: 156 | achieved_state (np.ndarray): The achieved state. 157 | goal_state (np.ndarray): The goal state. 158 | 159 | Returns: 160 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 161 | """ 162 | 163 | done = False 164 | if self.reward_signal == "dense": 165 | angle_deltas = self.shortest_angular_distance_vectorized( 166 | self.goal_observation, achieved_state 167 | ) 168 | error = np.sum(np.abs(angle_deltas)) 169 | reward = -error / 100 170 | if error < np.mean(self.goal_thresholds): 171 | done = True 172 | elif self.reward_signal == "sparse": 173 | angle_deltas = self.shortest_angular_distance_vectorized( 174 | self.goal_observation, achieved_state 175 | ) 176 | error = np.sum(np.abs(angle_deltas)) 177 | if np.all(error <= self.goal_thresholds): 178 | reward = 1 179 | done = True 180 | 181 | else: 182 | reward = 0 183 | else: 184 | raise ValueError("Reward signal must be dense or sparse.") 185 | 186 | return reward, done, error 187 | 188 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 189 | """ """ 190 | # Send action to hub to receive next state 191 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) 192 | time.sleep( 193 | self.sleep_time 194 | ) # we need to wait some time for sensors to read and to 195 | 196 | # receive the next state 197 | next_observation = self.read_from_hub() 198 | 199 | # calc reward and done 200 | reward, done, error = self.reward( 201 | achieved_state=next_observation, 202 | ) 203 | 204 | next_tensordict = TensorDict( 205 | { 206 | self.observation_key: torch.tensor( 207 | next_observation, dtype=torch.float32 208 | ), 209 | self.goal_observation_key: torch.tensor( 210 | self.goal_observation, dtype=torch.float32 211 | ), 212 | "reward": torch.tensor([reward]).float(), 213 | "done": torch.tensor([done]).bool(), 214 | "error": torch.tensor([error]).float(), 215 | }, 216 | batch_size=[1], 217 | ) 218 | 219 | # increment episode step counter 220 | self.episode_step_iter += 1 221 | if self.episode_step_iter >= self.max_episode_steps: 222 | next_tensordict.set("done", torch.tensor([True])) 223 | return next_tensordict 224 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | 12 | hub = InventorHub() 13 | 14 | # Initialize and set the motors 15 | grab_motor_range = (-148, -45) 16 | grab_motor = Motor(Port.E) 17 | grab_motor.run_target(speed=400, target_angle=-95) # start 
roughly in the middle 18 | 19 | high_motor_range = (-150, 10) 20 | high_motor = Motor(Port.A) 21 | high_motor.run_target(speed=400, target_angle=-70) 22 | 23 | low_motor_range = (10, 70) 24 | low_motor = Motor(Port.D) 25 | low_motor.control.limits(500, 1000, 900) 26 | low_motor.run_target(speed=400, target_angle=40) 27 | 28 | rotation_motor = Motor(Port.B, gears=[20, 60]) 29 | 30 | motors = {"GM": grab_motor, "HM": high_motor, "LM": low_motor, "RM": rotation_motor} 31 | 32 | 33 | def get_current_motor_angles(): 34 | angles = {} 35 | for k, v in motors.items(): 36 | angle = normalize_angle(get_angle(v)) 37 | angles.update({k: angle}) 38 | return angles 39 | 40 | 41 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 42 | # Normalize angle to be within -179 to 179 degrees 43 | while angle <= low_angle: 44 | angle += original_one_round 45 | while angle > high_angle: 46 | angle -= original_one_round 47 | return angle 48 | 49 | 50 | def run_angle(motor, angle, speed=300): 51 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False) 52 | 53 | 54 | def get_angle(motor): 55 | return motor.angle() 56 | 57 | 58 | def transform_range(value, old_min, old_max, new_min, new_max): 59 | """ 60 | Transform a value from one range to another. 61 | 62 | Parameters: 63 | value (float): The value to transform. 64 | old_min (float): The minimum value of the old range. 65 | old_max (float): The maximum value of the old range. 66 | new_min (float): The minimum value of the new range. 67 | new_max (float): The maximum value of the new range. 68 | 69 | Returns: 70 | float: The transformed value. 71 | """ 72 | # Compute the scale factor between the old and new ranges 73 | scale = (new_max - new_min) / (old_max - old_min) 74 | # Apply the transformation 75 | return new_min + (value - old_min) * scale 76 | 77 | 78 | keyboard = poll() 79 | keyboard.register(stdin) 80 | motor_speed = 250 81 | 82 | while True: 83 | 84 | while not keyboard.poll(0): 85 | wait(1) 86 | 87 | # Read action values for the motors 88 | data = stdin.buffer.read(16) # Reading 4 bytes (4 floats) 89 | rotation_action, low_action, high_action, grab_action = ustruct.unpack( 90 | "!ffff", data 91 | ) 92 | 93 | # Transform action range for motors 94 | grab_action = transform_range(grab_action, -1, 1, -25, 25) 95 | high_action = transform_range(high_action, -1, 1, -60, 60) 96 | low_action = transform_range(low_action, -1, 1, -30, 30) 97 | rotation_action = transform_range(rotation_action, -1, 1, -100, 100) 98 | 99 | angles = get_current_motor_angles() 100 | 101 | # Adjust grab action to ensure it stays within range after being applied 102 | if angles["GM"] + grab_action > max(grab_motor_range): 103 | grab_action = max(grab_motor_range) - angles["GM"] 104 | elif angles["GM"] + grab_action < min(grab_motor_range): 105 | grab_action = min(grab_motor_range) - angles["GM"] 106 | grab_motor.run_angle(speed=motor_speed, rotation_angle=grab_action, wait=False) 107 | 108 | # Adjust high action to ensure it stays within range after being applied 109 | if angles["HM"] + high_action > max(high_motor_range): 110 | high_action = max(high_motor_range) - angles["HM"] 111 | elif angles["HM"] + high_action < min(high_motor_range): 112 | high_action = min(high_motor_range) - angles["HM"] 113 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False) 114 | 115 | # Adjust low action to ensure it stays within range after being applied 116 | if angles["LM"] + low_action > max(low_motor_range): 117 | low_action = 
max(low_motor_range) - angles["LM"] 118 | elif angles["LM"] + low_action < min(low_motor_range): 119 | low_action = min(low_motor_range) - angles["LM"] 120 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False) 121 | rotation_motor.run_angle( 122 | speed=motor_speed, rotation_angle=rotation_action, wait=False 123 | ) 124 | 125 | # Small delay to let motors arrive target angle 126 | wait(250) 127 | 128 | # Sometimes low angle jumps out of range and cant move back this corrects those cases 129 | if low_angle < 10: 130 | low_motor.run_target(speed=200, target_angle=10) 131 | 132 | # Read sensors to get current state of the robot 133 | rotation_angle = rotation_motor.angle() 134 | high_angle = high_motor.angle() 135 | grab_angle = grab_motor.angle() 136 | low_angle = low_motor.angle() 137 | 138 | # Send current state back to environment 139 | out_msg = ustruct.pack( 140 | "!ffff", grab_angle, high_angle, low_angle, normalize_angle(rotation_angle) 141 | ) 142 | stdout.buffer.write(out_msg) 143 | -------------------------------------------------------------------------------- /bricksrl/environments/runaway_v0/RunAwayEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | from bricksrl.environments.base.base_env import BaseEnv 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | 10 | 11 | class RunAwayEnv_v0(BaseEnv): 12 | """ 13 | A reinforcement learning environment for training agents to get away from a wall. 14 | 15 | The goal of the agent is to increase the distance measured by an ultrasonic sensor and get away from the wall as fast as possible. 16 | The environment provides a state consisting of 4 sensor readings (left, right, pitch, roll) and the distance to the wall. 17 | The agent can take a continuous action in the range [-1, 1] to control the movement of the robot. 18 | The environment returns a reward based on the change in distance to the wall and terminates the episode if the robot gets too close to the wall or the maximum number of steps is reached. 19 | 20 | Args: 21 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 10. 22 | min_distance (float): The minimum distance to the wall. Defaults to 40. 23 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2. 24 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False. 
25 | 26 | """ 27 | 28 | action_dim = 1 # control the wheel motors together 29 | # 5 sensors (left motor angle, right motor angle, pitch, roll, distance) 30 | state_dim = 5 31 | 32 | observation_ranges = { 33 | "left_motor_angles": [0, 360], 34 | "right_motor_angles": [0, 360], 35 | "roll_angle": [-90, 90], 36 | "pitch_angle": [-90, 90], 37 | "distance": [0, 2000], 38 | } 39 | 40 | observation_key = "observation" 41 | 42 | def __init__( 43 | self, 44 | max_episode_steps: int = 10, 45 | min_distance: float = 40, 46 | sleep_time: float = 0.2, 47 | verbose: bool = False, 48 | pretrain: bool = False, 49 | ): 50 | self.sleep_time = sleep_time 51 | self.min_distance = min_distance 52 | self.max_episode_steps = max_episode_steps 53 | self._batch_size = torch.Size([1]) 54 | 55 | # Define action spec 56 | self.action_spec = BoundedTensorSpec( 57 | low=-1, 58 | high=1, 59 | shape=(1, self.action_dim), 60 | ) 61 | 62 | # Define observation spec 63 | bounds = torch.tensor( 64 | [ 65 | self.observation_ranges["left_motor_angles"], 66 | self.observation_ranges["right_motor_angles"], 67 | self.observation_ranges["roll_angle"], 68 | self.observation_ranges["pitch_angle"], 69 | self.observation_ranges["distance"], 70 | ] 71 | ) 72 | 73 | low_bounds = bounds[:, 0].unsqueeze(0) 74 | high_bounds = bounds[:, 1].unsqueeze(0) 75 | 76 | observation_spec = BoundedTensorSpec( 77 | low=low_bounds, 78 | high=high_bounds, 79 | ) 80 | self.observation_spec = CompositeSpec( 81 | {self.observation_key: observation_spec}, shape=(1,) 82 | ) 83 | self.verbose = verbose 84 | super().__init__( 85 | action_dim=self.action_dim, 86 | state_dim=self.state_dim, 87 | verbose=verbose, 88 | use_hub=1 - pretrain, 89 | ) 90 | 91 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 92 | """ 93 | Reset the environment and return the initial state. 94 | 95 | Returns: 96 | TensorDictBase: The initial state of the environment. 97 | """ 98 | # TODO solve this fake action sending before to receive first state 99 | self.episode_step_iter = 0 100 | if tensordict is not None: 101 | action = tensordict.get("action").cpu().numpy().squeeze(0) 102 | else: 103 | action = np.zeros(self.action_dim) 104 | self.send_to_hub(action) 105 | time.sleep(self.sleep_time) 106 | observation = self.read_from_hub() 107 | self.distance = observation[:, -1] 108 | return TensorDict( 109 | { 110 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 111 | "distance": torch.tensor([self.distance]).float(), 112 | }, 113 | batch_size=[1], 114 | ) 115 | 116 | def reward(self, next_observation: np.array) -> Tuple[float, bool]: 117 | """Reward function of RunAwayEnv. 118 | 119 | Goal: Increase distance measured by ultrasonic sensor aka. 120 | get away from the wall as fast as possible. 
121 | 122 | """ 123 | done = False 124 | 125 | current_distance = next_observation[:, -1] 126 | if current_distance <= self.min_distance: # too close to the wall break episode 127 | done = True 128 | reward = 0.0 129 | elif current_distance < self.distance: 130 | reward = -1.0 131 | elif current_distance > self.distance: 132 | reward = 1.0 133 | else: 134 | reward = 0.0 135 | if self.distance >= 2000: 136 | done = True 137 | self.distance = current_distance 138 | return reward, done 139 | 140 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 141 | """ """ 142 | # Send action to hub to receive next state 143 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze(0)) 144 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 145 | 146 | # receive the next state 147 | next_observation = self.read_from_hub() 148 | 149 | # calc reward and done 150 | reward, done = self.reward( 151 | next_observation=next_observation, 152 | ) 153 | 154 | next_tensordict = TensorDict( 155 | { 156 | self.observation_key: torch.tensor( 157 | next_observation, dtype=torch.float32 158 | ), 159 | "reward": torch.tensor([reward]).float(), 160 | "done": torch.tensor([done]).bool(), 161 | "distance": torch.tensor([self.distance]).float(), 162 | }, 163 | batch_size=[1], 164 | ) 165 | 166 | # increment episode step counter 167 | self.episode_step_iter += 1 168 | if self.episode_step_iter >= self.max_episode_steps: 169 | next_tensordict.set("done", torch.tensor([True])) 170 | return next_tensordict 171 | -------------------------------------------------------------------------------- /bricksrl/environments/runaway_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Direction, Port 5 | from pybricks.pupdevices import Motor, UltrasonicSensor 6 | from pybricks.robotics import DriveBase 7 | from pybricks.tools import wait 8 | from uselect import poll 9 | from usys import stdin, stdout 10 | 11 | kbd_intr(-1) 12 | 13 | 14 | def normalize_angle(angle): 15 | # Normalize angle to be within 0 and 360 16 | while angle <= 0: 17 | angle += 360 18 | while angle > 360: 19 | angle -= 360 20 | return angle 21 | 22 | 23 | def transform_range(value, old_min, old_max, new_min, new_max): 24 | """ 25 | Transform a value from one range to another. 26 | 27 | Parameters: 28 | value (float): The value to transform. 29 | old_min (float): The minimum value of the old range. 30 | old_max (float): The maximum value of the old range. 31 | new_min (float): The minimum value of the new range. 32 | new_max (float): The maximum value of the new range. 33 | 34 | Returns: 35 | float: The transformed value. 36 | """ 37 | # Compute the scale factor between the old and new ranges 38 | scale = (new_max - new_min) / (old_max - old_min) 39 | # Apply the transformation 40 | return new_min + (value - old_min) * scale 41 | 42 | 43 | kbd_intr(-1) 44 | hub = InventorHub() 45 | 46 | # Initialize the drive base. 47 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE) 48 | right_motor = Motor(Port.A) 49 | drive_base = DriveBase(left_motor, right_motor, wheel_diameter=56, axle_track=130) 50 | # Initialize the distance sensor. 51 | sensor = UltrasonicSensor(Port.C) 52 | 53 | keyboard = poll() 54 | keyboard.register(stdin) 55 | 56 | while True: 57 | 58 | # Optional: Check available input. 
59 | while not keyboard.poll(0): 60 | wait(1) 61 | 62 | # Read action values for the motors 63 | action_value = ustruct.unpack("!f", stdin.buffer.read(4))[0] 64 | action = transform_range(action_value, -1, 1, -100, 100) 65 | 66 | drive_base.straight(action, wait=True) 67 | 68 | # Read sensors to get current state of the robot 69 | (left, right) = (left_motor.angle(), right_motor.angle()) 70 | (pitch, roll) = hub.imu.tilt() 71 | dist = sensor.distance() 72 | 73 | # Send current state back to environment 74 | out_msg = ustruct.pack( 75 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, dist 76 | ) 77 | stdout.buffer.write(out_msg) 78 | -------------------------------------------------------------------------------- /bricksrl/environments/spinning_v0/SpinningEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | from bricksrl.environments.base.base_env import BaseEnv 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | 10 | 11 | class SpinningEnv_v0(BaseEnv): 12 | """ 13 | SpinningEnv_v0 is a custom gym environment for a spinning robot. 14 | The robot has to learn to spin in a circle around its own axis given a random goal direction (left or right, 0 or 1). 15 | 16 | Args: 17 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 50. 18 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2. 19 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False. 20 | 21 | """ 22 | 23 | action_dim = 2 # to control the wheel motors independently 24 | state_dim = 5 # 5 sensors (left, right, pitch, roll, rotation_velocity) + 1 direction (left or right) 25 | 26 | observation_ranges = { 27 | "left_motor_angle": [0, 360], 28 | "right_motor_angle": [0, 360], 29 | "pitch_angle": [-90, 90], 30 | "roll_angle": [-90, 90], 31 | "rotation_velocity": [-100, 100], 32 | "direction": [0, 1], 33 | } 34 | 35 | observation_key = "observation" 36 | 37 | def __init__( 38 | self, 39 | max_episode_steps: int = 50, 40 | sleep_time: float = 0.2, 41 | verbose: bool = False, 42 | pretrain: bool = False, 43 | ): 44 | self.sleep_time = sleep_time 45 | self._batch_size = torch.Size([1]) 46 | self.max_episode_steps = max_episode_steps 47 | 48 | # Define action spec 49 | self.action_spec = BoundedTensorSpec( 50 | low=-1, 51 | high=1, 52 | shape=(1, self.action_dim), 53 | ) 54 | 55 | # Define observation spec 56 | bounds = torch.tensor( 57 | [ 58 | self.observation_ranges["left_motor_angle"], 59 | self.observation_ranges["right_motor_angle"], 60 | self.observation_ranges["pitch_angle"], 61 | self.observation_ranges["roll_angle"], 62 | self.observation_ranges["rotation_velocity"], 63 | self.observation_ranges["direction"], 64 | ] 65 | ) 66 | low_bounds = bounds[:, 0].unsqueeze(0) 67 | high_bounds = bounds[:, 1].unsqueeze(0) 68 | 69 | observation_spec = BoundedTensorSpec( 70 | low=low_bounds, 71 | high=high_bounds, 72 | ) 73 | self.observation_spec = CompositeSpec( 74 | {self.observation_key: observation_spec}, shape=(1,) 75 | ) 76 | 77 | super().__init__( 78 | action_dim=self.action_dim, 79 | state_dim=self.state_dim, 80 | verbose=verbose, 81 | use_hub=1 - pretrain, 82 | ) 83 | 84 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 85 | """ 86 | Reset the 
environment and return the initial state. 87 | 88 | Returns: 89 | TensorDictBase: The initial state of the environment. 90 | """ 91 | # TODO solve this fake action sending before to receive first state 92 | self.episode_step_iter = 0 93 | if tensordict is not None: 94 | action = tensordict.get("action").cpu().numpy().squeeze() 95 | else: 96 | action = np.zeros(self.action_dim) 97 | self.send_to_hub(action) 98 | time.sleep(self.sleep_time) 99 | 100 | state = self.read_from_hub() 101 | self.direction = np.random.randint(0, 2) # (0,1) left or right 102 | full_original_state = np.concatenate( 103 | (state, np.array([[self.direction]])), axis=1, dtype=np.float32 104 | ) 105 | 106 | return TensorDict( 107 | { 108 | self.observation_key: torch.tensor(full_original_state), 109 | }, 110 | batch_size=[1], 111 | ) 112 | 113 | def reward(self, next_observation: np.array) -> Tuple[float, bool]: 114 | """Reward function of Spinning environment. 115 | If the self.direction is 0, the robot is spinning left, otherwise right. 116 | We want to maximise in those cases the angular velocity (last element of the state vector). 117 | If the robot is spinning in the wrong direction, we want to minimize the angular velocity. 118 | """ 119 | done = False 120 | velocity = next_observation[:, -2] 121 | 122 | if self.direction == 0: 123 | reward = velocity 124 | else: 125 | reward = -velocity 126 | 127 | return reward, done 128 | 129 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 130 | """ """ 131 | # Send action to hub to receive next state 132 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) 133 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 134 | # receive the next state 135 | next_observation = self.read_from_hub() 136 | full_original_next_observation = np.concatenate( 137 | (next_observation, np.array([[self.direction]])), axis=1, dtype=np.float32 138 | ) 139 | # calc reward and done 140 | reward, done = self.reward(full_original_next_observation) 141 | 142 | next_tensordict = TensorDict( 143 | { 144 | self.observation_key: torch.tensor(full_original_next_observation), 145 | "reward": torch.tensor([reward]).float(), 146 | "done": torch.tensor([done]).bool(), 147 | }, 148 | batch_size=[1], 149 | ) 150 | # increment episode step counter 151 | self.episode_step_iter += 1 152 | if self.episode_step_iter >= self.max_episode_steps: 153 | next_tensordict.set("done", torch.tensor([True]).bool()) 154 | 155 | return next_tensordict 156 | -------------------------------------------------------------------------------- /bricksrl/environments/spinning_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Axis, Direction, Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | hub = InventorHub() 12 | 13 | # Initialize and set the motors 14 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE) 15 | right_motor = Motor(Port.A) 16 | 17 | keyboard = poll() 18 | keyboard.register(stdin) 19 | 20 | 21 | def normalize_angle(angle): 22 | # Normalize angle to be within 0 and 360 23 | while angle <= 0: 24 | angle += 360 25 | while angle > 360: 26 | angle -= 360 27 | return angle 28 | 29 | 30 | def transform_range(value, old_min, old_max, new_min, new_max): 31 | """ 32 | Transform a 
value from one range to another. 33 | 34 | Parameters: 35 | value (float): The value to transform. 36 | old_min (float): The minimum value of the old range. 37 | old_max (float): The maximum value of the old range. 38 | new_min (float): The minimum value of the new range. 39 | new_max (float): The maximum value of the new range. 40 | 41 | Returns: 42 | float: The transformed value. 43 | """ 44 | # Compute the scale factor between the old and new ranges 45 | scale = (new_max - new_min) / (old_max - old_min) 46 | # Apply the transformation 47 | return new_min + (value - old_min) * scale 48 | 49 | 50 | while True: 51 | 52 | while not keyboard.poll(0): 53 | wait(1) 54 | 55 | # Read action values for both motors 56 | data = stdin.buffer.read(8) # Reading 8 bytes (two floats) 57 | left_action_value, right_action_value = ustruct.unpack("!ff", data) 58 | 59 | # Apply action to each motor 60 | left_motor.run_angle( 61 | speed=400, 62 | rotation_angle=transform_range(left_action_value, -1, 1, -100, 100), 63 | wait=False, 64 | ) 65 | right_motor.run_angle( 66 | speed=400, 67 | rotation_angle=transform_range(right_action_value, -1, 1, -100, 100), 68 | wait=False, 69 | ) 70 | 71 | wait(100) # Small delay 72 | 73 | # Read sensors to get current state of the robot 74 | (left, right) = (left_motor.angle(), right_motor.angle()) 75 | (pitch, roll) = hub.imu.tilt() 76 | z_angl_vel = hub.imu.angular_velocity(Axis.Z) 77 | 78 | # Send current state back to environment 79 | out_msg = ustruct.pack( 80 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, z_angl_vel 81 | ) 82 | stdout.buffer.write(out_msg) 83 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/WalkerEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | from bricksrl.environments.base.base_env import BaseEnv 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 11 | 12 | 13 | class WalkerEnv_v0(BaseEnv): 14 | """ 15 | A reinforcement learning environment for the robodog to learn to walk. 16 | 17 | Specific to the walker_v0 environment is, that the reward function is hard coded to learn a gait routine. 18 | In contrast to the walker_v1 environment, the reward function is not based on the acceleration of the robot. 19 | 20 | Args: 21 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 10. 22 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.0. 23 | verbose (bool): Whether to print additional information. Defaults to False. 
24 | 25 | """ 26 | 27 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value) 28 | # angles are in range [-179, 179] 29 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x) 30 | 31 | observation_ranges = { 32 | "lf_angle": [-179, 179], 33 | "rf_angle": [-179, 179], 34 | "lb_angle": [-179, 179], 35 | "rb_angle": [-179, 179], 36 | "pitch": [-50, 50], 37 | "roll": [-50, 50], 38 | "acc_x": [-3000, 3000], 39 | } 40 | 41 | observation_key = "observation" 42 | 43 | def __init__( 44 | self, 45 | max_episode_steps: int = 50, 46 | sleep_time: float = 0.0, 47 | verbose: bool = False, 48 | pretrain: bool = False, 49 | ): 50 | self.sleep_time = sleep_time 51 | self._batch_size = torch.Size([1]) 52 | self.max_episode_steps = max_episode_steps 53 | 54 | # Define action spec 55 | self.action_spec = BoundedTensorSpec( 56 | low=-1, 57 | high=1, 58 | shape=(1, self.action_dim), 59 | ) 60 | 61 | # Define observation spec 62 | bounds = torch.tensor( 63 | [ 64 | self.observation_ranges["lf_angle"], 65 | self.observation_ranges["rf_angle"], 66 | self.observation_ranges["lb_angle"], 67 | self.observation_ranges["rb_angle"], 68 | self.observation_ranges["pitch"], 69 | self.observation_ranges["roll"], 70 | self.observation_ranges["acc_x"], 71 | ] 72 | ) 73 | # Reshape bounds to (1, 7) 74 | low_bounds = bounds[:, 0].unsqueeze(0) 75 | high_bounds = bounds[:, 1].unsqueeze(0) 76 | 77 | observation_spec = BoundedTensorSpec( 78 | low=low_bounds, 79 | high=high_bounds, 80 | shape=(1, self.state_dim), 81 | ) 82 | 83 | self.observation_spec = CompositeSpec( 84 | {self.observation_key: observation_spec}, shape=(1,) 85 | ) 86 | super().__init__( 87 | action_dim=self.action_dim, 88 | state_dim=self.state_dim, 89 | verbose=verbose, 90 | use_hub=1 - pretrain, 91 | ) 92 | 93 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 94 | """ 95 | Reset the environment and return the initial state. 96 | 97 | Returns: 98 | TensorDictBase: The initial state of the environment. 99 | """ 100 | # TODO solve this fake action sending before to receive first state 101 | self.episode_step_iter = 0 102 | if tensordict is not None: 103 | action = tensordict.get("action").cpu().numpy().squeeze() 104 | else: 105 | action = np.zeros(self.action_dim) 106 | self.send_to_hub(action) 107 | time.sleep(self.sleep_time) 108 | observation = self.read_from_hub() 109 | 110 | return TensorDict( 111 | { 112 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 113 | }, 114 | batch_size=[1], 115 | ) 116 | 117 | def reward( 118 | self, 119 | action: np.ndarray, 120 | next_state: np.ndarray, 121 | ) -> Tuple[float, bool]: 122 | """Reward function of walker. 123 | 124 | Args: 125 | action (np.ndarray): The action taken. 126 | next_state (np.ndarray): The next state. 127 | 128 | Returns: 129 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 
130 | """ 131 | 132 | done = False 133 | # pitch and roll need to stay in range [-75, 75] outside done = True 134 | pitch, roll = next_state[:, -3], next_state[:, -2] 135 | if np.abs(pitch) > 100 or np.abs(roll) > 100: 136 | done = True 137 | reward = 0 138 | return reward, done 139 | 140 | ( 141 | lf_angle, 142 | rf_angle, 143 | lb_angle, 144 | rb_angle, 145 | pitch, 146 | roll, 147 | acc_x, 148 | ) = next_state.squeeze() 149 | 150 | # we want actions to be negative and high 151 | # action is in range [-1, 1] over 4 dims -> sum is in range [-4, 4] -> divide by 4 to get in range [-1, 1] 152 | action_reward = -np.sum(action) / 4 / 10 153 | # Take this off we dont want them to be similar otherwise we cant adapt for noise in the system 154 | # actions should ideally be similar something like [-0.75, -0.75, -0.75, -0.75] 155 | # action_std_reward = -np.std(action) 156 | 157 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized 158 | # divide by 180 to get in range [-1, 0] 159 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180 160 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180 161 | 162 | # we want lf_rb and rf_lb to be 180° apart 163 | # divide by 180 to get in range [-1, 0] 164 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180 165 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180 166 | 167 | if self.verbose: 168 | print("action_reward", action_reward) 169 | # print("action_std_reward", action_std_reward) 170 | print("lf_rb_diff_reward", lf_rb_diff_reward) 171 | print("rf_lb_diff_reward", rf_lb_diff_reward) 172 | print("lf_rf_180_reward", lf_rf_180_reward) 173 | 174 | reward = ( 175 | action_reward 176 | # + action_std_reward 177 | + lf_rb_diff_reward 178 | + rf_lb_diff_reward 179 | + lf_rf_180_reward 180 | + lb_rb_180_reward 181 | ) 182 | 183 | return reward.item(), done 184 | 185 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 186 | """ """ 187 | # Send action to hub to receive next state 188 | action = tensordict.get("action").cpu().numpy().squeeze() 189 | self.send_to_hub(action) 190 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 191 | # receive the next state 192 | next_observation = self.read_from_hub() 193 | 194 | # calc reward and done 195 | reward, done = self.reward( 196 | action=action, 197 | next_state=next_observation, 198 | ) 199 | next_tensordict = TensorDict( 200 | { 201 | self.observation_key: torch.tensor( 202 | next_observation, dtype=torch.float32 203 | ), 204 | "reward": torch.tensor([reward]).float(), 205 | "done": torch.tensor([done]).bool(), 206 | }, 207 | batch_size=[1], 208 | ) 209 | 210 | # increment episode step counter 211 | self.episode_step_iter += 1 212 | if self.episode_step_iter >= self.max_episode_steps: 213 | next_tensordict.set("done", torch.tensor([True])) 214 | return next_tensordict 215 | 216 | 217 | def angular_difference(angle1, angle2): 218 | # Calculate the difference in angles, wrapped between -180 and 180 219 | difference = (angle2 - angle1 + 180) % 360 - 180 220 | return abs(difference) # Return the absolute value of the difference 221 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/WalkerEnvSim.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env 
import BaseSimEnv 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 10 | 11 | 12 | class WalkerEnvSim_v0(BaseSimEnv): 13 | """ """ 14 | 15 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value) 16 | # angles are in range [-179, 179] 17 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x) 18 | 19 | observation_ranges = { 20 | "lf_angle": [-179, 179], 21 | "rf_angle": [-179, 179], 22 | "lb_angle": [-179, 179], 23 | "rb_angle": [-179, 179], 24 | "pitch": [-75, 75], 25 | "roll": [-75, 75], 26 | "acc_x": [-3000, 3000], 27 | } 28 | 29 | observation_key = "observation" 30 | 31 | def __init__( 32 | self, 33 | max_episode_steps: int = 50, 34 | noise: float = 0.1, 35 | low_action_angle: int = -100, 36 | high_action_angle: int = 0, 37 | verbose: bool = False, 38 | ): 39 | self._batch_size = torch.Size([1]) 40 | self.max_episode_steps = max_episode_steps 41 | self.noise = noise 42 | self.low_action_angle = low_action_angle 43 | self.high_action_angle = high_action_angle 44 | self.current_leg_angles = None 45 | 46 | # Define action spec 47 | self.action_spec = BoundedTensorSpec( 48 | low=-1, 49 | high=1, 50 | shape=(1, self.action_dim), 51 | ) 52 | 53 | # Define observation spec 54 | bounds = torch.tensor( 55 | [ 56 | self.observation_ranges["lf_angle"], 57 | self.observation_ranges["rf_angle"], 58 | self.observation_ranges["lb_angle"], 59 | self.observation_ranges["rb_angle"], 60 | self.observation_ranges["pitch"], 61 | self.observation_ranges["roll"], 62 | self.observation_ranges["acc_x"], 63 | ] 64 | ) 65 | # Reshape bounds to (1, 7) 66 | low_bounds = bounds[:, 0].unsqueeze(0) 67 | high_bounds = bounds[:, 1].unsqueeze(0) 68 | observation_spec = BoundedTensorSpec( 69 | low=low_bounds, 70 | high=high_bounds, 71 | ) 72 | 73 | self.observation_spec = CompositeSpec( 74 | {self.observation_key: observation_spec}, shape=(1,) 75 | ) 76 | super().__init__( 77 | action_dim=self.action_dim, 78 | state_dim=self.state_dim, 79 | verbose=verbose, 80 | use_hub=False, 81 | ) 82 | 83 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 84 | """ 85 | Reset the environment and return the initial state. 86 | 87 | Returns: 88 | TensorDictBase: The initial state of the environment. 89 | """ 90 | # TODO solve this fake action sending before to receive first state 91 | self.episode_step_iter = 0 92 | 93 | observation = self.observation_spec[self.observation_key].rand() 94 | self.current_leg_angles = observation[0, :4] 95 | return TensorDict( 96 | { 97 | self.observation_key: observation, 98 | }, 99 | batch_size=[1], 100 | ) 101 | 102 | def reward( 103 | self, 104 | action: np.ndarray, 105 | next_state: np.ndarray, 106 | ) -> Tuple[float, bool]: 107 | """Reward function of walker. 108 | 109 | Args: 110 | action (np.ndarray): The action taken. 111 | next_state (np.ndarray): The next state. 112 | 113 | Returns: 114 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 
115 | """ 116 | 117 | done = False 118 | # pitch and roll need to stay in range [-75, 75] outside done = True 119 | pitch, roll = next_state[:, -3], next_state[:, -2] 120 | if np.abs(pitch) > 100 or np.abs(roll) > 100: 121 | done = True 122 | reward = 0 123 | return reward, done 124 | 125 | ( 126 | lf_angle, 127 | rf_angle, 128 | lb_angle, 129 | rb_angle, 130 | pitch, 131 | roll, 132 | acc_x, 133 | ) = next_state.squeeze() 134 | 135 | # we want actions to be negative and high 136 | # action is in range [-1, 1] over 4 dims -> sum is in range [-4, 4] -> divide by 4 to get in range [-1, 1] 137 | action_reward = -np.sum(action) / 4 / 10 138 | # Take this off we dont want them to be similar otherwise we cant adapt for noise in the system 139 | # actions should ideally be similar something like [-0.75, -0.75, -0.75, -0.75] 140 | # action_std_reward = -np.std(action) 141 | 142 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized 143 | # divide by 180 to get in range [-1, 0] 144 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180 145 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180 146 | 147 | # we want lf_rb and rf_lb to be 180° apart 148 | # divide by 180 to get in range [-1, 0] 149 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180 150 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180 151 | 152 | if self.verbose: 153 | print("action_reward", action_reward) 154 | # print("action_std_reward", action_std_reward) 155 | print("lf_rb_diff_reward", lf_rb_diff_reward) 156 | print("rf_lb_diff_reward", rf_lb_diff_reward) 157 | print("lf_rf_180_reward", lf_rf_180_reward) 158 | 159 | reward = ( 160 | action_reward 161 | # + action_std_reward 162 | + lf_rb_diff_reward 163 | + rf_lb_diff_reward 164 | + lf_rf_180_reward 165 | + lb_rb_180_reward 166 | ) 167 | 168 | return reward.item(), done 169 | 170 | @staticmethod 171 | def transform_range(value, old_min, old_max, new_min, new_max): 172 | """ 173 | Transform a value from one range to another. 174 | 175 | Parameters: 176 | value (float): The value to transform. 177 | old_min (float): The minimum value of the old range. 178 | old_max (float): The maximum value of the old range. 179 | new_min (float): The minimum value of the new range. 180 | new_max (float): The maximum value of the new range. 181 | 182 | Returns: 183 | float: The transformed value. 
184 | """ 185 | # Compute the scale factor between the old and new ranges 186 | scale = (new_max - new_min) / (old_max - old_min) 187 | # Apply the transformation 188 | return new_min + (value - old_min) * scale 189 | 190 | @staticmethod 191 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 192 | # Normalize angle to be within -179 to 179 degrees 193 | while angle <= low_angle: 194 | angle += original_one_round 195 | while angle > high_angle: 196 | angle -= original_one_round 197 | return angle 198 | 199 | def apply_action(self, action: np.ndarray) -> np.ndarray: 200 | 201 | noise = np.random.normal(0, self.noise, size=4) 202 | action += noise 203 | 204 | lf_value, lb_value, rf_value, rb_value = action 205 | # transform action range for motors 206 | lf_action = self.transform_range( 207 | lf_value, -1, 1, self.low_action_angle, self.high_action_angle 208 | ) 209 | lb_action = self.transform_range( 210 | lb_value, -1, 1, self.low_action_angle, self.high_action_angle 211 | ) 212 | rf_actopm = self.transform_range( 213 | rf_value, -1, 1, self.low_action_angle, self.high_action_angle 214 | ) 215 | rb_action = self.transform_range( 216 | rb_value, -1, 1, self.low_action_angle, self.high_action_angle 217 | ) 218 | 219 | ( 220 | lf_angle, 221 | rf_angle, 222 | lb_angle, 223 | rb_angle, 224 | ) = self.current_leg_angles.squeeze() 225 | 226 | new_lf_angle = self.normalize_angle(lf_angle + lf_action) 227 | new_lb_angle = self.normalize_angle(lb_angle + lb_action) 228 | new_rf_angle = self.normalize_angle(rf_angle + rf_actopm) 229 | new_rb_angle = self.normalize_angle(rb_angle + rb_action) 230 | 231 | self.current_leg_angles = np.array( 232 | [ 233 | [ 234 | new_lf_angle, 235 | new_rf_angle, 236 | new_lb_angle, 237 | new_rb_angle, 238 | ] 239 | ], 240 | dtype=np.float32, 241 | ) 242 | return self.current_leg_angles 243 | 244 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 245 | """ """ 246 | # Send action to hub to receive next state 247 | action = tensordict.get("action").cpu().numpy().squeeze() 248 | 249 | # receive the next state 250 | next_observation = self.apply_action(action) 251 | 252 | # add zeros for pitch, roll and acc_x 253 | next_observation = np.concatenate( 254 | (next_observation, np.zeros((1, 3))), axis=1, dtype=np.float32 255 | ) 256 | 257 | # calc reward and done 258 | reward, done = self.reward( 259 | action=action, 260 | next_state=next_observation, 261 | ) 262 | next_tensordict = TensorDict( 263 | { 264 | self.observation_key: next_observation, 265 | "reward": torch.tensor([reward]).float(), 266 | "done": torch.tensor([done]).bool(), 267 | }, 268 | batch_size=[1], 269 | ) 270 | 271 | # increment episode step counter 272 | self.episode_step_iter += 1 273 | if self.episode_step_iter >= self.max_episode_steps: 274 | next_tensordict.set("done", torch.tensor([True])) 275 | return next_tensordict 276 | 277 | 278 | def angular_difference(angle1, angle2): 279 | # Calculate the difference in angles, wrapped between -180 and 180 280 | difference = (angle2 - angle1 + 180) % 360 - 180 281 | return abs(difference) # Return the absolute value of the difference 282 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/client.py: -------------------------------------------------------------------------------- 1 | # NOTE: Run this program with the latest 2 | # firmware provided via https://beta.pybricks.com/ 3 | 4 | import umath 5 | import ustruct 6 | from micropython import 
kbd_intr 7 | from pybricks.hubs import InventorHub 8 | from pybricks.parameters import Axis, Direction, Port 9 | from pybricks.pupdevices import Motor, UltrasonicSensor 10 | from pybricks.tools import wait 11 | from uselect import poll 12 | 13 | # Standard MicroPython modules 14 | from usys import stdin, stdout 15 | 16 | kbd_intr(-1) 17 | hub = InventorHub() 18 | 19 | # Initialize and set the motors 20 | lf_motor = Motor(Port.D, Direction.COUNTERCLOCKWISE) 21 | lb_motor = Motor(Port.B, Direction.COUNTERCLOCKWISE) 22 | rf_motor = Motor(Port.C) 23 | rb_motor = Motor(Port.A) 24 | 25 | # Init additional sensor 26 | eyes = UltrasonicSensor(Port.E) 27 | 28 | # Setup poll 29 | keyboard = poll() 30 | keyboard.register(stdin) 31 | 32 | 33 | def normalize_angle(angle): 34 | # Normalize angle to be within -179 to 179 degrees 35 | while angle <= -180: 36 | angle += 360 37 | while angle > 179: 38 | angle -= 360 39 | return angle 40 | 41 | 42 | def transform_range(value, old_min, old_max, new_min, new_max): 43 | """ 44 | Transform a value from one range to another. 45 | 46 | Parameters: 47 | value (float): The value to transform. 48 | old_min (float): The minimum value of the old range. 49 | old_max (float): The maximum value of the old range. 50 | new_min (float): The minimum value of the new range. 51 | new_max (float): The maximum value of the new range. 52 | 53 | Returns: 54 | float: The transformed value. 55 | """ 56 | # Compute the scale factor between the old and new ranges 57 | scale = (new_max - new_min) / (old_max - old_min) 58 | # Apply the transformation 59 | return new_min + (value - old_min) * scale 60 | 61 | 62 | # Setting default values and ranges 63 | low_angle = -100 # 270 64 | high_angle = 0 65 | speed = 600 66 | 67 | while True: 68 | 69 | while not keyboard.poll(0): 70 | wait(1) 71 | 72 | # Read action values for the motors 73 | data = stdin.buffer.read(16) # Reading 16 bytes (4 floats) 74 | lf_value, lb_value, rf_value, rb_value = ustruct.unpack("!ffff", data) 75 | 76 | # Apply actions. Motor angle range is [-180, 179] action outputs are [-1, 1] we transform the actions first. 
77 | lb_motor.run_angle( 78 | speed=speed, 79 | rotation_angle=transform_range(lb_value, -1, 1, low_angle, high_angle), 80 | wait=False, 81 | ) 82 | lf_motor.run_angle( 83 | speed=speed, 84 | rotation_angle=transform_range(lf_value, -1, 1, low_angle, high_angle), 85 | wait=False, 86 | ) 87 | rb_motor.run_angle( 88 | speed=speed, 89 | rotation_angle=transform_range(rb_value, -1, 1, low_angle, high_angle), 90 | wait=False, 91 | ) 92 | rf_motor.run_angle( 93 | speed=speed, 94 | rotation_angle=transform_range(rf_value, -1, 1, low_angle, high_angle), 95 | wait=False, 96 | ) 97 | 98 | # Small delay to let motors arrive target angle 99 | wait(250) # 250 100 | 101 | # Read sensors to get current state of the robot 102 | a_x = hub.imu.acceleration(Axis.X) 103 | (lf_angle, rf_angle) = (lf_motor.angle(), rf_motor.angle()) 104 | (lb_angle, rb_angle) = (lb_motor.angle(), rb_motor.angle()) 105 | (pitch, roll) = hub.imu.tilt() 106 | dist = eyes.distance() 107 | 108 | if umath.fabs(pitch) > 90 or umath.fabs(roll) > 120 or dist <= 40: 109 | hub.display.text(text="Help", on=500, off=50) 110 | 111 | # Send current state back to environment 112 | out_msg = ustruct.pack( 113 | "!fffffff", 114 | normalize_angle(lf_angle), 115 | normalize_angle(rf_angle), 116 | normalize_angle(lb_angle), 117 | normalize_angle(rb_angle), 118 | pitch, 119 | roll, 120 | a_x, 121 | ) 122 | stdout.buffer.write(out_msg) 123 | -------------------------------------------------------------------------------- /conf/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Details for BricksRL Experiments 2 | 3 | ## Overview 4 | This directory contains all the necessary configuration files to tailor your experiments using BricksRL. Configurations are managed using [Hydra](https://hydra.cc/), a powerful tool for configuring complex applications that allows for easy modification of parameters directly from the command line. 5 | 6 | ## Configuration Files 7 | - **config.yaml**: The base configuration for all experiments including what agent and environment to run. 8 | - **env/**: Contains environment-specific configurations. 9 | - **runaway-v0.yaml**: Settings for the *RunAway-v0* environment for the 2wheeler robot. 10 | - **spinning-v0.yaml**: Settings for the *Spinning-v0* environment for the 2wheeler robot. 11 | - **walker-v0.yaml**: Settings for the *Walker-v0* environment for the walker robot. 12 | - **walker_sim-v0.yaml**: Settings for the *WalkerSim-v0* environment for the walker robot. 13 | - **roboarm-v0.yaml**: Settings for the *RoboArm-v0* environment for the roboarm robot. 14 | - **roboarm_sim-v0.yaml**: Settings for the *RoboArmSim-v0* environment for the roboarm robot. 15 | - **roboarm_mixed-v0.yaml**: Settings for the *RoboArmMixed-v0* environment for the roboarm robot. 16 | - **agent/**: Contains agent-specific configurations. 17 | - **sac.yaml**: Configuration for the SAC agent. 18 | - **td3.yaml**: Configuration for the TD3 agent. 19 | - **droq.yaml**: Configuration for the DroQ agent. 20 | 21 | ## Using Hydra for Configuration Overrides 22 | Hydra allows you to override any configuration parameter directly from the terminal when you run your experiments. This makes it easy to test different configurations without altering your configuration files. 
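Hydra can also sweep over several parameter values in one call via its multirun mode; a minimal sketch, assuming the same walker training entry point and override keys shown in the examples below:

```bash
# Launch one run per agent/batch-size combination (2 x 2 = 4 runs) using Hydra's --multirun flag.
python experiments/walker/train.py --multirun agent=sac,td3 agent.batch_size=32,256
```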
23 | 24 | ### Example Usage 25 | To run an experiment with the walker environment using the SAC agent and specify the number of episodes directly from the command line, you can use the following command: 26 | 27 | ```bash 28 | python experiments/walker/train.py episodes=200 agent=sac 29 | ``` 30 | This command temporarily overrides the episodes and agent parameters for this specific run without needing to change the configuration files. 31 | 32 | You can further override agent or environment specific parameter like: 33 | 34 | ```bash 35 | python experiments/walker/train.py agent=sac agent.batch_size=32 36 | ``` 37 | 38 | or 39 | 40 | ```bash 41 | python experiments/walker/train.py env.max_episode_steps=200 env.frame_stack=4 42 | ``` -------------------------------------------------------------------------------- /conf/agent/bc.yaml: -------------------------------------------------------------------------------- 1 | name: bc 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 0 6 | 7 | 8 | policy_type: deterministic # stochastic or deterministic 9 | num_cells: 256 10 | dropout: 0.01 11 | normalization: LayerNorm 12 | -------------------------------------------------------------------------------- /conf/agent/cql.yaml: -------------------------------------------------------------------------------- 1 | name: cql 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | bc_steps: 1000 8 | 9 | # CQL specific 10 | num_cells: 256 11 | gamma: 0.99 12 | soft_update_eps: 0.995 13 | loss_function: l2 14 | temperature: 1.0 15 | min_q_weight: 1.0 16 | max_q_backup: False 17 | deterministic_backup: False 18 | num_random: 10 19 | with_lagrange: True 20 | lagrange_thresh: 5.0 # tau 21 | 22 | normalization: None 23 | dropout: 0.0 24 | 25 | prb: 0 26 | buffer_size: 1000000 27 | pretrain: False 28 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/droq.yaml: -------------------------------------------------------------------------------- 1 | name: sac 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 20 5 | prefill_episodes: 10 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | alpha_init: 1 11 | fixed_alpha: False 12 | loss_function: l2 13 | 14 | normalization: LayerNorm 15 | dropout: 0.01 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | reset_params: False 20 | -------------------------------------------------------------------------------- /conf/agent/iql.yaml: -------------------------------------------------------------------------------- 1 | name: iql 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 0 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | loss_function: l2 11 | temperature: 1.0 12 | expectile: 0.5 13 | 14 | normalization: None 15 | dropout: 0.0 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | pretrain: False 20 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/random.yaml: -------------------------------------------------------------------------------- 1 | name: random 2 | # not used for random agent 3 | batch_size: 256 4 | num_updates: 2500 5 | prefill_episodes: 0 -------------------------------------------------------------------------------- /conf/agent/sac.yaml: -------------------------------------------------------------------------------- 1 | name: sac 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | 
num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | alpha_init: 1 11 | fixed_alpha: False 12 | loss_function: l2 13 | 14 | normalization: None 15 | dropout: 0.0 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/td3.yaml: -------------------------------------------------------------------------------- 1 | name: td3 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | loss_function: smooth_l1 11 | exploration_noise: 0.1 # 0.01 12 | 13 | normalization: None 14 | dropout: 0.0 15 | 16 | prb: 0 17 | buffer_size: 1000000 18 | reset_params: False 19 | use_bc: False 20 | alpha: 1.0 -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | # Base Config to run all examples 2 | 3 | run_name: "" 4 | verbose: 0 5 | 6 | device: "cuda" 7 | episodes: 250 8 | 9 | defaults: 10 | - _self_ 11 | # random, sac, td3, droq 12 | - agent: sac 13 | - env: walker_sim-v0 -------------------------------------------------------------------------------- /conf/env/roboarm-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | reward_signal: dense 10 | -------------------------------------------------------------------------------- /conf/env/roboarm_mixed-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm_mixed-v0" 2 | max_episode_steps: 30 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | reward_signal: dense 10 | camera_id: 2 11 | goal_radius: 25 12 | -------------------------------------------------------------------------------- /conf/env/roboarm_sim-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm_sim-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | noise: 0.05 9 | reward_signal: dense 10 | -------------------------------------------------------------------------------- /conf/env/runaway-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "runaway-v0" 2 | max_episode_steps: 20 3 | # env specific params 4 | min_distance: 40. 
5 | verbose: 0 6 | # env wrapper 7 | frame_stack: 1 8 | action_filter: 1 9 | 10 | -------------------------------------------------------------------------------- /conf/env/spinning-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "spinning-v0" 2 | max_episode_steps: 50 3 | verbose: 0 4 | # env wrapper 5 | frame_stack: 1 6 | action_filter: 1 7 | sleep_time: 0.0 8 | 9 | -------------------------------------------------------------------------------- /conf/env/walker-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "walker-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | 10 | -------------------------------------------------------------------------------- /conf/env/walker_sim-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "walker_sim-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | noise: 0.1 5 | low_action_angle: -100 6 | high_action_angle: 0 7 | verbose: 0 8 | # env wrapper 9 | frame_stack: 1 10 | action_filter: 1 11 | 12 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## TorchRL SOTA Example 4 | 5 | In the [torchrl_sac](./torchrl_sac/) folder you will find a training script to train LEGO robots with Bricksrl similar to the state-of-the-art implementations in [TorchRL](https://github.com/pytorch/rl/tree/main/sota-implementations). This allows you to basically plug-and-play with any TorchRL sota-implementation or do custom adaptations. 6 | 7 | [Example results](https://wandb.ai/sebastian-dittert/bricksrl_torchrl_sac_example?nw=nwusersebastiandittert) 8 | 9 | ### TorchRL sota-example diff 10 | 11 | Only change made to the TorchRL sota-implementations is the make_env function: 12 | 13 | ``` 14 | # Environment import from BricksRL 15 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0 16 | 17 | # ==================================================================== 18 | # Make BricksRL Environment 19 | # ----------------- 20 | 21 | 22 | def env_maker(cfg, device="cpu", from_pixels=False): 23 | # We use the WalkerEnvSim_v0 environment from BricksRL as an example 24 | # as it is easy to test as it does not require a robot at hand or to connect to the hub. 25 | # Users can replace this with any other environment from BricksRL or custom environments. 
26 | env = WalkerEnvSim_v0(max_episode_steps=cfg.env.max_episode_steps) 27 | observation_keys = [key for key in env.observation_spec.keys()] 28 | 29 | transforms = [] 30 | if cfg.env.frame_stack > 1: 31 | transforms.append( 32 | CatFrames( 33 | N=cfg.env.frame_stack, 34 | in_keys=observation_keys, 35 | out_key=observation_keys, 36 | ) 37 | ) 38 | normalize_keys = [key for key in observation_keys if key != "pixels"] 39 | obs_ranges = np.array(list(env.observation_ranges.values())) 40 | obs_mean = obs_ranges.mean(axis=-1) 41 | obs_std = obs_ranges.std(axis=-1) 42 | transforms.append( 43 | ObservationNorm( 44 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True 45 | ) 46 | ) 47 | transforms.append(DeviceCastTransform(device)) 48 | return TransformedEnv(env, Compose(*transforms)) 49 | 50 | ``` 51 | 52 | 53 | 54 | 55 | ## Custom Environment 56 | A template to create your own custom environments can be found [here](./custom_env.py). With an environment created like this, you can update the [TorchRL example](./torchrl_sac) to train your own TorchRL agent on your custom environment. 57 | 58 | > **Attention!** For each custom environment, you need a custom client script that must be loaded on the HUB! 59 | 60 | ## High Level Examples 61 | In the [example notebook](./example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** environment. 62 | The examples are based on the experiments for our paper. Standalone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](./torchrl_sac). -------------------------------------------------------------------------------- /examples/custom_env.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env import BaseEnv 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 10 | 11 | 12 | class CustomEnv(BaseEnv): 13 | """ 14 | Environment template for creating your own custom environment for BricksRL. 15 | 16 | Args: 17 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 50. 18 | verbose (bool): Whether to print additional information. Defaults to False. 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | max_episode_steps: int = 50, 25 | verbose: bool = False, 26 | ): 27 | self._batch_size = torch.Size([1]) 28 | self.max_episode_steps = max_episode_steps 29 | 30 | # Define action spec 31 | self.action_spec = BoundedTensorSpec( 32 | low=-1, 33 | high=1, 34 | shape=(1, self.action_dim), 35 | ) 36 | 37 | # Define observation spec 38 | 39 | observation_spec = BoundedTensorSpec( 40 | low=-1, 41 | high=1, 42 | shape=(1, self.state_dim), 43 | ) 44 | 45 | self.observation_spec = CompositeSpec( 46 | {self.observation_key: observation_spec}, shape=(1,) 47 | ) 48 | super().__init__( 49 | action_dim=self.action_dim, 50 | state_dim=self.state_dim, 51 | verbose=verbose, 52 | ) 53 | 54 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 55 | """ 56 | Reset the environment and return the initial state. 57 | 58 | Returns: 59 | TensorDictBase: The initial state of the environment. 
60 | """ 61 | # TODO solve this fake action sending before to receive first state 62 | self.episode_step_iter = 0 63 | if tensordict is not None: 64 | action = tensordict.get("action").cpu().numpy().squeeze() 65 | else: 66 | action = np.zeros(self.action_dim) 67 | self.send_to_hub(action) 68 | # Get current observation 69 | observation = self.read_from_hub() 70 | 71 | return TensorDict( 72 | { 73 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 74 | }, 75 | batch_size=[1], 76 | ) 77 | 78 | def reward( 79 | self, 80 | action: np.ndarray, 81 | next_state: np.ndarray, 82 | ) -> Tuple[float, bool]: 83 | """Your custom reward function""" 84 | return 1.0, False 85 | 86 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 87 | """Custom step function""" 88 | # Send action to hub to receive next state 89 | action = tensordict.get("action").cpu().numpy().squeeze() 90 | self.send_to_hub(action) 91 | # receive the next state 92 | next_observation = self.read_from_hub() 93 | 94 | # calc reward and done 95 | reward, done = self.reward( 96 | action=action, 97 | next_state=next_observation, 98 | ) 99 | next_tensordict = TensorDict( 100 | { 101 | self.observation_key: torch.tensor( 102 | next_observation, dtype=torch.float32 103 | ), 104 | "reward": torch.tensor([reward]).float(), 105 | "done": torch.tensor([done]).bool(), 106 | }, 107 | batch_size=[1], 108 | ) 109 | 110 | # increment episode step counter 111 | self.episode_step_iter += 1 112 | if self.episode_step_iter >= self.max_episode_steps: 113 | next_tensordict.set("done", torch.tensor([True])) 114 | return next_tensordict 115 | -------------------------------------------------------------------------------- /examples/torchrl_sac/config.yaml: -------------------------------------------------------------------------------- 1 | # environment and task 2 | env: 3 | max_episode_steps: 100 4 | seed: 41 5 | frame_stack: 1 6 | 7 | # collector 8 | collector: 9 | total_frames: 10_000 10 | init_random_frames: 1000 11 | frames_per_batch: 1000 12 | init_env_steps: 1000 13 | device: cpu 14 | env_per_collector: 1 15 | reset_at_each_iter: False 16 | 17 | # replay buffer 18 | replay_buffer: 19 | size: 1000000 20 | prb: 0 # use prioritized experience replay 21 | scratch_dir: null 22 | 23 | # optim 24 | optim: 25 | utd_ratio: 1.0 26 | gamma: 0.99 27 | loss_function: l2 28 | lr: 3.0e-4 29 | weight_decay: 0.0 30 | batch_size: 256 31 | target_update_polyak: 0.995 32 | alpha_init: 1.0 33 | adam_eps: 1.0e-8 34 | 35 | # network 36 | network: 37 | hidden_sizes: [256, 256] 38 | activation: relu 39 | default_policy_scale: 1.0 40 | scale_lb: 0.1 41 | device: 42 | 43 | # logging 44 | logger: 45 | backend: wandb 46 | project_name: bricksrl_torchrl_sac_example 47 | group_name: null 48 | exp_name: Walkersim-v0_SAC 49 | mode: online 50 | eval_iter: 1000 51 | video: False -------------------------------------------------------------------------------- /examples/torchrl_sac/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | """SAC Example. 6 | 7 | This is a simple self-contained example of a SAC training script. 8 | 9 | It supports state environments like MuJoCo. 10 | 11 | The helper functions are coded in the utils.py associated with this script. 
12 | """ 13 | import time 14 | 15 | import hydra 16 | 17 | import numpy as np 18 | import torch 19 | import torch.cuda 20 | import tqdm 21 | from tensordict import TensorDict 22 | from torchrl._utils import logger as torchrl_logger 23 | from torchrl.envs.utils import ExplorationType, set_exploration_type 24 | 25 | from torchrl.record.loggers import generate_exp_name, get_logger 26 | from utils import ( 27 | dump_video, 28 | log_metrics, 29 | make_collector, 30 | make_environment, 31 | make_loss_module, 32 | make_replay_buffer, 33 | make_sac_agent, 34 | make_sac_optimizer, 35 | ) 36 | 37 | 38 | @hydra.main(version_base="1.1", config_path="", config_name="config") 39 | def main(cfg: "DictConfig"): # noqa: F821 40 | device = cfg.network.device 41 | if device in ("", None): 42 | if torch.cuda.is_available(): 43 | device = torch.device("cuda:0") 44 | else: 45 | device = torch.device("cpu") 46 | device = torch.device(device) 47 | 48 | # Create logger 49 | exp_name = generate_exp_name("SAC", cfg.logger.exp_name) 50 | logger = None 51 | if cfg.logger.backend: 52 | logger = get_logger( 53 | logger_type=cfg.logger.backend, 54 | logger_name="sac_logging", 55 | experiment_name=exp_name, 56 | wandb_kwargs={ 57 | "mode": cfg.logger.mode, 58 | "config": dict(cfg), 59 | "project": cfg.logger.project_name, 60 | "group": cfg.logger.group_name, 61 | }, 62 | ) 63 | 64 | torch.manual_seed(cfg.env.seed) 65 | np.random.seed(cfg.env.seed) 66 | 67 | # Create environments 68 | train_env, eval_env = make_environment(cfg, logger=logger) 69 | 70 | # Create agent 71 | model, exploration_policy = make_sac_agent(cfg, train_env, eval_env, device) 72 | 73 | # Create SAC loss 74 | loss_module, target_net_updater = make_loss_module(cfg, model) 75 | 76 | # Create off-policy collector 77 | collector = make_collector(cfg, train_env, exploration_policy) 78 | 79 | # Create replay buffer 80 | replay_buffer = make_replay_buffer( 81 | batch_size=cfg.optim.batch_size, 82 | prb=cfg.replay_buffer.prb, 83 | buffer_size=cfg.replay_buffer.size, 84 | scratch_dir=cfg.replay_buffer.scratch_dir, 85 | device="cpu", 86 | ) 87 | 88 | # Create optimizers 89 | ( 90 | optimizer_actor, 91 | optimizer_critic, 92 | optimizer_alpha, 93 | ) = make_sac_optimizer(cfg, loss_module) 94 | 95 | # Main loop 96 | start_time = time.time() 97 | collected_frames = 0 98 | pbar = tqdm.tqdm(total=cfg.collector.total_frames) 99 | 100 | init_random_frames = cfg.collector.init_random_frames 101 | num_updates = int( 102 | cfg.collector.env_per_collector 103 | * cfg.collector.frames_per_batch 104 | * cfg.optim.utd_ratio 105 | ) 106 | prb = cfg.replay_buffer.prb 107 | eval_iter = cfg.logger.eval_iter 108 | frames_per_batch = cfg.collector.frames_per_batch 109 | eval_rollout_steps = cfg.env.max_episode_steps 110 | 111 | sampling_start = time.time() 112 | for i, tensordict in enumerate(collector): 113 | sampling_time = time.time() - sampling_start 114 | 115 | # Update weights of the inference policy 116 | collector.update_policy_weights_() 117 | 118 | pbar.update(tensordict.numel()) 119 | 120 | tensordict = tensordict.reshape(-1) 121 | current_frames = tensordict.numel() 122 | # Add to replay buffer 123 | replay_buffer.extend(tensordict.cpu()) 124 | collected_frames += current_frames 125 | 126 | # Optimization steps 127 | training_start = time.time() 128 | if collected_frames >= init_random_frames: 129 | losses = TensorDict({}, batch_size=[num_updates]) 130 | for i in range(num_updates): 131 | # Sample from replay buffer 132 | sampled_tensordict = replay_buffer.sample() 133 
| if sampled_tensordict.device != device: 134 | sampled_tensordict = sampled_tensordict.to( 135 | device, non_blocking=True 136 | ) 137 | else: 138 | sampled_tensordict = sampled_tensordict.clone() 139 | 140 | # Compute loss 141 | loss_td = loss_module(sampled_tensordict) 142 | 143 | actor_loss = loss_td["loss_actor"] 144 | q_loss = loss_td["loss_qvalue"] 145 | alpha_loss = loss_td["loss_alpha"] 146 | 147 | # Update actor 148 | optimizer_actor.zero_grad() 149 | actor_loss.backward() 150 | optimizer_actor.step() 151 | 152 | # Update critic 153 | optimizer_critic.zero_grad() 154 | q_loss.backward() 155 | optimizer_critic.step() 156 | 157 | # Update alpha 158 | optimizer_alpha.zero_grad() 159 | alpha_loss.backward() 160 | optimizer_alpha.step() 161 | 162 | losses[i] = loss_td.select( 163 | "loss_actor", "loss_qvalue", "loss_alpha" 164 | ).detach() 165 | 166 | # Update qnet_target params 167 | target_net_updater.step() 168 | 169 | # Update priority 170 | if prb: 171 | replay_buffer.update_priority(sampled_tensordict) 172 | 173 | training_time = time.time() - training_start 174 | episode_end = ( 175 | tensordict["next", "done"] 176 | if tensordict["next", "done"].any() 177 | else tensordict["next", "truncated"] 178 | ) 179 | episode_rewards = tensordict["next", "episode_reward"][episode_end] 180 | 181 | # Logging 182 | metrics_to_log = {} 183 | if len(episode_rewards) > 0: 184 | episode_length = tensordict["next", "step_count"][episode_end] 185 | metrics_to_log["train/reward"] = episode_rewards.mean().item() 186 | metrics_to_log["train/episode_length"] = episode_length.sum().item() / len( 187 | episode_length 188 | ) 189 | if collected_frames >= init_random_frames: 190 | metrics_to_log["train/q_loss"] = losses.get("loss_qvalue").mean().item() 191 | metrics_to_log["train/actor_loss"] = losses.get("loss_actor").mean().item() 192 | metrics_to_log["train/alpha_loss"] = losses.get("loss_alpha").mean().item() 193 | metrics_to_log["train/alpha"] = loss_td["alpha"].item() 194 | metrics_to_log["train/entropy"] = loss_td["entropy"].item() 195 | metrics_to_log["train/sampling_time"] = sampling_time 196 | metrics_to_log["train/training_time"] = training_time 197 | 198 | # Evaluation 199 | if abs(collected_frames % eval_iter) < frames_per_batch: 200 | with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad(): 201 | eval_start = time.time() 202 | eval_rollout = eval_env.rollout( 203 | eval_rollout_steps, 204 | model[0], 205 | auto_cast_to_device=True, 206 | break_when_any_done=True, 207 | ) 208 | eval_env.apply(dump_video) 209 | eval_time = time.time() - eval_start 210 | eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item() 211 | metrics_to_log["eval/reward"] = eval_reward 212 | metrics_to_log["eval/time"] = eval_time 213 | if logger is not None: 214 | log_metrics(logger, metrics_to_log, collected_frames) 215 | sampling_start = time.time() 216 | 217 | collector.shutdown() 218 | if not eval_env.is_closed: 219 | eval_env.close() 220 | if not train_env.is_closed: 221 | train_env.close() 222 | end_time = time.time() 223 | execution_time = end_time - start_time 224 | torchrl_logger.info(f"Training took {execution_time:.2f} seconds to finish") 225 | 226 | 227 | if __name__ == "__main__": 228 | main() 229 | -------------------------------------------------------------------------------- /experiments/2wheeler/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 
| import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import login, logout, setup_check 20 | 21 | 22 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 23 | def run(cfg: DictConfig) -> None: 24 | print(OmegaConf.to_yaml(cfg)) 25 | 26 | # make environment. 27 | setup_check(robot="2wheeler", config=cfg) 28 | env, action_space, state_space = make_env(cfg) 29 | 30 | # make agent 31 | agent, project_name = get_agent(action_space, state_space, cfg) 32 | login(agent) 33 | agent.eval() 34 | 35 | # initialize wandb 36 | wandb.init(project=project_name + "_eval") 37 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 38 | 39 | eval_episodes = cfg.episodes 40 | quit = False 41 | _ = input("Press Enter to start evaluation...") 42 | try: 43 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 44 | td = env.reset() 45 | done = td.get("done", False) 46 | truncated = td.get("truncated", False) 47 | ep_return = 0 48 | ep_steps = 0 49 | total_step_times = [] 50 | actions = [] 51 | print("Start new evaluation...", flush=True) 52 | while not done and not truncated: 53 | ep_steps += 1 54 | step_start_time = time.time() 55 | td = agent.get_eval_action(td) 56 | actions.append(td.get("action").cpu().numpy()) 57 | td = env.step(td) 58 | agent.add_experience(td) 59 | total_agent_step_time = time.time() - step_start_time 60 | total_step_times.append(total_agent_step_time) 61 | done = td.get(("next", "done"), False) 62 | ep_return += td.get(("next", "reward"), 0) 63 | if done: 64 | if cfg.env.name == "runaway-v0": 65 | inpt = input( 66 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 67 | ) 68 | if inpt == "q": 69 | quit = True 70 | break 71 | td = step_mdp(td) 72 | 73 | if quit: 74 | break 75 | 76 | # Metrics Logging 77 | log_dict = { 78 | "epoch": e, 79 | "reward": ep_return, 80 | "steps": ep_steps, 81 | "total_step_time": np.mean(total_step_times), 82 | "buffer_size": agent.replay_buffer.__len__(), 83 | "done": done.float(), 84 | "mean_action": np.mean(actions), 85 | } 86 | if cfg.env.name == "runaway-v0": 87 | log_dict.update({"distance": td.get("distance")}) 88 | 89 | wandb.log(log_dict) 90 | 91 | except KeyboardInterrupt: 92 | print("Evaluation interrupted by user.") 93 | logout(agent) 94 | env.close() 95 | 96 | 97 | if __name__ == "__main__": 98 | run() 99 | -------------------------------------------------------------------------------- /experiments/2wheeler/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, 
logout, setup_check, tensordict2dict 17 | 18 | 19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="2wheeler", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/2wheeler/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | login, 21 | logout, 22 | prefill_buffer, 23 | setup_check, 24 | tensordict2dict, 25 | ) 26 | 27 | 28 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 29 | def run(cfg: DictConfig) -> None: 30 | print(OmegaConf.to_yaml(cfg)) 31 | 32 | # make environment. 
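# setup_check() asserts that the configured env (cfg.env.name) belongs to the 2wheeler environments, and make_env(cfg) returns the environment together with its action and state specs, which are passed to get_agent() below to build the agent.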
33 | setup_check(robot="2wheeler", config=cfg) 34 | env, action_space, state_space = make_env(cfg) 35 | 36 | # make agent 37 | agent, project_name = get_agent(action_space, state_space, cfg) 38 | login(agent) 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name) 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 44 | 45 | # prefill buffer with random actions 46 | prefill_buffer( 47 | env=env, 48 | agent=agent, 49 | num_episodes=cfg.agent.prefill_episodes, 50 | stop_on_done=True if cfg.env.name == "runaway-v0" else False, 51 | ) 52 | 53 | batch_size = cfg.agent.batch_size 54 | num_updates = cfg.agent.num_updates 55 | train_episodes = cfg.episodes 56 | print("Start training...") 57 | quit = False 58 | try: 59 | for e in tqdm(range(train_episodes), desc="Training"): 60 | td = env.reset() 61 | done = td.get("done", False) 62 | truncated = td.get("truncated", False) 63 | ep_return = 0 64 | ep_steps = 0 65 | total_step_times = [] 66 | actions = [] 67 | print("Start new data collection...", flush=True) 68 | while not done and not truncated: 69 | ep_steps += 1 70 | step_start_time = time.time() 71 | td = agent.get_action(td) 72 | actions.append(td.get("action").cpu().numpy()) 73 | td = env.step(td) 74 | agent.add_experience(td) 75 | total_agent_step_time = time.time() - step_start_time 76 | total_step_times.append(total_agent_step_time) 77 | done = td.get(("next", "done"), False) 78 | ep_return += td.get(("next", "reward"), 0) 79 | if done: 80 | if cfg.env.name == "runaway-v0": 81 | inpt = input( 82 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 83 | ) 84 | if inpt == "q": 85 | quit = True 86 | break 87 | td = step_mdp(td) 88 | loss_info = agent.train( 89 | batch_size=batch_size, num_updates=num_updates * ep_steps 90 | ) 91 | if quit: 92 | break 93 | 94 | # Metrics Logging 95 | log_dict = { 96 | "epoch": e, 97 | "reward": ep_return, 98 | "steps": ep_steps, 99 | "total_step_time": np.mean(total_step_times), 100 | "buffer_size": agent.replay_buffer.__len__(), 101 | "done": done.float(), 102 | "mean_action": np.mean(actions), 103 | } 104 | if cfg.env.name == "runaway-v0": 105 | log_dict.update({"distance": td.get("distance")}) 106 | 107 | log_dict.update(tensordict2dict(loss_info)) 108 | wandb.log(log_dict) 109 | 110 | except KeyboardInterrupt: 111 | print("Training interrupted by user.") 112 | 113 | logout(agent) 114 | env.close() 115 | 116 | 117 | if __name__ == "__main__": 118 | run() 119 | -------------------------------------------------------------------------------- /experiments/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/__init__.py -------------------------------------------------------------------------------- /experiments/helper/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from experiments.helper.agents.behavior_cloning import BehavioralCloningAgent 2 | from experiments.helper.agents.cql import CQLAgent 3 | from experiments.helper.agents.iql import IQLAgent 4 | from experiments.helper.agents.random import RandomAgent 5 | from experiments.helper.agents.sac import SACAgent 6 | from experiments.helper.agents.td3 import TD3Agent 7 | 8 | all_agents = ["td3", "sac", "iql", "cql", "bc", "random"] 9 | 10 | 11 | def 
get_agent(action_spec, state_spec, cfg): 12 | if cfg.agent.name == "td3": 13 | agent = TD3Agent( 14 | action_spec=action_spec, 15 | state_spec=state_spec, 16 | agent_config=cfg.agent, 17 | device=cfg.device, 18 | ) 19 | elif cfg.agent.name == "sac": 20 | agent = SACAgent( 21 | action_spec=action_spec, 22 | state_spec=state_spec, 23 | agent_config=cfg.agent, 24 | device=cfg.device, 25 | ) 26 | elif cfg.agent.name == "bc": 27 | agent = BehavioralCloningAgent( 28 | action_spec=action_spec, 29 | state_spec=state_spec, 30 | agent_config=cfg.agent, 31 | device=cfg.device, 32 | ) 33 | elif cfg.agent.name == "random": 34 | agent = RandomAgent( 35 | action_spec=action_spec, 36 | state_spec=state_spec, 37 | agent_config=cfg.agent, 38 | device=cfg.device, 39 | ) 40 | elif cfg.agent.name == "iql": 41 | agent = IQLAgent( 42 | action_spec=action_spec, 43 | state_spec=state_spec, 44 | agent_config=cfg.agent, 45 | device=cfg.device, 46 | ) 47 | elif cfg.agent.name == "cql": 48 | agent = CQLAgent( 49 | action_spec=action_spec, 50 | state_spec=state_spec, 51 | agent_config=cfg.agent, 52 | device=cfg.device, 53 | ) 54 | else: 55 | raise NotImplementedError( 56 | f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" 57 | ) 58 | 59 | project_name = f"lego-{cfg.agent.name}-{cfg.env.name}" 60 | print("--- Agent initialized ---", flush=True) 61 | 62 | return agent, project_name 63 | -------------------------------------------------------------------------------- /experiments/helper/agents/base.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from tensordict import TensorDictBase 8 | from torchrl.data.tensor_specs import TensorSpec 9 | from torchrl.envs.utils import set_exploration_mode 10 | 11 | 12 | class BaseAgent: 13 | """Implements a base agent used to interact with the lego robots. 14 | 15 | Args: 16 | state_spec (TensorSpec): The state specification of the environment. 17 | action_spec (TensorSpec): The action specification of the environment. 18 | agent_name (str): The name of the agent. 19 | device (str): The device to use for computation. 20 | 21 | Attributes: 22 | name (str): The name of the agent. 23 | observation_spec (TensorSpec): The state specification of the environment. 24 | action_spec (TensorSpec): The action specification of the environment. 25 | device (str): The device to use for computation. 26 | observation_keys (List[str]): The keys used to access the observation in the tensor dictionary. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | state_spec: TensorSpec, 32 | action_spec: TensorSpec, 33 | agent_name: str, 34 | device: str = "cpu", 35 | ): 36 | self.name = agent_name 37 | self.observation_spec = state_spec 38 | self.action_spec = action_spec 39 | self.device = device 40 | self.observation_keys = [key for key in self.observation_spec.keys()] 41 | 42 | def init_nets(self, model: nn.Module): 43 | """Initializes the networks with random data. 44 | 45 | Args: 46 | model (list): A list of PyTorch models to initialize. 
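Example (as used by the SAC agent): `self.init_nets([self.actor, self.critic])` draws a random observation tensordict from the observation spec and passes it once through each network.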
47 | """ 48 | with torch.no_grad(), set_exploration_mode("random"): 49 | td = self.observation_spec.rand() 50 | td = td.to(self.device) 51 | for net in model: 52 | net(td) 53 | del td 54 | 55 | def eval(self): 56 | """Sets the agent to evaluation mode.""" 57 | raise NotImplementedError 58 | 59 | @staticmethod 60 | def reset_parameter(param): 61 | if param.data.ndimension() == 2: # Weights 62 | init.kaiming_uniform_(param.data, a=math.sqrt(5)) 63 | else: # Biases and others 64 | # Adjust based on your specific needs 65 | init.uniform_(param.data, -1, 1) 66 | 67 | def get_action(self, tensordict: TensorDictBase) -> TensorDictBase: 68 | """Returns a sampled action given a tensordict to collect data. 69 | 70 | Args: 71 | tensordict (TensorDictBase): Tensordict containing the current state of the environment. 72 | 73 | Returns: 74 | TensorDictBase: TensorDict containing the sampled action to take in the environment. 75 | """ 76 | raise NotImplementedError 77 | 78 | def get_eval_action(self, tensordict: TensorDictBase) -> TensorDictBase: 79 | """Returns an action given a tensordict to evaluate the agent. 80 | 81 | Args: 82 | tensordict (TensorDictBase): Tensordict containing the current state of the environment. 83 | 84 | Returns: 85 | TensorDictBase: TensorDict containing the eval action to take in the environment. 86 | """ 87 | raise NotImplementedError 88 | 89 | def train( 90 | self, 91 | ): 92 | """Trains the agent. 93 | 94 | Raises: 95 | NotImplementedError: This method must be implemented by a subclass. 96 | """ 97 | raise NotImplementedError 98 | -------------------------------------------------------------------------------- /experiments/helper/agents/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensordict as td 3 | import torch 4 | 5 | from experiments.helper.agents.base import BaseAgent 6 | from experiments.helper.networks.networks import ( 7 | get_deterministic_actor, 8 | get_stochastic_actor, 9 | ) 10 | from tensordict import TensorDictBase 11 | from torch import nn, optim 12 | from torchrl.data import BoundedTensorSpec, TensorDictReplayBuffer 13 | 14 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage 15 | from torchrl.envs import RenameTransform, ToTensorImage 16 | from torchrl.envs.utils import ExplorationType, set_exploration_type 17 | 18 | 19 | def initialize(net, std=0.02): 20 | for p, n in net.named_parameters(): 21 | if "weight" in p: 22 | # nn.init.xavier_uniform_(n) 23 | nn.init.normal_(n, mean=0, std=std) 24 | elif "bias" in p: 25 | nn.init.zeros_(n) 26 | 27 | 28 | class BehavioralCloningAgent(BaseAgent): 29 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 30 | super(BehavioralCloningAgent, self).__init__( 31 | state_spec, action_spec, agent_config.name, device 32 | ) 33 | 34 | if agent_config.policy_type == "deterministic": 35 | self.actor = get_deterministic_actor(state_spec, action_spec, agent_config) 36 | elif agent_config.policy_type == "stochastic": 37 | raise NotImplementedError( 38 | "Stochastic actor training is not implemented yet" 39 | ) 40 | # TODO: Implement stochastic actor training 41 | # self.actor = get_stochastic_actor( 42 | # state_spec, action_spec, agent_config 43 | # ) 44 | else: 45 | raise ValueError( 46 | "policy_type not recognized, choose deterministic or stochastic" 47 | ) 48 | self.actor.to(device) 49 | # initialize networks 50 | self.init_nets([self.actor]) 51 | 52 | self.optimizer = optim.Adam( 53 | 
self.actor.parameters(), lr=agent_config.lr, weight_decay=0.0 54 | ) 55 | 56 | # create replay buffer 57 | self.batch_size = agent_config.batch_size 58 | self.replay_buffer = self.create_replay_buffer() 59 | 60 | # general stats 61 | self.collected_transitions = 0 62 | self.do_pretrain = False 63 | self.episodes = 0 64 | 65 | def get_agent_statedict(self): 66 | """Save agent""" 67 | act_statedict = self.actor.state_dict() 68 | return {"actor": act_statedict} 69 | 70 | def load_model(self, path): 71 | """load model""" 72 | try: 73 | statedict = torch.load(path) 74 | self.actor.load_state_dict(statedict["actor"]) 75 | print("Model loaded") 76 | except: 77 | raise ValueError("Model not loaded") 78 | 79 | def load_replaybuffer(self, path): 80 | """load replay buffer""" 81 | try: 82 | loaded_data = TensorDictBase.load_memmap(path) 83 | self.replay_buffer.extend(loaded_data) 84 | if self.replay_buffer._batch_size != self.batch_size: 85 | Warning( 86 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 87 | ) 88 | self.replay_buffer._batch_size = self.batch_size 89 | print("Replay Buffer loaded") 90 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 91 | except: 92 | raise ValueError("Replay Buffer not loaded") 93 | 94 | def eval(self): 95 | """Sets the agent to evaluation mode.""" 96 | self.actor.eval() 97 | 98 | @torch.no_grad() 99 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 100 | """Get eval action from actor network""" 101 | with set_exploration_type(ExplorationType.MODE): 102 | out_td = self.actor(td.to(self.device)) 103 | return out_td 104 | 105 | def create_replay_buffer( 106 | self, 107 | buffer_size=1000000, 108 | buffer_scratch_dir="./tmp", 109 | device="cpu", 110 | prefetch=3, 111 | ): 112 | """Create replay buffer""" 113 | 114 | replay_buffer = TensorDictReplayBuffer( 115 | pin_memory=False, 116 | prefetch=prefetch, 117 | storage=LazyMemmapStorage( 118 | buffer_size, 119 | scratch_dir=buffer_scratch_dir, 120 | ), 121 | batch_size=self.batch_size, 122 | ) 123 | replay_buffer.append_transform(lambda x: x.to(device)) 124 | # TODO: check if we have image in observation space if so add this transform 125 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 126 | 127 | return replay_buffer 128 | 129 | @torch.no_grad() 130 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 131 | """Get action from actor network""" 132 | with set_exploration_type(ExplorationType.RANDOM): 133 | out_td = self.actor(td.to(self.device)) 134 | return out_td 135 | 136 | def add_experience(self, transition: td.TensorDict): 137 | """Add experience to replay buffer""" 138 | """Add experience to replay buffer""" 139 | self.replay_buffer.extend(transition) 140 | self.collected_transitions += 1 141 | 142 | def train(self, batch_size=64, num_updates=1): 143 | """Train the agent""" 144 | log_data = {} 145 | 146 | for i in range(num_updates): 147 | batch = self.replay_buffer.sample(batch_size).to(self.device) 148 | orig_action = batch.get("action").clone() 149 | 150 | out_dict = self.actor(batch) 151 | loss = torch.mean((out_dict.get("action") - orig_action) ** 2) 152 | self.optimizer.zero_grad() 153 | loss.backward() 154 | self.optimizer.step() 155 | log_data.update({"loss": loss}) 156 | return log_data 157 | -------------------------------------------------------------------------------- /experiments/helper/agents/cql.py: 
-------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor 6 | from tensordict import TensorDictBase 7 | from torch import optim 8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 10 | from torchrl.envs.utils import ExplorationType, set_exploration_type 11 | from torchrl.objectives import SoftUpdate 12 | 13 | from torchrl.objectives.cql import CQLLoss 14 | 15 | 16 | class CQLAgent(BaseAgent): 17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 18 | super(CQLAgent, self).__init__( 19 | state_spec, action_spec, agent_config.name, device 20 | ) 21 | 22 | with_lagrange = agent_config.with_lagrange 23 | 24 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 25 | self.critic = get_critic(state_spec, agent_config) 26 | 27 | self.actor.to(device) 28 | self.critic.to(device) 29 | 30 | # initialize networks 31 | self.init_nets([self.actor, self.critic]) 32 | 33 | # define loss function 34 | self.loss_module = CQLLoss( 35 | actor_network=self.actor, 36 | qvalue_network=self.critic, 37 | loss_function=agent_config.loss_function, 38 | temperature=agent_config.temperature, 39 | min_q_weight=agent_config.min_q_weight, 40 | max_q_backup=agent_config.max_q_backup, 41 | deterministic_backup=agent_config.deterministic_backup, 42 | num_random=agent_config.num_random, 43 | with_lagrange=agent_config.with_lagrange, 44 | lagrange_thresh=agent_config.lagrange_thresh, 45 | ) 46 | # Define Target Network Updater 47 | self.target_net_updater = SoftUpdate( 48 | self.loss_module, eps=agent_config.soft_update_eps 49 | ) 50 | self.target_net_updater.init_() 51 | 52 | # Reset weights 53 | self.reset_params = agent_config.reset_params 54 | 55 | # Define Replay Buffer 56 | self.batch_size = agent_config.batch_size 57 | self.replay_buffer = self.create_replay_buffer( 58 | prb=agent_config.prb, 59 | buffer_size=agent_config.buffer_size, 60 | device=device, 61 | ) 62 | 63 | # Define Optimizer 64 | critic_params = list( 65 | self.loss_module.qvalue_network_params.flatten_keys().values() 66 | ) 67 | actor_params = list( 68 | self.loss_module.actor_network_params.flatten_keys().values() 69 | ) 70 | self.optimizer_actor = optim.Adam( 71 | actor_params, lr=agent_config.lr, weight_decay=0.0 72 | ) 73 | self.optimizer_critic = optim.Adam( 74 | critic_params, lr=agent_config.lr, weight_decay=0.0 75 | ) 76 | self.optimizer_alpha = optim.Adam( 77 | [self.loss_module.log_alpha], 78 | lr=3.0e-4, 79 | ) 80 | if with_lagrange: 81 | self.alpha_prime_optim = torch.optim.Adam( 82 | [self.loss_module.log_alpha_prime], 83 | lr=agent_config.lr, 84 | ) 85 | else: 86 | self.alpha_prime_optim = None 87 | # general stats 88 | self.collected_transitions = 0 89 | self.total_updates = 0 90 | self.do_pretrain = agent_config.pretrain 91 | self.bc_steps = agent_config.bc_steps 92 | 93 | def get_agent_statedict(self): 94 | """Save agent""" 95 | act_statedict = self.actor.state_dict() 96 | critic_statedict = self.critic.state_dict() 97 | return {"actor": act_statedict, "critic": critic_statedict} 98 | 99 | def load_model(self, path): 100 | """load model""" 101 | try: 102 | statedict = torch.load(path) 103 | self.actor.load_state_dict(statedict["actor"]) 104 | 
self.critic.load_state_dict(statedict["critic"]) 105 | print("Model loaded") 106 | except: 107 | raise ValueError("Model not loaded") 108 | 109 | def load_replaybuffer(self, path): 110 | """load replay buffer""" 111 | try: 112 | loaded_data = TensorDictBase.load_memmap(path) 113 | self.replay_buffer.extend(loaded_data) 114 | if self.replay_buffer._batch_size != self.batch_size: 115 | Warning( 116 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 117 | ) 118 | self.replay_buffer._batch_size = self.batch_size 119 | print("Replay Buffer loaded") 120 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 121 | except: 122 | raise ValueError("Replay Buffer not loaded") 123 | 124 | def reset_networks(self): 125 | """reset network parameters""" 126 | print("Resetting Networks!") 127 | self.loss_module.actor_network_params.apply(self.reset_parameter) 128 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 129 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 130 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 131 | 132 | def eval(self): 133 | """Sets the agent to evaluation mode.""" 134 | self.actor.eval() 135 | 136 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 137 | # TODO not ideal to have this here 138 | td.pop("scale") 139 | td.pop("loc") 140 | td.pop("params") 141 | if "vector_obs_embedding" in td.keys(): 142 | td.pop("vector_obs_embedding") 143 | if "image_embedding" in td.keys(): 144 | td.pop("image_embedding") 145 | 146 | def create_replay_buffer( 147 | self, 148 | prb=False, 149 | buffer_size=100000, 150 | buffer_scratch_dir=None, 151 | device="cpu", 152 | prefetch=3, 153 | ): 154 | """Create replay buffer""" 155 | # TODO: make this part of base off policy agent 156 | if prb: 157 | replay_buffer = TensorDictPrioritizedReplayBuffer( 158 | alpha=0.7, 159 | beta=0.5, 160 | pin_memory=False, 161 | prefetch=1, 162 | storage=LazyTensorStorage( 163 | buffer_size, 164 | ), 165 | ) 166 | else: 167 | replay_buffer = TensorDictReplayBuffer( 168 | pin_memory=False, 169 | prefetch=prefetch, 170 | storage=LazyMemmapStorage( 171 | buffer_size, 172 | scratch_dir=buffer_scratch_dir, 173 | ), 174 | batch_size=self.batch_size, 175 | ) 176 | replay_buffer.append_transform(lambda x: x.to(device)) 177 | # TODO: check if we have image in observation space if so add this transform 178 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 179 | return replay_buffer 180 | 181 | @torch.no_grad() 182 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 183 | """Get action from actor network""" 184 | with set_exploration_type(ExplorationType.RANDOM): 185 | out_td = self.actor(td.to(self.device)) 186 | self.td_preprocessing(out_td) 187 | return out_td 188 | 189 | @torch.no_grad() 190 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 191 | """Get eval action from actor network""" 192 | with set_exploration_type(ExplorationType.MODE): 193 | out_td = self.actor(td.to(self.device)) 194 | self.td_preprocessing(out_td) 195 | return out_td 196 | 197 | def add_experience(self, transition: td.TensorDict): 198 | """Add experience to replay buffer""" 199 | self.replay_buffer.extend(transition) 200 | self.collected_transitions += 1 201 | 202 | def train(self, batch_size=64, num_updates=1): 203 | """Train the agent""" 204 | self.actor.train() 205 | for i in 
range(num_updates): 206 | self.total_updates += 1 207 | # Sample a batch from the replay buffer 208 | batch = self.replay_buffer.sample(batch_size) 209 | # Compute CQL Loss 210 | loss = self.loss_module(batch) 211 | 212 | # Update alpha 213 | alpha_loss = loss["loss_alpha"] 214 | alpha_prime_loss = loss["loss_alpha_prime"] 215 | self.optimizer_alpha.zero_grad() 216 | alpha_loss.backward() 217 | self.optimizer_alpha.step() 218 | 219 | # Update Actpr Network 220 | # official cql implementation uses behavior cloning loss for first few updating steps as it helps for some tasks 221 | if self.total_updates >= self.bc_steps: 222 | actor_loss = loss["loss_actor"] 223 | else: 224 | actor_loss = loss["loss_actor_bc"] 225 | self.optimizer_actor.zero_grad() 226 | actor_loss.backward() 227 | self.optimizer_actor.step() 228 | 229 | if self.alpha_prime_optim is not None: 230 | self.alpha_prime_optim.zero_grad() 231 | alpha_prime_loss.backward(retain_graph=True) 232 | self.alpha_prime_optim.step() 233 | 234 | # Update Critic Network 235 | q_loss = loss["loss_qvalue"] 236 | cql_loss = loss["loss_cql"] 237 | 238 | q_loss = q_loss + cql_loss 239 | self.optimizer_critic.zero_grad() 240 | q_loss.backward(retain_graph=False) 241 | self.optimizer_critic.step() 242 | 243 | # Update Target Networks 244 | self.target_net_updater.step() 245 | # Update Prioritized Replay Buffer 246 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 247 | self.replay_buffer.update_priorities( 248 | batch["indices"], 249 | loss["critic_loss"].detach().cpu().numpy(), 250 | ) 251 | self.actor.eval() 252 | return loss 253 | -------------------------------------------------------------------------------- /experiments/helper/agents/iql.py: -------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import ( 6 | get_critic, 7 | get_stochastic_actor, 8 | get_value_operator, 9 | ) 10 | from tensordict import TensorDictBase 11 | from torch import optim 12 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 13 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 14 | from torchrl.envs.transforms import ToTensorImage 15 | from torchrl.envs.utils import ExplorationType, set_exploration_type 16 | from torchrl.objectives import SoftUpdate 17 | 18 | from torchrl.objectives.iql import IQLLoss 19 | 20 | 21 | class IQLAgent(BaseAgent): 22 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 23 | super(IQLAgent, self).__init__( 24 | state_spec, action_spec, agent_config.name, device 25 | ) 26 | 27 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 28 | self.critic = get_critic(state_spec, agent_config) 29 | 30 | self.value = get_value_operator(state_spec, agent_config) 31 | 32 | self.actor.to(device) 33 | self.critic.to(device) 34 | self.value.to(device) 35 | 36 | # initialize networks 37 | self.init_nets([self.actor, self.critic, self.value]) 38 | 39 | # define loss function 40 | self.loss_module = IQLLoss( 41 | actor_network=self.actor, 42 | qvalue_network=self.critic, 43 | value_network=self.value, 44 | num_qvalue_nets=2, 45 | temperature=agent_config.temperature, 46 | expectile=agent_config.expectile, 47 | loss_function=agent_config.loss_function, 48 | ) 49 | # Define Target Network Updater 50 | self.target_net_updater = SoftUpdate( 51 | 
self.loss_module, eps=agent_config.soft_update_eps 52 | ) 53 | self.target_net_updater.init_() 54 | 55 | # Reset weights 56 | self.reset_params = agent_config.reset_params 57 | 58 | # Define Replay Buffer 59 | self.batch_size = agent_config.batch_size 60 | 61 | self.replay_buffer = self.create_replay_buffer( 62 | prb=agent_config.prb, 63 | buffer_size=agent_config.buffer_size, 64 | device=device, 65 | ) 66 | 67 | # Define Optimizer 68 | critic_params = list( 69 | self.loss_module.qvalue_network_params.flatten_keys().values() 70 | ) 71 | value_params = list( 72 | self.loss_module.value_network_params.flatten_keys().values() 73 | ) 74 | actor_params = list( 75 | self.loss_module.actor_network_params.flatten_keys().values() 76 | ) 77 | self.optimizer_actor = optim.Adam( 78 | actor_params, lr=agent_config.lr, weight_decay=0.0 79 | ) 80 | self.optimizer_critic = optim.Adam( 81 | critic_params, lr=agent_config.lr, weight_decay=0.0 82 | ) 83 | self.optimizer_value = optim.Adam( 84 | value_params, lr=agent_config.lr, weight_decay=0.0 85 | ) 86 | 87 | # general stats 88 | self.collected_transitions = 0 89 | self.total_updates = 0 90 | self.do_pretrain = agent_config.pretrain 91 | 92 | def get_agent_statedict(self): 93 | """Save agent""" 94 | act_statedict = self.actor.state_dict() 95 | critic_statedict = self.critic.state_dict() 96 | value_statedict = self.value.state_dict() 97 | return { 98 | "actor": act_statedict, 99 | "critic": critic_statedict, 100 | "value": value_statedict, 101 | } 102 | 103 | def load_model(self, path): 104 | """load model""" 105 | 106 | try: 107 | statedict = torch.load(path) 108 | self.actor.load_state_dict(statedict["actor"]) 109 | self.critic.load_state_dict(statedict["critic"]) 110 | self.value.load_state_dict(statedict["value"]) 111 | print("Model loaded") 112 | except: 113 | raise ValueError("Model not loaded") 114 | 115 | def load_replaybuffer(self, path): 116 | """load replay buffer""" 117 | try: 118 | loaded_data = TensorDictBase.load_memmap(path) 119 | self.replay_buffer.extend(loaded_data) 120 | if self.replay_buffer._batch_size != self.batch_size: 121 | Warning( 122 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
123 | ) 124 | self.replay_buffer._batch_size = self.batch_size 125 | print("Replay Buffer loaded") 126 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 127 | except: 128 | raise ValueError("Replay Buffer not loaded") 129 | 130 | def reset_networks(self): 131 | """reset network parameters""" 132 | print("Resetting Networks!") 133 | self.loss_module.actor_network_params.apply(self.reset_parameter) 134 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 135 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 136 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 137 | self.loss_module.value_network_params.apply(self.reset_parameter) 138 | 139 | def eval(self): 140 | """Sets the agent to evaluation mode.""" 141 | self.actor.eval() 142 | 143 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 144 | # TODO not ideal to have this here 145 | td.pop("scale") 146 | td.pop("loc") 147 | td.pop("params") 148 | if "vector_obs_embedding" in td.keys(): 149 | td.pop("vector_obs_embedding") 150 | if "image_embedding" in td.keys(): 151 | td.pop("image_embedding") 152 | 153 | def create_replay_buffer( 154 | self, 155 | prb=False, 156 | buffer_size=100000, 157 | buffer_scratch_dir=None, 158 | device="cpu", 159 | prefetch=3, 160 | ): 161 | """Create replay buffer""" 162 | # TODO: make this part of base off policy agent 163 | if prb: 164 | replay_buffer = TensorDictPrioritizedReplayBuffer( 165 | alpha=0.7, 166 | beta=0.5, 167 | pin_memory=False, 168 | prefetch=1, 169 | storage=LazyTensorStorage( 170 | buffer_size, 171 | device=device, 172 | ), 173 | ) 174 | else: 175 | replay_buffer = TensorDictReplayBuffer( 176 | pin_memory=False, 177 | prefetch=prefetch, 178 | storage=LazyMemmapStorage( 179 | buffer_size, 180 | scratch_dir=buffer_scratch_dir, 181 | ), 182 | batch_size=self.batch_size, 183 | ) 184 | replay_buffer.append_transform(lambda x: x.to(device)) 185 | # TODO: check if we have image in observation space if so add this transform 186 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 187 | 188 | return replay_buffer 189 | 190 | @torch.no_grad() 191 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 192 | """Get action from actor network""" 193 | with set_exploration_type(ExplorationType.RANDOM): 194 | out_td = self.actor(td.to(self.device)) 195 | self.td_preprocessing(out_td) 196 | return out_td 197 | 198 | @torch.no_grad() 199 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 200 | """Get eval action from actor network""" 201 | with set_exploration_type(ExplorationType.MODE): 202 | out_td = self.actor(td.to(self.device)) 203 | self.td_preprocessing(out_td) 204 | return out_td 205 | 206 | def add_experience(self, transition: td.TensorDict): 207 | """Add experience to replay buffer""" 208 | self.replay_buffer.extend(transition) 209 | self.collected_transitions += 1 210 | 211 | def pretrain(self, wandb, batch_size=64, num_updates=1): 212 | """Pretrain the agent with simple behavioral cloning""" 213 | # TODO: implement pretrain for testing 214 | # for i in range(num_updates): 215 | # batch = self.replay_buffer.sample(batch_size) 216 | # pred, _ = self.actor(batch["observations"].float()) 217 | # loss = torch.mean((pred - batch["actions"]) ** 2) 218 | # self.optimizer.zero_grad() 219 | # loss.backward() 220 | # self.optimizer.step() 221 | # wandb.log({"pretrain/loss": loss.item()}) 222 | 223 | def train(self, batch_size=64, num_updates=1): 224 | 
"""Train the agent""" 225 | self.actor.train() 226 | for i in range(num_updates): 227 | self.total_updates += 1 228 | if self.reset_params and self.total_updates % self.reset_params == 0: 229 | self.reset_networks() 230 | # Sample a batch from the replay buffer 231 | batch = self.replay_buffer.sample(batch_size) 232 | # Compute IQL Loss 233 | loss = self.loss_module(batch) 234 | 235 | # Update Actpr Network 236 | self.optimizer_actor.zero_grad() 237 | loss["loss_actor"].backward() 238 | self.optimizer_actor.step() 239 | # Update Critic Network 240 | self.optimizer_critic.zero_grad() 241 | loss["loss_qvalue"].backward() 242 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 243 | self.optimizer_critic.step() 244 | # Update Value Network 245 | self.optimizer_value.zero_grad() 246 | loss["loss_value"].backward() 247 | self.optimizer_value.step() 248 | 249 | # Update Target Networks 250 | self.target_net_updater.step() 251 | # Update Prioritized Replay Buffer 252 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 253 | self.replay_buffer.update_priorities( 254 | batch["indices"], 255 | loss["critic_loss"].detach().cpu().numpy(), 256 | ) 257 | self.actor.eval() 258 | return loss 259 | -------------------------------------------------------------------------------- /experiments/helper/agents/random.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from experiments.helper.agents.base import BaseAgent 4 | from tensordict import TensorDictBase 5 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 6 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 7 | 8 | 9 | class RandomAgent(BaseAgent): 10 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 11 | super(RandomAgent, self).__init__( 12 | state_spec, action_spec, agent_config.name, device 13 | ) 14 | 15 | self.actor = None 16 | self.replay_buffer = self.create_replay_buffer( 17 | batch_size=256, 18 | prb=False, 19 | buffer_size=1000000, 20 | device=device, 21 | buffer_scratch_dir="/tmp", 22 | ) 23 | 24 | def eval(self): 25 | """Sets the agent to evaluation mode.""" 26 | 27 | @torch.no_grad() 28 | def get_action(self, tensordict: TensorDictBase): 29 | """Sample random actions from a uniform distribution""" 30 | tensordict.set("action", self.action_spec.rand()) 31 | return tensordict 32 | 33 | @torch.no_grad() 34 | def get_eval_action(self, tensordict: TensorDictBase): 35 | """Sample random actions from a uniform distribution""" 36 | tensordict.set("action", self.action_spec.rand()) 37 | return tensordict 38 | 39 | def add_experience(self, transition: TensorDictBase): 40 | """Add experience to replay buffer""" 41 | self.replay_buffer.extend(transition) 42 | 43 | def train(self, batch_size=64, num_updates=1): 44 | """Train the agent""" 45 | return {} 46 | 47 | def create_replay_buffer( 48 | self, 49 | batch_size=256, 50 | prb=False, 51 | buffer_size=100000, 52 | buffer_scratch_dir=None, 53 | device="cpu", 54 | prefetch=3, 55 | ): 56 | """Create replay buffer""" 57 | # TODO: make this part of base off policy agent 58 | if prb: 59 | replay_buffer = TensorDictPrioritizedReplayBuffer( 60 | alpha=0.7, 61 | beta=0.5, 62 | pin_memory=False, 63 | prefetch=1, 64 | storage=LazyTensorStorage( 65 | buffer_size, 66 | ), 67 | ) 68 | else: 69 | replay_buffer = TensorDictReplayBuffer( 70 | pin_memory=False, 71 | prefetch=prefetch, 72 | storage=LazyMemmapStorage( 73 | buffer_size, 74 | 
scratch_dir=buffer_scratch_dir, 75 | ), 76 | batch_size=batch_size, 77 | ) 78 | return replay_buffer 79 | -------------------------------------------------------------------------------- /experiments/helper/agents/sac.py: -------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor 6 | from tensordict import TensorDictBase 7 | from torch import optim 8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 10 | from torchrl.envs.utils import ExplorationType, set_exploration_type 11 | from torchrl.objectives import SoftUpdate 12 | 13 | from torchrl.objectives.sac import SACLoss 14 | 15 | 16 | class SACAgent(BaseAgent): 17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 18 | super(SACAgent, self).__init__( 19 | state_spec, action_spec, agent_config.name, device 20 | ) 21 | 22 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 23 | self.critic = get_critic(state_spec, agent_config) 24 | 25 | self.actor.to(device) 26 | self.critic.to(device) 27 | 28 | # initialize networks 29 | self.init_nets([self.actor, self.critic]) 30 | 31 | # define loss function 32 | self.loss_module = SACLoss( 33 | actor_network=self.actor, 34 | qvalue_network=self.critic, 35 | delay_qvalue=True, 36 | value_network=None, # None to use SAC version 2 37 | num_qvalue_nets=2, 38 | fixed_alpha=agent_config.fixed_alpha, 39 | alpha_init=agent_config.alpha_init, 40 | loss_function=agent_config.loss_function, 41 | ) 42 | # Define Target Network Updater 43 | self.target_net_updater = SoftUpdate( 44 | self.loss_module, eps=agent_config.soft_update_eps 45 | ) 46 | self.target_net_updater.init_() 47 | 48 | # Reset weights 49 | self.reset_params = agent_config.reset_params 50 | 51 | self.batch_size = agent_config.batch_size 52 | # Define Replay Buffer 53 | self.buffer_batch_size = agent_config.batch_size 54 | self.replay_buffer = self.create_replay_buffer( 55 | prb=agent_config.prb, 56 | buffer_size=agent_config.buffer_size, 57 | buffer_scratch_dir="/tmp", 58 | device=device, 59 | ) 60 | # Define Optimizer 61 | critic_params = list( 62 | self.loss_module.qvalue_network_params.flatten_keys().values() 63 | ) 64 | actor_params = list( 65 | self.loss_module.actor_network_params.flatten_keys().values() 66 | ) 67 | self.optimizer_actor = optim.Adam( 68 | actor_params, lr=agent_config.lr, weight_decay=0.0 69 | ) 70 | self.optimizer_critic = optim.Adam( 71 | critic_params, lr=agent_config.lr, weight_decay=0.0 72 | ) 73 | self.optimizer_alpha = optim.Adam( 74 | [self.loss_module.log_alpha], 75 | lr=3.0e-4, 76 | ) 77 | 78 | # general stats 79 | self.collected_transitions = 0 80 | self.total_updates = 0 81 | 82 | def get_agent_statedict(self): 83 | """Save agent""" 84 | act_statedict = self.actor.state_dict() 85 | critic_statedict = self.critic.state_dict() 86 | return {"actor": act_statedict, "critic": critic_statedict} 87 | 88 | def load_model(self, path): 89 | """load model""" 90 | try: 91 | statedict = torch.load(path) 92 | self.actor.load_state_dict(statedict["actor"]) 93 | self.critic.load_state_dict(statedict["critic"]) 94 | print("Model loaded") 95 | except: 96 | raise ValueError("Model not loaded") 97 | 98 | def load_replaybuffer(self, path): 99 | """load 
replay buffer""" 100 | try: 101 | loaded_data = TensorDictBase.load_memmap(path) 102 | self.replay_buffer.extend(loaded_data) 103 | if self.replay_buffer._batch_size != self.batch_size: 104 | Warning( 105 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 106 | ) 107 | self.replay_buffer._batch_size = self.batch_size 108 | print("Replay Buffer loaded") 109 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 110 | except: 111 | raise ValueError("Replay Buffer not loaded") 112 | 113 | def reset_networks(self): 114 | """reset network parameters""" 115 | print("Resetting Networks!") 116 | self.loss_module.actor_network_params.apply(self.reset_parameter) 117 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 118 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 119 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 120 | 121 | def eval(self): 122 | """Sets the agent to evaluation mode.""" 123 | self.actor.eval() 124 | 125 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 126 | # TODO not ideal to have this here 127 | td.pop("scale") 128 | td.pop("loc") 129 | td.pop("params") 130 | if "obs_embedding" in td.keys(): 131 | td.pop("obs_embedding") 132 | if "pixel_embedding" in td.keys(): 133 | td.pop("pixel_embedding") 134 | 135 | def create_replay_buffer( 136 | self, 137 | prb=False, 138 | buffer_size=100000, 139 | buffer_scratch_dir=".", 140 | device="cpu", 141 | prefetch=3, 142 | ): 143 | """Create replay buffer""" 144 | # TODO: make this part of base off policy agent 145 | if prb: 146 | replay_buffer = TensorDictPrioritizedReplayBuffer( 147 | alpha=0.7, 148 | beta=0.5, 149 | pin_memory=False, 150 | prefetch=1, 151 | storage=LazyTensorStorage( 152 | buffer_size, 153 | ), 154 | ) 155 | else: 156 | replay_buffer = TensorDictReplayBuffer( 157 | pin_memory=False, 158 | prefetch=prefetch, 159 | storage=LazyMemmapStorage( 160 | buffer_size, 161 | scratch_dir=buffer_scratch_dir, 162 | ), 163 | batch_size=self.batch_size, 164 | ) 165 | replay_buffer.append_transform(lambda x: x.to(device)) 166 | # TODO: check if we have image in observation space if so add this transform 167 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 168 | 169 | return replay_buffer 170 | 171 | @torch.no_grad() 172 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 173 | """Get action from actor network""" 174 | with set_exploration_type(ExplorationType.RANDOM): 175 | out_td = self.actor(td.to(self.device)) 176 | self.td_preprocessing(out_td) 177 | return out_td 178 | 179 | @torch.no_grad() 180 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 181 | """Get eval action from actor network""" 182 | with set_exploration_type(ExplorationType.MODE): 183 | out_td = self.actor(td.to(self.device)) 184 | self.td_preprocessing(out_td) 185 | return out_td 186 | 187 | def add_experience(self, transition: td.TensorDict): 188 | """Add experience to replay buffer""" 189 | self.replay_buffer.extend(transition) 190 | self.collected_transitions += 1 191 | 192 | def train(self, batch_size=64, num_updates=1): 193 | """Train the agent""" 194 | self.actor.train() 195 | for i in range(num_updates): 196 | self.total_updates += 1 197 | if self.reset_params and self.total_updates % self.reset_params == 0: 198 | self.reset_networks() 199 | # Sample a batch from the replay buffer 200 | batch 
= self.replay_buffer.sample(batch_size) 201 | # Compute SAC Loss 202 | loss = self.loss_module(batch) 203 | 204 | # Update Actpr Network 205 | self.optimizer_actor.zero_grad() 206 | loss["loss_actor"].backward() 207 | self.optimizer_actor.step() 208 | # Update Critic Network 209 | self.optimizer_critic.zero_grad() 210 | loss["loss_qvalue"].backward() 211 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 212 | self.optimizer_critic.step() 213 | 214 | # Update alpha 215 | self.optimizer_alpha.zero_grad() 216 | loss["loss_alpha"].backward() 217 | self.optimizer_alpha.step() 218 | 219 | # Update Target Networks 220 | self.target_net_updater.step() 221 | # Update Prioritized Replay Buffer 222 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 223 | self.replay_buffer.update_priorities( 224 | batch["indices"], 225 | loss["critic_loss"].detach().cpu().numpy(), 226 | ) 227 | self.actor.eval() 228 | return loss 229 | -------------------------------------------------------------------------------- /experiments/helper/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/networks/__init__.py -------------------------------------------------------------------------------- /experiments/helper/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import tensordict as td 5 | import torch 6 | from bricksrl.environments import ALL_2WHEELER_ENVS, ALL_ROBOARM_ENVS, ALL_WALKER_ENVS 7 | from moviepy.editor import concatenate_videoclips, ImageClip 8 | from omegaconf import DictConfig 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.envs.utils import step_mdp 11 | from tqdm import tqdm 12 | 13 | 14 | def setup_check(robot: str, config: DictConfig): 15 | if robot == "2wheeler": 16 | assert ( 17 | config.env.name in ALL_2WHEELER_ENVS 18 | ), f"You are trying to run a 2wheeler experiment but are using the env {config.env.name}, select one of {ALL_2WHEELER_ENVS}" 19 | elif robot == "walker": 20 | assert ( 21 | config.env.name in ALL_WALKER_ENVS 22 | ), f"You are trying to run a walker experiment but are using the env {config.env.name}, select one of {ALL_WALKER_ENVS}" 23 | elif robot == "roboarm": 24 | assert ( 25 | config.env.name in ALL_ROBOARM_ENVS 26 | ), f"You are trying to run a roboarm experiment but are using the env {config.env.name}, select one of {ALL_ROBOARM_ENVS}" 27 | 28 | 29 | def data2numpy(data: list): 30 | """Convert a list of bytes to a numpy array.""" 31 | return np.array(data)[None, :] 32 | 33 | 34 | def handle_disconnect(_): 35 | print("Hub was disconnected.") 36 | 37 | 38 | def tensordict2dict(td: TensorDictBase) -> dict: 39 | """Convert a TensorDict to a dictionary.""" 40 | return {k: v.item() for k, v in td.items()} 41 | 42 | 43 | def logout(agent): 44 | # TODO save model or training data 45 | x = input("Do you want to save the model? (y/n)") 46 | if x == "y": 47 | save_name = input("Enter the name of the file to save: ") 48 | torch.save(agent.get_agent_statedict(), save_name + ".pth") 49 | x = input("Do you want to save the replay buffer? 
(y/n)") 50 | if x == "y": 51 | save_name = input("Enter the name of the file to save: ") 52 | # agent.replay_buffer.dump(save_name) 53 | batched_data = agent.replay_buffer.storage._storage[ 54 | : agent.replay_buffer.__len__() 55 | ] 56 | batched_data.save(save_name, copy_existing=True) 57 | 58 | 59 | def login(agent): 60 | x = input("Do you want to load the model? (y/n)") 61 | if x == "y": 62 | save_name = input("Enter the name of the file to load: ") 63 | agent.load_model(save_name) 64 | else: 65 | print("Model not loaded!") 66 | x = input("Do you want to load the replay buffer? (y/n)") 67 | if x == "y": 68 | save_name = input("Enter the name of the file to load: ") 69 | agent.load_replaybuffer(save_name) 70 | else: 71 | print("Buffer not loaded!") 72 | 73 | 74 | def prefill_buffer(env, agent, num_episodes=10, stop_on_done=False): 75 | """ 76 | Prefills the agent's replay buffer with experiences by running the environment for a specified number of episodes. 77 | 78 | Args: 79 | - env: gym.Env object representing the environment 80 | - agent: Agent object with an add_experience method to add experiences to the replay buffer 81 | - num_episodes: int, number of episodes to run the environment for 82 | 83 | Returns: None 84 | """ 85 | if agent.name in ["sac", "td3"]: 86 | inpt = input("Press Enter to start prefilling episode: ") 87 | for e in tqdm(range(num_episodes), desc="Prefilling buffer"): 88 | print("Prefill episode: ", e) 89 | td = env.reset() 90 | done = False 91 | truncated = False 92 | while not done and not truncated: 93 | td = env.sample_random_action(td) 94 | td = env.step(td) 95 | agent.add_experience(td) 96 | done = td.get(("next", "done")) 97 | 98 | if done and stop_on_done: 99 | inpt = input( 100 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 101 | ) 102 | if inpt == "q": 103 | break 104 | td = step_mdp(td) 105 | print("Prefill done! Buffer size: ", agent.replay_buffer.__len__()) 106 | 107 | 108 | def convert_bgr_to_rgb(bgr_image: np.array) -> np.array: 109 | return bgr_image[:, :, ::-1] # Reverses the third dimension (color channels) 110 | 111 | 112 | def create_video_from_images( 113 | images: List[np.array], video_name: str = "episode_1", fps: int = 20 114 | ): 115 | # Convert each NumPy array image to an ImageClip 116 | clips = [ImageClip(convert_bgr_to_rgb(np_img.squeeze(0))) for np_img in images] 117 | 118 | # Set the duration of each clip to match the desired FPS 119 | # Note: This assumes all images should be displayed for an equal amount of time. 
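# With clip.duration = 1 / fps, a list of N captured frames yields a video of roughly N / fps seconds (e.g. 100 frames at fps=5 give a 20 second clip).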
120 | for clip in clips: 121 | clip.duration = 1 / fps 122 | 123 | # Concatenate the ImageClips into a single video 124 | final_clip = concatenate_videoclips(clips, method="compose") 125 | 126 | # Write the result to a video file 127 | final_clip.write_videofile(video_name, fps=fps) 128 | -------------------------------------------------------------------------------- /experiments/roboarm/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | setup_check, 24 | ) 25 | 26 | 27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 28 | def run(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | # make environment. 32 | setup_check(robot="roboarm", config=cfg) 33 | env, action_space, state_space = make_env(cfg) 34 | 35 | # make agent 36 | agent, project_name = get_agent(action_space, state_space, cfg) 37 | login(agent) 38 | agent.eval() 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name + "_eval") 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | 44 | eval_episodes = cfg.episodes 45 | env_name = cfg.env.name 46 | quit = False 47 | _ = input("Press Enter to start evaluation...") 48 | try: 49 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 50 | td = env.reset() 51 | done = td.get("done", False) 52 | truncated = td.get("truncated", False) 53 | ep_return = 0 54 | ep_steps = 0 55 | total_step_times = [] 56 | if env_name in VIDEO_LOGGING_ENVS: 57 | image_captures = [td.get("original_pixels").numpy()] 58 | print("Start new evaluation...", flush=True) 59 | while not done and not truncated: 60 | ep_steps += 1 61 | step_start_time = time.time() 62 | td = agent.get_eval_action(td) 63 | td = env.step(td) 64 | agent.add_experience(td) 65 | if env_name in VIDEO_LOGGING_ENVS: 66 | image_captures.append( 67 | td.get(("next", "original_pixels")).cpu().numpy() 68 | ) 69 | 70 | total_agent_step_time = time.time() - step_start_time 71 | total_step_times.append(total_agent_step_time) 72 | done = td.get(("next", "done"), False) 73 | ep_return += td.get(("next", "reward"), 0) 74 | 75 | if done: 76 | break 77 | td = step_mdp(td) 78 | 79 | if quit: 80 | break 81 | 82 | # Metrics Logging 83 | log_dict = { 84 | "epoch": e, 85 | "reward": ep_return, 86 | "steps": ep_steps, 87 | "total_step_time": np.mean(total_step_times), 88 | "done": done.float(), 89 | } 90 | if env_name == "roboarm-v0" or env_name == "roboarm_sim-v0": 91 | final_error = td.get(("error")).item() 92 | log_dict.update({"final_error": final_error}) 93 | 94 | wandb.log(log_dict) 95 | if env_name in VIDEO_LOGGING_ENVS: 96 | video_name = "episode_{}.mp4".format(e) 97 | create_video_from_images(image_captures, video_name, fps=5) 98 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 99 | 100 | except 
KeyboardInterrupt: 101 | print("Evaluation interrupted by user.") 102 | 103 | logout(agent) 104 | env.close() 105 | 106 | 107 | if __name__ == "__main__": 108 | run() 109 | -------------------------------------------------------------------------------- /experiments/roboarm/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict 17 | 18 | 19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="roboarm", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/roboarm/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | prefill_buffer, 24 | setup_check, 25 | tensordict2dict, 26 | ) 27 | 28 | 29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 30 | def run(cfg: DictConfig) -> None: 31 | print(OmegaConf.to_yaml(cfg)) 32 | 33 | # make environment. 
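# For the roboarm experiments, setup_check() asserts that cfg.env.name is one of the roboarm environments (e.g. the real roboarm-v0 or the simulated roboarm_sim-v0), so the same training script can drive either the physical or the simulated arm.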
34 | setup_check(robot="roboarm", config=cfg) 35 | env, action_space, state_space = make_env(cfg) 36 | 37 | # make agent 38 | agent, project_name = get_agent(action_space, state_space, cfg) 39 | login(agent) 40 | 41 | # initialize wandb 42 | wandb.init(project=project_name) 43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 45 | 46 | # prefill buffer with random actions 47 | prefill_buffer( 48 | env=env, 49 | agent=agent, 50 | num_episodes=cfg.agent.prefill_episodes, 51 | ) 52 | 53 | batch_size = cfg.agent.batch_size 54 | num_updates = cfg.agent.num_updates 55 | env_name = cfg.env.name 56 | train_episodes = cfg.episodes 57 | max_episode_steps = cfg.env.max_episode_steps 58 | 59 | print("Start training...") 60 | quit = False 61 | try: 62 | for e in tqdm(range(train_episodes), desc="Training"): 63 | td = env.reset() 64 | done = td.get("done", False) 65 | truncated = td.get("truncated", False) 66 | ep_return = 0 67 | ep_steps = 0 68 | total_step_times = [] 69 | if env_name in VIDEO_LOGGING_ENVS: 70 | image_caputres = [td.get("original_pixels").numpy()] 71 | print("Start new data collection...", flush=True) 72 | while not done and not truncated: 73 | ep_steps += 1 74 | step_start_time = time.time() 75 | td = agent.get_action(td) 76 | td = env.step(td) 77 | if env_name in VIDEO_LOGGING_ENVS: 78 | image_caputres.append( 79 | td.get(("next", "original_pixels")).cpu().numpy() 80 | ) 81 | agent.add_experience(td) 82 | total_agent_step_time = time.time() - step_start_time 83 | total_step_times.append(total_agent_step_time) 84 | done = td.get(("next", "done"), False) 85 | ep_return += td.get(("next", "reward"), 0) 86 | 87 | td = step_mdp(td) 88 | if done: 89 | break 90 | 91 | loss_info = agent.train( 92 | batch_size=batch_size, num_updates=num_updates * ep_steps 93 | ) 94 | 95 | if quit: 96 | break 97 | 98 | # Metrics Logging 99 | log_dict = { 100 | "epoch": e, 101 | "reward": ep_return, 102 | "steps": ep_steps, 103 | "total_step_time": np.mean(total_step_times), 104 | "buffer_size": agent.replay_buffer.__len__(), 105 | "done": done.float(), 106 | } 107 | if env_name == "roboarm-v0" or env_name == "roboarm_sim-v0": 108 | final_error = td.get(("error")).item() 109 | log_dict.update({"final_error": final_error}) 110 | log_dict.update(tensordict2dict(loss_info)) 111 | wandb.log(log_dict) 112 | if env_name in VIDEO_LOGGING_ENVS and done and ep_steps < max_episode_steps: 113 | video_name = "episode_{}.mp4".format(e) 114 | create_video_from_images(image_caputres, video_name, fps=5) 115 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 116 | 117 | except KeyboardInterrupt: 118 | print("Training interrupted by user.") 119 | 120 | logout(agent) 121 | env.close() 122 | 123 | 124 | if __name__ == "__main__": 125 | run() 126 | -------------------------------------------------------------------------------- /experiments/walker/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import 
make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | setup_check, 24 | ) 25 | 26 | 27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 28 | def run(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | # make environment. 32 | setup_check(robot="walker", config=cfg) 33 | env, action_space, state_space = make_env(cfg) 34 | 35 | # make agent 36 | agent, project_name = get_agent(action_space, state_space, cfg) 37 | login(agent) 38 | agent.eval() 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name + "_eval") 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | 44 | eval_episodes = cfg.episodes 45 | env_name = cfg.env.name 46 | quit = False 47 | _ = input("Press Enter to start evaluation...") 48 | try: 49 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 50 | td = env.reset() 51 | done = td.get("done", False) 52 | truncated = td.get("truncated", False) 53 | ep_return = 0 54 | ep_steps = 0 55 | total_step_times = [] 56 | actions = [] 57 | if env_name in VIDEO_LOGGING_ENVS: 58 | image_caputres = [td.get("original_image").numpy()] 59 | # so we can reset the robot in the camera view 60 | input("Press Enter to start episode...") 61 | print("Start new evaluation...", flush=True) 62 | while not done and not truncated: 63 | ep_steps += 1 64 | step_start_time = time.time() 65 | td = agent.get_eval_action(td) 66 | actions.append(td.get("action").cpu().numpy()) 67 | td = env.step(td) 68 | agent.add_experience(td) 69 | total_agent_step_time = time.time() - step_start_time 70 | total_step_times.append(total_agent_step_time) 71 | done = td.get(("next", "done"), False) 72 | ep_return += td.get(("next", "reward"), 0) 73 | if env_name in VIDEO_LOGGING_ENVS: 74 | image_caputres.append( 75 | td.get(("next", "original_image")).cpu().numpy() 76 | ) 77 | 78 | if done: 79 | break 80 | td = step_mdp(td) 81 | 82 | if quit: 83 | break 84 | 85 | # Metrics Logging 86 | log_dict = { 87 | "epoch": e, 88 | "reward": ep_return, 89 | "steps": ep_steps, 90 | "total_step_time": np.mean(total_step_times), 91 | "buffer_size": agent.replay_buffer.__len__(), 92 | "done": done.float(), 93 | } 94 | 95 | wandb.log(log_dict) 96 | if env_name in VIDEO_LOGGING_ENVS: 97 | video_name = "episode_{}.mp4".format(e) 98 | create_video_from_images(image_caputres, video_name, fps=5) 99 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 100 | 101 | except KeyboardInterrupt: 102 | print("Evaluation interrupted by user.") 103 | 104 | logout(agent) 105 | env.close() 106 | 107 | 108 | if __name__ == "__main__": 109 | run() 110 | -------------------------------------------------------------------------------- /experiments/walker/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict 17 | 18 | 19 | 
@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="walker", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/walker/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | prefill_buffer, 24 | setup_check, 25 | tensordict2dict, 26 | ) 27 | 28 | 29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 30 | def run(cfg: DictConfig) -> None: 31 | print(OmegaConf.to_yaml(cfg)) 32 | 33 | # make environment. 
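    # As in experiments/roboarm/train.py, the env comes from the Hydra "env"
    # config group. Selecting walker_sim-v0, the simulated variant exercised in
    # tests/test_env_sim.py, lets this same training loop run without the
    # physical robot, e.g. with an illustrative override such as:
    #   python experiments/walker/train.py agent=td3 env=walker_sim-v0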
34 | setup_check(robot="walker", config=cfg) 35 | env, action_space, state_space = make_env(cfg) 36 | 37 | # make agent 38 | agent, project_name = get_agent(action_space, state_space, cfg) 39 | login(agent) 40 | 41 | # initialize wandb 42 | wandb.init(project=project_name) 43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 45 | 46 | # prefill buffer with random actions 47 | prefill_buffer(env=env, agent=agent, num_episodes=cfg.agent.prefill_episodes) 48 | 49 | batch_size = cfg.agent.batch_size 50 | num_updates = cfg.agent.num_updates 51 | env_name = cfg.env.name 52 | train_episodes = cfg.episodes 53 | print("Start training...") 54 | quit = False 55 | try: 56 | for e in tqdm(range(train_episodes), desc="Training"): 57 | td = env.reset() 58 | done = td.get("done", False) 59 | truncated = td.get("truncated", False) 60 | ep_return = 0 61 | ep_steps = 0 62 | total_step_times = [] 63 | agent_actions = [] 64 | if env_name in VIDEO_LOGGING_ENVS: 65 | image_caputres = [td.get("original_image").numpy()] 66 | # so we can reset the robot in the camera view 67 | input("Press Enter to start episode...") 68 | 69 | print("Start new data collection...", flush=True) 70 | while not done and not truncated: 71 | ep_steps += 1 72 | step_start_time = time.time() 73 | td = agent.get_action(td) 74 | td = env.step(td) 75 | agent.add_experience(td) 76 | done = td.get(("next", "done"), False) 77 | ep_return += td.get(("next", "reward"), 0) 78 | if env_name in VIDEO_LOGGING_ENVS: 79 | image_caputres.append( 80 | td.get(("next", "original_image")).cpu().numpy() 81 | ) 82 | total_agent_step_time = time.time() - step_start_time 83 | total_step_times.append(total_agent_step_time) 84 | if done: 85 | break 86 | td = step_mdp(td) 87 | loss_info = agent.train( 88 | batch_size=batch_size, num_updates=num_updates * ep_steps 89 | ) 90 | action = td.get("action").cpu().numpy() 91 | agent_actions.append(action) 92 | 93 | if quit: 94 | break 95 | 96 | # Metrics Logging 97 | log_dict = { 98 | "epoch": e, 99 | "reward": ep_return, 100 | "steps": ep_steps, 101 | "total_step_time": np.mean(total_step_times), 102 | "buffer_size": agent.replay_buffer.__len__(), 103 | "action": wandb.Histogram(action), 104 | "done": done, 105 | "action_mean": wandb.Histogram(np.mean(agent_actions, axis=0)), 106 | } 107 | log_dict.update(tensordict2dict(loss_info)) 108 | wandb.log(log_dict) 109 | if env_name in VIDEO_LOGGING_ENVS: 110 | video_name = "episode_{}.mp4".format(e) 111 | create_video_from_images(image_caputres, video_name, fps=5) 112 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 113 | 114 | except KeyboardInterrupt: 115 | print("Training interrupted by user.") 116 | 117 | logout(agent) 118 | env.close() 119 | 120 | 121 | if __name__ == "__main__": 122 | run() 123 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="bricksrl", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | install_requires=[ 8 | "pybricksdev", 9 | "tensordict==0.5.0", 10 | 
"torchrl==0.5.0", 11 | "hydra-core==1.3.2", 12 | "wandb==0.16.1", 13 | "opencv-python==4.9.0.80", 14 | "moviepy==1.0.3", 15 | "tqdm==4.66.1", 16 | "numpy==1.24.1", 17 | "pynput", 18 | ], 19 | extras_require={ 20 | "dev": [ 21 | "pytest==8.0.2", 22 | "ufmt", 23 | "pre-commit", 24 | ], 25 | }, 26 | author="Sebastian Dittert", 27 | description="BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO", 28 | url="https://github.com/BricksRL/bricksrl", 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires=">=3.8", 35 | ) 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_agents.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from bricksrl.environments.dummy.mixed_obs_dummy import MixedObsDummyEnv 4 | from bricksrl.environments.dummy.vec_obs_dummy import VecGoalObsDummyEnv, VecObsDummyEnv 5 | from experiments.helper.agents import get_agent 6 | from hydra import compose, initialize 7 | from torchrl.envs import Compose, ToTensorImage, TransformedEnv 8 | from torchrl.envs.utils import step_mdp 9 | 10 | 11 | def collection_round(env, agent, max_steps=1000): 12 | td = env.reset() 13 | for _ in range(max_steps): 14 | td = agent.get_action(td) 15 | td = env.step(td) 16 | agent.add_experience(td) 17 | td = step_mdp(td) 18 | 19 | 20 | def get_env(env, img_shape=(64, 64, 3)): 21 | if env == "mixed": 22 | env = MixedObsDummyEnv(img_shape=img_shape) 23 | env = TransformedEnv( 24 | env, Compose(ToTensorImage(in_keys=["pixels"], from_int=True)) 25 | ) 26 | elif env == "vec": 27 | env = VecObsDummyEnv() 28 | elif env == "vec_goal": 29 | env = VecGoalObsDummyEnv() 30 | else: 31 | raise ValueError("Invalid environment") 32 | return env 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "env", 37 | ["mixed", "vec", "vec_goal"], 38 | ) 39 | @pytest.mark.parametrize( 40 | "device", 41 | ["cpu", "cuda"], 42 | ) 43 | def test_random_agent(env, device): 44 | with initialize(config_path="../conf"): 45 | cfg = compose(config_name="config") 46 | 47 | if torch.cuda.is_available() and device == "cuda": 48 | device = "cuda" 49 | else: 50 | device = "cpu" 51 | with initialize(config_path="../conf"): 52 | cfg = compose( 53 | config_name="config", overrides=["device=" + device, "agent=random"] 54 | ) 55 | # Test data collection 56 | env = get_env(env) 57 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 58 | collection_round(env, agent, max_steps=10) 59 | 60 | 61 | @pytest.mark.parametrize( 62 | "env", 63 | ["mixed", "vec", "vec_goal"], 64 | ) 65 | @pytest.mark.parametrize( 66 | "device", 67 | ["cpu", "cuda"], 68 | ) 69 | def test_sac_agent(env, device): 70 | if torch.cuda.is_available() and device == "cuda": 71 | device = "cuda" 72 | else: 73 | device = "cpu" 74 | with initialize(config_path="../conf"): 75 | cfg = compose(config_name="config", overrides=["agent=sac", "device=" + device]) 76 | 77 | # Test data collection 78 | env = get_env(env) 79 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 80 | 
collection_round(env, agent, max_steps=10) 81 | # Test training 82 | agent.train(batch_size=1, num_updates=1) 83 | 84 | # Test evaluation 85 | td = env.reset() 86 | td1 = agent.get_action(td) 87 | td2 = agent.get_action(td) 88 | 89 | assert not torch.allclose(td1["action"], td2["action"]) 90 | 91 | agent.eval() 92 | td = env.reset() 93 | eval_td1 = agent.get_eval_action(td) 94 | eval_td2 = agent.get_eval_action(td) 95 | 96 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "env", 101 | ["mixed", "vec", "vec_goal"], 102 | ) 103 | @pytest.mark.parametrize( 104 | "device", 105 | ["cpu", "cuda"], 106 | ) 107 | def test_td3_agent(env, device): 108 | if torch.cuda.is_available() and device == "cuda": 109 | device = "cuda" 110 | else: 111 | device = "cpu" 112 | with initialize(config_path="../conf"): 113 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device]) 114 | 115 | # Test data collection 116 | env = get_env(env) 117 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 118 | collection_round(env, agent, max_steps=10) 119 | 120 | # Test training 121 | agent.train(batch_size=1, num_updates=1) 122 | 123 | # Test evaluation 124 | td = env.reset() 125 | td1 = agent.get_action(td) 126 | td2 = agent.get_action(td) 127 | 128 | assert not torch.allclose(td1["action"], td2["action"]) 129 | 130 | agent.eval() 131 | td = env.reset() 132 | eval_td1 = agent.get_eval_action(td) 133 | eval_td2 = agent.get_eval_action(td) 134 | 135 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 136 | 137 | 138 | @pytest.mark.parametrize( 139 | "env", 140 | ["mixed", "vec", "vec_goal"], 141 | ) 142 | @pytest.mark.parametrize( 143 | "device", 144 | ["cpu", "cuda"], 145 | ) 146 | def test_drq_agent(env, device): 147 | if torch.cuda.is_available() and device == "cuda": 148 | device = "cuda" 149 | else: 150 | device = "cpu" 151 | with initialize(config_path="../conf"): 152 | cfg = compose( 153 | config_name="config", overrides=["agent=droq", "device=" + device] 154 | ) 155 | 156 | # Test data collection 157 | env = get_env(env) 158 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 159 | collection_round(env, agent, max_steps=10) 160 | # Test training 161 | agent.train(batch_size=1, num_updates=1) 162 | 163 | # Test evaluation 164 | td = env.reset() 165 | td1 = agent.get_action(td) 166 | td2 = agent.get_action(td) 167 | 168 | assert not torch.allclose(td1["action"], td2["action"]) 169 | 170 | agent.eval() 171 | td = env.reset() 172 | eval_td1 = agent.get_eval_action(td) 173 | eval_td2 = agent.get_eval_action(td) 174 | 175 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 176 | 177 | 178 | @pytest.mark.parametrize( 179 | "env", 180 | ["mixed", "vec", "vec_goal"], 181 | ) 182 | @pytest.mark.parametrize( 183 | "device", 184 | ["cpu", "cuda"], 185 | ) 186 | def test_iql_agent(env, device): 187 | if torch.cuda.is_available() and device == "cuda": 188 | device = "cuda" 189 | else: 190 | device = "cpu" 191 | with initialize(config_path="../conf"): 192 | cfg = compose(config_name="config", overrides=["agent=iql", "device=" + device]) 193 | 194 | # Test data collection 195 | env = get_env(env) 196 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 197 | collection_round(env, agent, max_steps=10) 198 | # Test training 199 | agent.train(batch_size=1, num_updates=1) 200 | 201 | # Test evaluation 202 | td = env.reset() 203 | td1 = agent.get_action(td) 204 | td2 = 
agent.get_action(td) 205 | 206 | assert not torch.allclose(td1["action"], td2["action"]) 207 | 208 | agent.eval() 209 | td = env.reset() 210 | eval_td1 = agent.get_eval_action(td) 211 | eval_td2 = agent.get_eval_action(td) 212 | 213 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 214 | 215 | 216 | @pytest.mark.parametrize( 217 | "env", 218 | ["mixed", "vec", "vec_goal"], 219 | ) 220 | @pytest.mark.parametrize( 221 | "device", 222 | ["cpu", "cuda"], 223 | ) 224 | def test_cql_agent(env, device): 225 | if torch.cuda.is_available() and device == "cuda": 226 | device = "cuda" 227 | else: 228 | device = "cpu" 229 | with initialize(config_path="../conf"): 230 | cfg = compose(config_name="config", overrides=["agent=cql", "device=" + device]) 231 | 232 | # Test data collection 233 | env = get_env(env) 234 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 235 | collection_round(env, agent, max_steps=10) 236 | # Test training 237 | agent.train(batch_size=1, num_updates=1) 238 | 239 | # Test evaluation 240 | td = env.reset() 241 | td1 = agent.get_action(td) 242 | td2 = agent.get_action(td) 243 | 244 | assert not torch.allclose(td1["action"], td2["action"]) 245 | 246 | agent.eval() 247 | td = env.reset() 248 | eval_td1 = agent.get_eval_action(td) 249 | eval_td2 = agent.get_eval_action(td) 250 | 251 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 252 | 253 | 254 | @pytest.mark.parametrize( 255 | "env", 256 | ["mixed", "vec", "vec_goal"], 257 | ) 258 | @pytest.mark.parametrize( 259 | "device", 260 | ["cpu", "cuda"], 261 | ) 262 | def test_bc_agent(env, device): 263 | if torch.cuda.is_available() and device == "cuda": 264 | device = "cuda" 265 | else: 266 | device = "cpu" 267 | with initialize(config_path="../conf"): 268 | cfg = compose(config_name="config", overrides=["agent=bc", "device=" + device]) 269 | 270 | # Test data collection 271 | env = get_env(env) 272 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 273 | collection_round(env, agent, max_steps=10) 274 | # Test training 275 | agent.train(batch_size=1, num_updates=1) 276 | 277 | # Test evaluation 278 | agent.eval() 279 | td = env.reset() 280 | eval_td1 = agent.get_eval_action(td) 281 | eval_td2 = agent.get_eval_action(td) 282 | 283 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 284 | 285 | 286 | @pytest.mark.parametrize( 287 | "env", 288 | ["mixed"], 289 | ) 290 | @pytest.mark.parametrize( 291 | "img_shape", 292 | [(64, 64, 3), (128, 128, 3)], 293 | ) 294 | @pytest.mark.parametrize( 295 | "device", 296 | ["cpu", "cuda"], 297 | ) 298 | def test_mixd_obs_size_agent(env, device, img_shape): 299 | if torch.cuda.is_available() and device == "cuda": 300 | device = "cuda" 301 | else: 302 | device = "cpu" 303 | with initialize(config_path="../conf"): 304 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device]) 305 | 306 | # Test data collection 307 | env = get_env(env, img_shape) 308 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 309 | collection_round(env, agent, max_steps=10) 310 | 311 | # Test training 312 | agent.train(batch_size=1, num_updates=1) 313 | 314 | # Test evaluation 315 | td = env.reset() 316 | td1 = agent.get_action(td) 317 | td2 = agent.get_action(td) 318 | 319 | assert not torch.allclose(td1["action"], td2["action"]) 320 | 321 | agent.eval() 322 | td = env.reset() 323 | eval_td1 = agent.get_eval_action(td) 324 | eval_td2 = agent.get_eval_action(td) 325 | 326 | assert torch.allclose(eval_td1["action"], 
eval_td2["action"]) 327 | -------------------------------------------------------------------------------- /tests/test_env_sim.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from bricksrl.environments import make_env 4 | from experiments.helper.agents import get_agent 5 | from hydra import compose, initialize 6 | 7 | from tests.test_agents import collection_round 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "agent", 12 | ["sac", "td3", "random"], 13 | ) 14 | @pytest.mark.parametrize("env", ["walker_sim-v0", "roboarm_sim-v0"]) 15 | @pytest.mark.parametrize( 16 | "device", 17 | ["cpu", "cuda"], 18 | ) 19 | def test_agent_in_sim_env(agent, env, device): 20 | if torch.cuda.is_available() and device == "cuda": 21 | device = "cuda" 22 | else: 23 | device = "cpu" 24 | with initialize(config_path="../conf"): 25 | cfg = compose( 26 | config_name="config", 27 | overrides=["agent=" + agent, "device=" + device, "env=" + env], 28 | ) 29 | 30 | # Create environment 31 | env, action_space, state_space = make_env(cfg) 32 | # Create agent 33 | agent, _ = get_agent(action_space, state_space, cfg) 34 | # Test data collection 35 | collection_round(env, agent, max_steps=10) 36 | --------------------------------------------------------------------------------
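Read together, the experiment scripts above and tests/test_env_sim.py spell out how the pieces compose: Hydra builds the config, make_env turns it into a TorchRL environment plus action and state specs, get_agent constructs the agent from those specs, and data collection follows the get_action, env.step, add_experience, step_mdp pattern before agent.train draws batches from the replay buffer. The sketch below strings those calls together against the simulated walker env so it runs without hardware. It is only a minimal illustration: it assumes it is saved as a standalone script in the repository root (for example after pip install -e ., so config_path="conf" resolves and the bricksrl and experiments packages are importable), and the step count and single update are arbitrary choices rather than values taken from the repo.

from hydra import compose, initialize
from torchrl.envs.utils import step_mdp

from bricksrl.environments import make_env
from experiments.helper.agents import get_agent


def main():
    # Compose the same Hydra config the experiment scripts load, but pick the
    # simulated walker env so no robot or Bluetooth connection is required.
    with initialize(version_base=None, config_path="conf"):
        cfg = compose(
            config_name="config",
            overrides=["agent=sac", "device=cpu", "env=walker_sim-v0"],
        )

    env, action_space, state_space = make_env(cfg)
    agent, _ = get_agent(action_space, state_space, cfg)

    # Collect a handful of transitions, mirroring collection_round() in
    # tests/test_agents.py: act, step, store the transition, advance the MDP.
    td = env.reset()
    for _ in range(10):
        td = agent.get_action(td)
        td = env.step(td)
        agent.add_experience(td)
        td = step_mdp(td)

    # One small update from the replay buffer, then clean up.
    loss_info = agent.train(batch_size=cfg.agent.batch_size, num_updates=1)
    print(loss_info)
    env.close()


if __name__ == "__main__":
    main()

Swapping the overrides, e.g. agent=td3 or env=roboarm_sim-v0, follows the same pattern the parametrized tests use.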