├── .github └── workflows │ └── ci.yaml ├── .gitignore ├── LICENSE ├── README.md ├── bricksrl ├── Pybricks │ └── PybricksHubClass.py ├── __init__.py └── environments │ ├── __init__.py │ ├── base │ └── base_env.py │ ├── dummy │ ├── mixed_obs_dummy.py │ └── vec_obs_dummy.py │ ├── roboarm_mixed_v0 │ ├── RoboArmMixedEnv.py │ └── client.py │ ├── roboarm_v0 │ ├── RoboArmEnv.py │ ├── RoboArmSim.py │ └── client.py │ ├── runaway_v0 │ ├── RunAwayEnv.py │ └── client.py │ ├── spinning_v0 │ ├── SpinningEnv.py │ └── client.py │ └── walker_v0 │ ├── WalkerEnv.py │ ├── WalkerEnvSim.py │ └── client.py ├── conf ├── README.md ├── agent │ ├── bc.yaml │ ├── cql.yaml │ ├── droq.yaml │ ├── iql.yaml │ ├── random.yaml │ ├── sac.yaml │ └── td3.yaml ├── config.yaml └── env │ ├── roboarm-v0.yaml │ ├── roboarm_mixed-v0.yaml │ ├── roboarm_sim-v0.yaml │ ├── runaway-v0.yaml │ ├── spinning-v0.yaml │ ├── walker-v0.yaml │ └── walker_sim-v0.yaml ├── examples ├── README.md ├── custom_env.py ├── example_notebook.ipynb └── torchrl_sac │ ├── config.yaml │ ├── train.py │ └── utils.py ├── experiments ├── 2wheeler │ ├── eval.py │ ├── pretrain.py │ └── train.py ├── helper │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ ├── base.py │ │ ├── behavior_cloning.py │ │ ├── cql.py │ │ ├── iql.py │ │ ├── random.py │ │ ├── sac.py │ │ └── td3.py │ ├── networks │ │ ├── __init__.py │ │ └── networks.py │ └── utils.py ├── roboarm │ ├── eval.py │ ├── pretrain.py │ └── train.py └── walker │ ├── eval.py │ ├── pretrain.py │ └── train.py ├── pyproject.toml ├── setup.py └── tests ├── __init__.py ├── test_agents.py └── test_env_sim.py /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | # Runs on both push and pull_request events to the main branch. 5 | push: 6 | branches: 7 | - '**' # or master, depending on your default branch 8 | pull_request: 9 | branches: 10 | - main # or master 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | python-version: [3.8.18, 3.9] 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v3 24 | 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install dependencies 31 | run: | 32 | python -m pip install --upgrade pip 33 | pip install -e .[dev] 34 | 35 | - name: List files # checking if the files are in the right place 36 | run: | 37 | ls 38 | 39 | - name: Run tests with pytest 40 | run: | 41 | pytest 42 | 43 | - name: Check code formatting with ufmt 44 | run: ufmt check . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | .DS_Store 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # wandb 133 | wandb/ 134 | # hydra 135 | outputs/ 136 | # .pth files 137 | *.pth 138 | 139 | # vscode 140 | .vscode/ 141 | 142 | # dev tools 143 | pytest.ini 144 | .pre-commit-config.yaml 145 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 www.compscience.org 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BricksRL 2 | 3 | ![CI](https://github.com/BricksRL/bricksrl/actions/workflows/ci.yaml/badge.svg?branch=main) 4 | ![Python](https://img.shields.io/badge/python-3.8%20%7C%203.9-blue) 5 | [![arXiv](https://img.shields.io/badge/arXiv-2406.17490-b31b1b.svg)](https://arxiv.org/abs/2406.17490) 6 | [![Website](https://img.shields.io/badge/Website-Visit%20Now-blue)](https://bricksrl.github.io/ProjectPage/) 7 | [![Discord](https://img.shields.io/badge/Join_our_Discord-7289da?logo=discord&logoColor=ffffff&labelColor=7289da)](https://discord.gg/qdTsFaVfZm) 8 | 9 | 10 | BricksRL allows the training of custom LEGO robots using deep reinforcement learning. By integrating [Pybricks](https://pybricks.com/) and [TorchRL](https://pytorch.org/rl/stable/index.html), it facilitates efficient real-world training via Bluetooth communication between LEGO hubs and a local computing device. Check out our [paper](https://arxiv.org/abs/2406.17490)! 11 | 12 | For additional information and building instructions for the robots, view the project page [BricksRL](https://bricksrl.github.io/ProjectPage/). 13 | 14 | 15 | 16 | 17 | ## Prerequisites 18 |
19 | Click me 20 | 21 | ### Enable Web Bluetooth in Chrome 22 | 23 | 1. Go to "chrome://flags/" 24 | 2. Enable "Experimental Web Platform features" 25 | 3. Restart Chrome 26 | 4. Use beta.pybricks.com to edit and upload the client scripts for each environment 27 | 28 | ### Environment Setup 29 | 30 | 1. **Create a Conda environment:** 31 | ```bash 32 | conda create --name bricksrl python=3.8 33 | ``` 34 | 2. **Activate the environment:** 35 | ```bash 36 | conda activate bricksrl 37 | ``` 38 | 3. **Install PyTorch:** 39 | ```bash 40 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 41 | 42 | ``` 43 | 4. **Install bricksrl and additional packages:** 44 | For regular users, install the package and all required dependencies by running: 45 | ```bash 46 | pip install -e . 47 | ``` 48 | 49 | This will install the bricksrl package along with the dependencies listed in setup.py. 50 | 51 | 5. **(Optional) Install development tools:** 52 | 53 | If you are a developer and need development tools (e.g., pytest, ufmt, pre-commit), use the following command to install them as extras: 54 | 55 | ```bash 56 | pip install -e .[dev] 57 | ``` 58 | This will install the development dependencies defined in setup.py along with the package. 59 | 60 | 61 |
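To sanity-check the installation without a LEGO hub, you can roll out one of the bundled dummy environments. The snippet below is a minimal, illustrative sketch: the import path is inferred from the repository layout (`bricksrl/environments/dummy/vec_obs_dummy.py`), and the rollout length is arbitrary.

```python
# Smoke test: roll out a dummy environment that needs no Bluetooth connection.
from bricksrl.environments.dummy.vec_obs_dummy import VecObsDummyEnv

env = VecObsDummyEnv(max_episode_steps=10)
rollout = env.rollout(max_steps=5)  # random actions sampled from the action spec
print(rollout["observation"].shape)  # batched observations collected over 5 steps
```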
62 | 63 | ## Usage 64 | ### Client 65 | Update your client script on the [Pybricks Hub](https://beta.pybricks.com/) whenever you want to run a new environment with your robot. 66 | 67 | 68 | ### Config 69 | Before running experiments, please review and modify the configuration settings according to your needs. Each environment and agent setup has its own configuration file under the conf/ directory. For more information, check out the [config README](conf/README.md). 70 | 71 | 72 | ### Robots 73 | 74 | The robots used in our experiments are shown below; building instructions can be found [here](https://bricksrl.github.io/ProjectPage/). 75 | 76 | | ![2wheeler](https://drive.google.com/uc?export=view&id=1IxqQ1VZchPZMNXyZnTULuNy53-LMYT6W) | ![Walker](https://drive.google.com/uc?export=view&id=1ImR0f1UNjC4sUHXWWg_D06eukrh-doW9) | ![RoboArm](https://drive.google.com/uc?export=view&id=1IYCJrl5rZBvOb6xKwbSUZqYrVwKjCpJH) | 77 | |:--:|:--:|:--:| 78 | | **2Wheeler** | **Walker** | **RoboArm** | 79 | 80 | 81 | ## Run Experiments 82 | ### Train an Agent 83 | 84 | ```bash 85 | python experiments/walker/train.py 86 | ``` 87 | 88 | ### Evaluate an Agent 89 | ```bash 90 | python experiments/walker/eval.py 91 | ``` 92 | 93 | ## Results 94 |
95 | Click me 96 | 97 | Evaluation videos of the trained agents can be found [here](https://bricksrl.github.io/ProjectPage/). 98 | 99 | ### 2Wheeler Results: 100 | 101 | 2Wheeler Results 102 | 103 | ### Walker Results: 104 | 105 | Walker Results 106 | 107 | ### RoboArm Results: 108 | 109 | RoboArm Results 110 | 111 | RoboArm Mixed Results 112 | 113 |
114 | 115 | 116 | ### Offline RL 117 |
118 | Click me 119 | 120 | Using precollected [datasets](https://huggingface.co/datasets/compsciencelab/BricksRL-Datasets), we can pretrain agents with offline RL to perform a task without any real-world interaction. Such pretrained policies can be evaluated directly or fine-tuned later on the real robot. 121 | 122 | #### Datasets 123 | The datasets can be downloaded from Hugging Face and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0), and RoboArm (RoboArm-v0) robots. 124 | 125 | ```bash 126 | git lfs install 127 | git clone git@hf.co:datasets/compsciencelab/BricksRL-Datasets 128 | ``` 129 | 130 | The datasets consist of TensorDicts containing expert and random transitions, which can be loaded directly into the replay buffer (see the sketch at the end of this section). When initiating (pre-)training, simply provide the path to the desired TensorDict when prompted to load the replay buffer. 131 | 132 | 133 | #### Pretrain an Agent 134 | 135 | Running an offline-training experiment is similar to online training, except that you run the **pretrain.py** script: 136 | 137 | ```bash 138 | python experiments/walker/pretrain.py 139 | ``` 140 | 141 | Trained policies can then be evaluated as before with: 142 | 143 | ```bash 144 | python experiments/walker/eval.py 145 | ``` 146 | 147 | Or run training to fine-tune the policy on the real robot: 148 | 149 | ```bash 150 | python experiments/walker/train.py 151 | ``` 152 | 153 | 154 |
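For reference, here is a minimal sketch of how a downloaded TensorDict could be loaded into a TorchRL replay buffer for offline (pre-)training. The file path and name are placeholders for whatever you cloned from the dataset repository; the provided experiment scripts already handle this loading for you when prompted.

```python
import torch
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

# Placeholder path: point this at a TensorDict file from the cloned dataset repo.
data = torch.load("BricksRL-Datasets/walker-v0/expert_transitions.pt")

buffer = TensorDictReplayBuffer(storage=LazyMemmapStorage(max_size=len(data)))
buffer.extend(data)         # add all precollected transitions at once
batch = buffer.sample(256)  # sample a batch for the offline agent's update step
```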
155 | 156 | ## Examples 157 | 158 | ### TorchRL and Custom Environment Examples 159 | 160 | Examples to use BricksRL environments with typical training scripts from [TorchRL's sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/). 161 | 162 | 163 | We also provide a template to create your own [custom BricksRL enviornment](examples/custom_env.py) which subsequently can be used directly in the TorchRL examples. 164 | 165 | For more information see the examples [readme](examples/README.md). 166 | 167 | 168 | ### High-Level Examples 169 | In the [example notebook](examples/example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** enviornment. 170 | The examples are based on the experiments for our paper. Stand alone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/torchrl_sac). 171 | 172 | ## Citation 173 | If you use BricksRL in your work, please refer to this BibTeX entry to cite it: 174 | 175 | ``` 176 | @article{dittert2024bricksrl, 177 | title={BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO}, 178 | author={Sebastian Dittert and Vincent Moens and Gianni De Fabritiis}, 179 | journal={arXiv preprint arXiv:2406.17490}, 180 | year={2024} 181 | } 182 | ``` -------------------------------------------------------------------------------- /bricksrl/Pybricks/PybricksHubClass.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import struct 3 | 4 | from bleak import BleakClient, BleakScanner 5 | 6 | 7 | class PybricksHub: 8 | """Class for connecting to a Pybricks Hub.""" 9 | 10 | PYBRICKS_COMMAND_EVENT_CHAR_UUID = "c5f50002-8280-46da-89f4-6d8051e4aeef" 11 | HUB_NAME = "Pybricks Hub" 12 | 13 | def __init__(self, out_format_str: str, state_dim: int): 14 | self.device = None 15 | self.client = None 16 | self.rx_queue = asyncio.Queue(maxsize=8) # LifoQueue 17 | self.loop = asyncio.get_event_loop() 18 | 19 | self.exception_out_data = struct.pack(out_format_str, *([0.0] * state_dim)) 20 | self.disconnected = False 21 | self.payload_buffer = None 22 | 23 | def connect(self) -> None: 24 | """Connect to the hub.""" 25 | print("Connecting to the hub...") 26 | self.loop.run_until_complete(self._connect()) 27 | 28 | async def _connect(self) -> None: 29 | """Connect to the hub.""" 30 | try: 31 | # Find the device and initialize client. 32 | self.device = await BleakScanner.find_device_by_name(self.HUB_NAME) 33 | self.client = BleakClient( 34 | self.device, disconnected_callback=self._handle_disconnect 35 | ) 36 | 37 | # Connect and get services 38 | print("Switch on the hub", flush=True) 39 | await self.client.connect() 40 | await self.client.start_notify( 41 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID, self._handle_rx 42 | ) 43 | 44 | # Tell user to start program on the hub. 45 | print("Start the program on the hub now with the button.", flush=True) 46 | await asyncio.sleep(5) 47 | 48 | except Exception as e: 49 | # Handle exceptions. 50 | print(e) 51 | await self.disconnect() 52 | 53 | def send(self, data: bytes) -> None: 54 | """Send data to the hub as bytes.""" 55 | self.loop.run_until_complete(self._send(data)) 56 | 57 | async def _send(self, data: bytes) -> None: 58 | try: 59 | # Send some data to the hub. 
60 | await self.client.write_gatt_char( 61 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID, 62 | b"\x06" + data, # Prepend "write stdin" command 63 | response=False, 64 | ) 65 | except Exception as e: 66 | # Handle exceptions. 67 | print(e) 68 | await self.disconnect() 69 | 70 | def disconnect(self) -> None: 71 | """ 72 | Disconnect from the hub. 73 | This method disconnects the hub from the client. 74 | """ 75 | if self.client and not self.disconnected: 76 | asyncio.create_task(self._disconnect()) 77 | 78 | async def _disconnect(self) -> None: 79 | try: 80 | # Disconnect when we are done. 81 | if self.client: 82 | await self.client.disconnect() 83 | except Exception as e: 84 | # Handle exceptions. 85 | print(e) 86 | finally: 87 | self.disconnected = True 88 | 89 | self.client = None 90 | self.device = None 91 | self.rx_char = None 92 | 93 | def _handle_disconnect(self, _) -> None: 94 | print("Hub was disconnected.") 95 | self.disconnect() 96 | 97 | async def _handle_rx(self, _, data: bytes) -> None: 98 | # add received data to the queue 99 | if data[0] == 0x01: # "write stdout" event (0x01) 100 | payload = data[1:] 101 | # print("Received:", payload) 102 | if ( 103 | len(payload) != len(self.exception_out_data) 104 | and self.payload_buffer is None 105 | ): 106 | self.payload_buffer = payload 107 | elif ( 108 | len(payload) != len(self.exception_out_data) 109 | and self.payload_buffer is not None 110 | ): 111 | self.payload_buffer += payload 112 | if self.payload_buffer.__len__() == len(self.exception_out_data): 113 | await self.rx_queue.put(self.payload_buffer) 114 | self.payload_buffer = None 115 | else: 116 | await self.rx_queue.put(payload) 117 | 118 | async def _read_data(self) -> bytes: 119 | try: 120 | # get data from the queue 121 | return await self.rx_queue.get() 122 | except asyncio.QueueEmpty: 123 | print("Queue is empty, returning zeros") 124 | return self.exception_out_data 125 | 126 | def read(self) -> bytes: 127 | """Read data from the hub and return it as a bytearray.""" 128 | return self.loop.run_until_complete(self._read_data()) 129 | 130 | def close(self) -> None: 131 | if not self.loop.is_closed(): 132 | self.loop.run_until_complete(self._disconnect()) 133 | -------------------------------------------------------------------------------- /bricksrl/__init__.py: -------------------------------------------------------------------------------- 1 | from bricksrl.environments.base.base_env import BaseEnv 2 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub 3 | -------------------------------------------------------------------------------- /bricksrl/environments/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchrl.envs import ( 3 | CatFrames, 4 | Compose, 5 | ObservationNorm, 6 | ToTensorImage, 7 | TransformedEnv, 8 | ) 9 | 10 | from bricksrl.environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0 11 | from bricksrl.environments.roboarm_v0.RoboArmEnv import RoboArmEnv_v0 12 | from bricksrl.environments.roboarm_v0.RoboArmSim import RoboArmSimEnv_v0 13 | from bricksrl.environments.runaway_v0.RunAwayEnv import RunAwayEnv_v0 14 | from bricksrl.environments.spinning_v0.SpinningEnv import SpinningEnv_v0 15 | from bricksrl.environments.walker_v0.WalkerEnv import WalkerEnv_v0 16 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0 17 | 18 | VIDEO_LOGGING_ENVS = ["roboarm_mixed-v0", "walker_mixed-v0"] 19 | ALL_2WHEELER_ENVS = ["spinning-v0", "runaway-v0"] 20 | 
ALL_WALKER_ENVS = [ 21 | "walker-v0", 22 | "walker_sim-v0", 23 | ] 24 | ALL_ROBOARM_ENVS = [ 25 | "roboarm-v0", 26 | "roboarm_mixed-v0", 27 | "roboarm_sim-v0", 28 | ] 29 | ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS 30 | 31 | 32 | # TODO: maybe outsorce this to experiments/helper and not in bricksrl 33 | def make_env(config, pretrain=False): 34 | """ 35 | Creates a new environment based on the provided configuration. 36 | 37 | Args: 38 | config: A configuration object containing the environment name and maximum episode steps. 39 | pretrain: A boolean indicating whether the environment is for pretraining. 40 | 41 | Returns: 42 | A tuple containing the new environment, its action spec, and its state spec. 43 | """ 44 | env = make(name=config.env.name, env_conf=config.env, pretrain=pretrain) 45 | observation_keys = [key for key in env.observation_spec.keys()] 46 | 47 | transforms = [] 48 | if config.env.frame_stack > 1: 49 | transforms.append( 50 | CatFrames( 51 | N=config.env.frame_stack, 52 | in_keys=observation_keys, 53 | out_key=observation_keys, 54 | ) 55 | ) 56 | if config.env.action_filter < 1: 57 | raise NotImplementedError("ActionFilterWrapper not implemented yet") 58 | # TODO: add this to torchrl 59 | # env = ActionFilterWrapper( 60 | # env, current_action_influence=config.env.action_filter 61 | # ) 62 | normalize_keys = [key for key in observation_keys if key != "pixels"] 63 | obs_ranges = np.array(list(env.observation_ranges.values())) 64 | obs_mean = obs_ranges.mean(axis=-1) # mean of min and max 65 | obs_std = obs_ranges.std(axis=-1) # std of min and max 66 | transforms.append( 67 | ObservationNorm( 68 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True 69 | ) 70 | ) 71 | if "pixels" in observation_keys: 72 | transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True)) 73 | 74 | env = TransformedEnv(env, Compose(*transforms)) 75 | 76 | action_spec = env.action_spec 77 | state_spec = env.observation_spec 78 | 79 | return env, action_spec, state_spec 80 | 81 | 82 | def make(name="RunAway", env_conf=None, pretrain=False): 83 | if name == "runaway-v0": 84 | return RunAwayEnv_v0( 85 | max_episode_steps=env_conf.max_episode_steps, 86 | min_distance=env_conf.min_distance, 87 | verbose=env_conf.verbose, 88 | pretrain=pretrain, 89 | ) 90 | elif name == "spinning-v0": 91 | return SpinningEnv_v0( 92 | max_episode_steps=env_conf.max_episode_steps, 93 | sleep_time=env_conf.sleep_time, 94 | verbose=env_conf.verbose, 95 | pretrain=pretrain, 96 | ) 97 | elif name == "walker-v0": 98 | return WalkerEnv_v0( 99 | max_episode_steps=env_conf.max_episode_steps, 100 | verbose=env_conf.verbose, 101 | sleep_time=env_conf.sleep_time, 102 | pretrain=pretrain, 103 | ) 104 | elif name == "walker_sim-v0": 105 | return WalkerEnvSim_v0( 106 | max_episode_steps=env_conf.max_episode_steps, 107 | noise=env_conf.noise, 108 | low_action_angle=env_conf.low_action_angle, 109 | high_action_angle=env_conf.high_action_angle, 110 | verbose=env_conf.verbose, 111 | ) 112 | elif name == "roboarm-v0": 113 | return RoboArmEnv_v0( 114 | max_episode_steps=env_conf.max_episode_steps, 115 | verbose=env_conf.verbose, 116 | sleep_time=env_conf.sleep_time, 117 | reward_signal=env_conf.reward_signal, 118 | pretrain=pretrain, 119 | ) 120 | elif name == "roboarm_sim-v0": 121 | return RoboArmSimEnv_v0( 122 | max_episode_steps=env_conf.max_episode_steps, 123 | verbose=env_conf.verbose, 124 | noise=env_conf.noise, 125 | reward_signal=env_conf.reward_signal, 126 | ) 127 | elif name == 
"roboarm_mixed-v0": 128 | return RoboArmMixedEnv_v0( 129 | max_episode_steps=env_conf.max_episode_steps, 130 | sleep_time=env_conf.sleep_time, 131 | verbose=env_conf.verbose, 132 | reward_signal=env_conf.reward_signal, 133 | camera_id=env_conf.camera_id, 134 | goal_radius=env_conf.goal_radius, 135 | pretrain=pretrain, 136 | ) 137 | else: 138 | print("Environment not found") 139 | -------------------------------------------------------------------------------- /bricksrl/environments/base/base_env.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import sys 3 | 4 | import numpy as np 5 | 6 | import torch 7 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class BaseEnv(EnvBase): 13 | """ 14 | The base class for reinforcement learning environments used with the Lego robots. 15 | 16 | Args: 17 | action_dim (int): The dimensionality of the action space. 18 | state_dim (int): The dimensionality of the state space. 19 | use_hub (bool): Whether to use the Pybricks hub for communication, if False, only 20 | the observation spec and action specs are created and can be used. 21 | Can be helpful for testing and debugging as you dont connect to the hub. 22 | verbose (bool): Whether to print verbose output. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | action_dim: int, 28 | state_dim: int, 29 | use_hub: bool = True, 30 | verbose: bool = False, 31 | ): 32 | self.verbose = verbose 33 | self.action_dim = action_dim 34 | self.state_dim = state_dim 35 | 36 | self.action_format_str = "!" + "f" * self.action_dim 37 | self.state_format_str = "!" + "f" * self.state_dim 38 | 39 | self.expected_bytesize = struct.calcsize(self.state_format_str) 40 | 41 | # buffer state in case of missing data 42 | self.buffered_state = np.zeros(self.state_dim, dtype=np.float32) 43 | 44 | if use_hub: 45 | self.hub = PybricksHub( 46 | state_dim=state_dim, out_format_str=self.state_format_str 47 | ) 48 | self.hub.connect() 49 | print("Connected to hub.") 50 | else: 51 | self.hub = None 52 | super().__init__(batch_size=torch.Size([1])) 53 | 54 | def send_to_hub(self, action: np.array) -> None: 55 | """ 56 | Sends the given action to the hub as bytes. 57 | 58 | Args: 59 | action (np.array): The action to send to the hub as a numpy array. 60 | 61 | Raises: 62 | AssertionError: If the shape of the action does not match the action dimension. 63 | """ 64 | assert ( 65 | action.shape[0] == self.action_dim 66 | ), "Action shape does not match action dimension." 67 | byte_action = struct.pack(self.action_format_str, *action) 68 | if self.verbose: 69 | print("Sending data size: ", len(byte_action)) 70 | print("Sending data: ", byte_action) 71 | self.hub.send(byte_action) 72 | 73 | def read_from_hub(self) -> np.array: 74 | """ 75 | Reads the current state of the environment from the hub and returns it as a numpy array. 76 | 77 | Returns: 78 | np.array: The current state of the environment as a numpy array. 
79 | """ 80 | byte_state = self.hub.read() 81 | if self.verbose: 82 | print("Reading data size: ", sys.getsizeof(byte_state)) 83 | print("Reading data: ", byte_state) 84 | print("len: ", len(byte_state)) 85 | 86 | if len(byte_state) != self.expected_bytesize: 87 | print( 88 | "State has size {} but should have size {}.".format( 89 | len(byte_state), struct.calcsize(self.state_format_str) 90 | ) 91 | ) 92 | print("Returning previous state.") 93 | state = self.buffered_state 94 | print("State: ", state) 95 | else: 96 | state = np.array([struct.unpack(self.state_format_str, byte_state)]) 97 | self.buffered_state = state 98 | assert ( 99 | state.shape[1] == self.state_dim 100 | ), f"State has shape {state.shape[0]} and does not match state dimension: {self.state_dim}." 101 | return state 102 | 103 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase: 104 | """ 105 | Sample a random action from the action space. 106 | 107 | Returns: 108 | TensorDictBase: A dictionary containing the sampled action. 109 | """ 110 | if tensordict is not None: 111 | tensordict.set("action", self.action_spec.rand()) 112 | return tensordict 113 | else: 114 | return TensorDict({"action": self.action_spec.rand()}, []) 115 | 116 | def close(self) -> None: 117 | if self.hub is not None: 118 | self.hub.close() 119 | 120 | def _step( 121 | self, 122 | ): 123 | raise NotImplementedError 124 | 125 | def _reset( 126 | self, 127 | ): 128 | raise NotImplementedError 129 | 130 | def _set_seed(self, seed: int): 131 | np.random.seed(seed) 132 | torch.manual_seed(seed) 133 | 134 | 135 | class BaseSimEnv(EnvBase): 136 | """ 137 | The base class for reinforcement learning environments used to simulate Lego robots. 138 | 139 | Args: 140 | action_dim (int): The dimensionality of the action space. 141 | state_dim (int): The dimensionality of the state space. 142 | verbose (bool): Whether to print verbose output. 143 | use_hub (bool): This argument is kept for compatibility but is not used in the simulation environment. 144 | """ 145 | 146 | def __init__( 147 | self, 148 | action_dim: int, 149 | state_dim: int, 150 | verbose: bool = False, 151 | use_hub: bool = False, 152 | ): 153 | self.verbose = verbose 154 | self.action_dim = action_dim 155 | self.state_dim = state_dim 156 | 157 | super().__init__(batch_size=torch.Size([1])) 158 | 159 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase: 160 | """ 161 | Sample a random action from the action space. 162 | 163 | Returns: 164 | TensorDictBase: A dictionary containing the sampled action. 165 | """ 166 | if tensordict is not None: 167 | tensordict.set("action", self.action_spec.rand()) 168 | return tensordict 169 | else: 170 | return TensorDict({"action": self.action_spec.rand()}, []) 171 | 172 | def _step( 173 | self, 174 | ): 175 | raise NotImplementedError 176 | 177 | def _reset( 178 | self, 179 | ): 180 | raise NotImplementedError 181 | 182 | def _set_seed(self, seed: int): 183 | """ 184 | Sets the seed for the environment's random number generator. 185 | 186 | Args: 187 | seed (int): The seed to set. 
188 | """ 189 | np.random.seed(seed) 190 | torch.manual_seed(seed) 191 | -------------------------------------------------------------------------------- /bricksrl/environments/dummy/mixed_obs_dummy.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class MixedObsDummyEnv(EnvBase): 13 | """ 14 | MixedObsDummyEnv is a dummy environment for testing purposes. 15 | It does not connec to Pybricks 16 | 17 | """ 18 | 19 | action_dim = 4 20 | state_dim = 7 21 | observation_key = "observation" 22 | pixel_observation_key = "pixels" 23 | 24 | def __init__(self, max_episode_steps=10, img_shape=(64, 64, 3)): 25 | self.max_episode_steps = max_episode_steps 26 | self._batch_size = torch.Size([1]) 27 | self.action_spec = BoundedTensorSpec( 28 | low=-torch.ones((1, self.action_dim)), 29 | high=torch.ones((1, self.action_dim)), 30 | shape=(1, self.action_dim), 31 | ) 32 | 33 | observation_spec = BoundedTensorSpec( 34 | low=-torch.ones((1, self.state_dim)), 35 | high=torch.ones((1, self.state_dim)), 36 | ) 37 | 38 | pixel_observation_spec = BoundedTensorSpec( 39 | low=torch.zeros((1,) + img_shape, dtype=torch.uint8), 40 | high=torch.ones((1,) + img_shape, dtype=torch.uint8) * 255, 41 | ) 42 | 43 | self.observation_spec = CompositeSpec(shape=(1,)) 44 | self.observation_spec.set(self.observation_key, observation_spec) 45 | self.observation_spec.set(self.pixel_observation_key, pixel_observation_spec) 46 | super().__init__(batch_size=self._batch_size) 47 | 48 | def _set_seed(self, seed: int): 49 | return super()._set_seed(seed) 50 | 51 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 52 | """ 53 | Reset the environment and return the initial state. 54 | 55 | Returns: 56 | TensorDictBase: The initial state of the environment. 
57 | """ 58 | # TODO solve this fake action sending before to receive first state 59 | self.episode_step_iter = 0 60 | observation = self.observation_spec[self.observation_key].rand() 61 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand() 62 | return TensorDict( 63 | { 64 | self.observation_key: observation.float(), 65 | self.pixel_observation_key: pixel_observation, 66 | }, 67 | batch_size=[1], 68 | ) 69 | 70 | def reward( 71 | self, 72 | action: np.ndarray, 73 | next_state: np.ndarray, 74 | ) -> Tuple[float, bool]: 75 | """ """ 76 | return 0.0, False 77 | 78 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 79 | """ """ 80 | action = tensordict.get("action").cpu().numpy() 81 | observation = self.observation_spec[self.observation_key].rand() 82 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand() 83 | 84 | reward, done = self.reward( 85 | action=action, 86 | next_state=observation, 87 | ) 88 | next_tensordict = TensorDict( 89 | { 90 | self.observation_key: observation.float(), 91 | self.pixel_observation_key: pixel_observation, 92 | "reward": torch.tensor([reward]).float(), 93 | "done": torch.tensor([done]).bool(), 94 | }, 95 | batch_size=[1], 96 | ) 97 | 98 | # increment episode step counter 99 | self.episode_step_iter += 1 100 | if self.episode_step_iter >= self.max_episode_steps: 101 | next_tensordict.set("done", torch.tensor([True])) 102 | return next_tensordict 103 | -------------------------------------------------------------------------------- /bricksrl/environments/dummy/vec_obs_dummy.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | from torchrl.envs import EnvBase 10 | 11 | 12 | class VecObsDummyEnv(EnvBase): 13 | """ 14 | VecObsDummyEnv is a dummy environment for testing purposes. 15 | It does not connec to Pybricks 16 | 17 | """ 18 | 19 | action_dim = 4 20 | state_dim = 7 21 | observation_key = "observation" 22 | 23 | def __init__(self, max_episode_steps=10): 24 | self.max_episode_steps = max_episode_steps 25 | self._batch_size = torch.Size([1]) 26 | self.action_spec = BoundedTensorSpec( 27 | low=-torch.ones((1, self.action_dim)), 28 | high=torch.ones((1, self.action_dim)), 29 | shape=(1, self.action_dim), 30 | ) 31 | 32 | observation_spec = BoundedTensorSpec( 33 | low=-torch.ones((1, self.state_dim)), 34 | high=torch.ones((1, self.state_dim)), 35 | ) 36 | 37 | self.observation_spec = CompositeSpec(shape=(1,)) 38 | self.observation_spec.set(self.observation_key, observation_spec) 39 | super().__init__(batch_size=self._batch_size) 40 | 41 | def _set_seed(self, seed: int): 42 | return super()._set_seed(seed) 43 | 44 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 45 | """ 46 | Reset the environment and return the initial state. 47 | 48 | Returns: 49 | TensorDictBase: The initial state of the environment. 
50 | """ 51 | # TODO solve this fake action sending before to receive first state 52 | self.episode_step_iter = 0 53 | observation = self.observation_spec[self.observation_key].rand() 54 | return TensorDict( 55 | { 56 | self.observation_key: observation.float(), 57 | }, 58 | batch_size=[1], 59 | ) 60 | 61 | def reward( 62 | self, 63 | action: np.ndarray, 64 | next_state: np.ndarray, 65 | ) -> Tuple[float, bool]: 66 | """ """ 67 | return 0.0, False 68 | 69 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 70 | """ """ 71 | action = tensordict.get("action").cpu().numpy() 72 | next_observation = self.observation_spec[self.observation_key].rand() 73 | 74 | reward, done = self.reward( 75 | action=action, 76 | next_state=next_observation, 77 | ) 78 | next_tensordict = TensorDict( 79 | { 80 | self.observation_key: next_observation.float(), 81 | "reward": torch.tensor([reward]).float(), 82 | "done": torch.tensor([done]).bool(), 83 | }, 84 | batch_size=[1], 85 | ) 86 | 87 | # increment episode step counter 88 | self.episode_step_iter += 1 89 | if self.episode_step_iter >= self.max_episode_steps: 90 | next_tensordict.set("done", torch.tensor([True])) 91 | return next_tensordict 92 | 93 | 94 | class VecGoalObsDummyEnv(EnvBase): 95 | """ 96 | VecGoalObsDummyEnv is a dummy environment for testing purposes. 97 | It does not connec to Pybricks 98 | 99 | """ 100 | 101 | action_dim = 4 102 | state_dim = 7 103 | observation_key = "observation" 104 | goal_observation_key = "goal_observation" 105 | 106 | def __init__(self, max_episode_steps=10): 107 | self.max_episode_steps = max_episode_steps 108 | self._batch_size = torch.Size([1]) 109 | self.action_spec = BoundedTensorSpec( 110 | low=-torch.ones((1, self.action_dim)), 111 | high=torch.ones((1, self.action_dim)), 112 | shape=(1, self.action_dim), 113 | ) 114 | 115 | observation_spec = BoundedTensorSpec( 116 | low=-torch.ones((1, self.state_dim)), 117 | high=torch.ones((1, self.state_dim)), 118 | ) 119 | 120 | self.observation_spec = CompositeSpec(shape=(1,)) 121 | self.observation_spec.set(self.observation_key, observation_spec) 122 | self.observation_spec.set(self.goal_observation_key, observation_spec) 123 | super().__init__(batch_size=self._batch_size) 124 | 125 | def _set_seed(self, seed: int): 126 | return super()._set_seed(seed) 127 | 128 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 129 | """ 130 | Reset the environment and return the initial state. 131 | 132 | Returns: 133 | TensorDictBase: The initial state of the environment. 
134 | """ 135 | # TODO solve this fake action sending before to receive first state 136 | self.episode_step_iter = 0 137 | observation = self.observation_spec[self.observation_key].rand() 138 | goal_observation = self.observation_spec[self.goal_observation_key].rand() 139 | return TensorDict( 140 | { 141 | self.observation_key: observation.float(), 142 | self.goal_observation_key: goal_observation.float(), 143 | }, 144 | batch_size=[1], 145 | ) 146 | 147 | def reward( 148 | self, 149 | action: np.ndarray, 150 | next_state: np.ndarray, 151 | ) -> Tuple[float, bool]: 152 | """ """ 153 | return 0.0, False 154 | 155 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 156 | """ """ 157 | action = tensordict.get("action").cpu().numpy() 158 | next_observation = self.observation_spec[self.observation_key].rand() 159 | goal = tensordict.get(self.goal_observation_key) 160 | 161 | reward, done = self.reward( 162 | action=action, 163 | next_state=next_observation, 164 | ) 165 | next_tensordict = TensorDict( 166 | { 167 | self.observation_key: next_observation.float(), 168 | self.goal_observation_key: goal.float(), 169 | "reward": torch.tensor([reward]).float(), 170 | "done": torch.tensor([done]).bool(), 171 | }, 172 | batch_size=[1], 173 | ) 174 | 175 | # increment episode step counter 176 | self.episode_step_iter += 1 177 | if self.episode_step_iter >= self.max_episode_steps: 178 | next_tensordict.set("done", torch.tensor([True])) 179 | return next_tensordict 180 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_mixed_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | hub = InventorHub() 12 | 13 | # Initialize and set the motors 14 | high_motor_range = (-150, 10) 15 | high_motor = Motor(Port.A) 16 | high_motor.run_target(speed=400, target_angle=-70) 17 | 18 | low_motor_range = (10, 75) 19 | low_motor = Motor(Port.D) 20 | low_motor.control.limits(500, 1000, 900) 21 | low_motor.run_target(speed=200, target_angle=40) 22 | 23 | rotation_motor_range = (-140, 40) 24 | rotation_motor = Motor(Port.B, gears=[20, 60]) 25 | motors = {"HM": high_motor, "LM": low_motor, "RM": rotation_motor} 26 | 27 | 28 | def get_current_motor_angles(): 29 | angles = {} 30 | for k, v in motors.items(): 31 | angle = normalize_angle(get_angle(v)) 32 | angles.update({k: angle}) 33 | return angles 34 | 35 | 36 | def run_angle(motor, angle, speed=300): 37 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False) 38 | 39 | 40 | def get_angle(motor): 41 | return motor.angle() 42 | 43 | 44 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 45 | # Normalize angle to be within -179 to 179 degrees 46 | while angle <= low_angle: 47 | angle += original_one_round 48 | while angle > high_angle: 49 | angle -= original_one_round 50 | return angle 51 | 52 | 53 | def transform_range(value, old_min, old_max, new_min, new_max): 54 | """ 55 | Transform a value from one range to another. 56 | 57 | Parameters: 58 | value (float): The value to transform. 59 | old_min (float): The minimum value of the old range. 60 | old_max (float): The maximum value of the old range. 
61 | new_min (float): The minimum value of the new range. 62 | new_max (float): The maximum value of the new range. 63 | 64 | Returns: 65 | float: The transformed value. 66 | """ 67 | # Compute the scale factor between the old and new ranges 68 | scale = (new_max - new_min) / (old_max - old_min) 69 | # Apply the transformation 70 | return new_min + (value - old_min) * scale 71 | 72 | 73 | keyboard = poll() 74 | keyboard.register(stdin) 75 | motor_speed = 250 76 | 77 | while True: 78 | 79 | while not keyboard.poll(0): 80 | wait(1) 81 | 82 | # Read action values for the motors 83 | data = stdin.buffer.read(12) 84 | rotation_action, low_action, high_action = ustruct.unpack("!fff", data) 85 | 86 | # Transform action range for motors 87 | high_action = transform_range(high_action, -1, 1, -60, 60) 88 | low_action = transform_range(low_action, -1, 1, -30, 30) 89 | rotation_action = transform_range(rotation_action, -1, 1, -90, 90) 90 | 91 | angles = get_current_motor_angles() 92 | 93 | # Adjust high action to ensure it stays within range after being applied 94 | if angles["HM"] + high_action > max(high_motor_range): 95 | high_action = max(high_motor_range) - angles["HM"] 96 | elif angles["HM"] + high_action < min(high_motor_range): 97 | high_action = min(high_motor_range) - angles["HM"] 98 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False) 99 | 100 | # Adjust low action to ensure it stays within range after being applied 101 | if angles["LM"] + low_action > max(low_motor_range): 102 | low_action = max(low_motor_range) - angles["LM"] 103 | elif angles["LM"] + low_action < min(low_motor_range): 104 | low_action = min(low_motor_range) - angles["LM"] 105 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False) 106 | 107 | # Adjust rotation action to ensure it stays within range after being applied 108 | if angles["RM"] + rotation_action > max(rotation_motor_range): 109 | rotation_action = max(rotation_motor_range) - angles["RM"] 110 | elif angles["RM"] + rotation_action < min(rotation_motor_range): 111 | rotation_action = min(rotation_motor_range) - angles["RM"] 112 | rotation_motor.control.limits(250, 200, 500) 113 | rotation_motor.run_angle( 114 | speed=motor_speed, rotation_angle=rotation_action, wait=False 115 | ) 116 | 117 | # Small delay to let motors arrive target angle 118 | wait(250) 119 | 120 | # Sometimes low angle jumps out of range and cant move back this corrects those cases 121 | if low_angle < 10: 122 | low_motor.run_target(speed=200, target_angle=10) 123 | 124 | # Read sensors to get current state of the robot 125 | high_angle = high_motor.angle() 126 | low_angle = low_motor.angle() 127 | rotation_angle = rotation_motor.angle() 128 | 129 | # Send current state back to environment 130 | out_msg = ustruct.pack( 131 | "!fff", 132 | high_angle, 133 | low_angle, 134 | rotation_angle, 135 | ) 136 | stdout.buffer.write(out_msg) 137 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_v0/RoboArmEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env import BaseEnv 8 | from numpy import linalg 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 11 | 12 | 13 | class RoboArmEnv_v0(BaseEnv): 14 | """ """ 15 | 16 | action_dim = 4 # 
(Grab_motor_action, high_motor_action, low_motor_action, rotation_motor_action) 17 | 18 | state_dim = 4 # (GM, HM, LM, RM) 19 | 20 | observation_ranges = { 21 | "GM": (-148, -44), 22 | "HM": (-150, 10), 23 | "LM": (10, 70), 24 | "RM": (-180, 179), 25 | } 26 | 27 | observation_key = "observation" 28 | goal_observation_key = "goal_observation" 29 | 30 | def __init__( 31 | self, 32 | max_episode_steps: int = 50, 33 | sleep_time: float = 0.0, 34 | verbose: bool = False, 35 | pretrain: bool = False, 36 | reward_signal: str = "dense", 37 | ): 38 | self.sleep_time = sleep_time 39 | 40 | assert reward_signal in [ 41 | "dense", 42 | "sparse", 43 | ], "Reward signal must be dense or sparse." 44 | self.reward_signal = reward_signal 45 | self.max_episode_steps = max_episode_steps 46 | self._batch_size = torch.Size([1]) 47 | 48 | # Define action spec 49 | self.action_spec = BoundedTensorSpec( 50 | low=-1, 51 | high=1, 52 | shape=(1, self.action_dim), 53 | ) 54 | 55 | self.goal_thresholds = np.array( 56 | [50] 57 | ) # everythin below 20 is very good. 50 is good! 58 | # Observation 4 motors (GM, HM, LM, RM) + goal positions (GGM, GHM, GLM, GRM) 59 | # Define observation spec 60 | bounds = torch.tensor( 61 | [ 62 | self.observation_ranges["GM"], 63 | self.observation_ranges["HM"], 64 | self.observation_ranges["LM"], 65 | self.observation_ranges["RM"], 66 | ] 67 | ) 68 | 69 | low_bounds = bounds[:, 0].unsqueeze(0) 70 | high_bounds = bounds[:, 1].unsqueeze(0) 71 | 72 | observation_spec = BoundedTensorSpec( 73 | low=low_bounds, 74 | high=high_bounds, 75 | ) 76 | 77 | self.observation_spec = CompositeSpec(shape=(1,)) 78 | self.observation_spec.set(self.observation_key, observation_spec) 79 | self.observation_spec.set(self.goal_observation_key, observation_spec) 80 | super().__init__( 81 | action_dim=self.action_dim, 82 | state_dim=self.state_dim, 83 | verbose=verbose, 84 | use_hub=1 - pretrain, 85 | ) 86 | 87 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 88 | """ 89 | Reset the environment and return the initial state. 90 | 91 | Returns: 92 | TensorDictBase: The initial state of the environment. 93 | """ 94 | # TODO solve this fake action sending before to receive first state 95 | self.episode_step_iter = 0 96 | if tensordict is not None: 97 | action = tensordict.get("action").cpu().numpy().squeeze() 98 | else: 99 | action = np.zeros(self.action_dim) 100 | self.send_to_hub(action) 101 | time.sleep(self.sleep_time) 102 | observation = self.read_from_hub() 103 | # sample random goal state 104 | self.goal_observation = ( 105 | self.observation_spec[self.goal_observation_key].rand().numpy() 106 | ) 107 | 108 | return TensorDict( 109 | { 110 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 111 | self.goal_observation_key: torch.tensor( 112 | self.goal_observation, dtype=torch.float32 113 | ), 114 | "error": torch.tensor([0]).float(), 115 | }, 116 | batch_size=[1], 117 | ) 118 | 119 | @staticmethod 120 | def shortest_angular_distance_vectorized( 121 | theta_goal: np.array, theta_current: np.array 122 | ) -> float: 123 | """ 124 | Calculate the shortest angular distance between two arrays of angles. 125 | 126 | Parameters: 127 | - theta_goal: Array of goal angles in degrees. 128 | - theta_current: Array of current angles in degrees. 129 | 130 | Returns: 131 | - Array of the shortest angular distances in degrees. 
132 | """ 133 | 134 | # Convert angles from degrees to radians 135 | theta_goal_rad = np.radians(theta_goal) 136 | theta_current_rad = np.radians(theta_current) 137 | 138 | # Calculate difference in radians using np.arctan2 for vectorized operation 139 | delta_theta_rad = np.arctan2( 140 | np.sin(theta_goal_rad - theta_current_rad), 141 | np.cos(theta_goal_rad - theta_current_rad), 142 | ) 143 | 144 | # Convert result back to degrees 145 | delta_theta_deg = np.degrees(delta_theta_rad) 146 | 147 | return delta_theta_deg 148 | 149 | def reward( 150 | self, 151 | achieved_state: np.array, 152 | ) -> Tuple[float, bool]: 153 | """Reward function of roboarm. 154 | 155 | Args: 156 | achieved_state (np.ndarray): The achieved state. 157 | goal_state (np.ndarray): The goal state. 158 | 159 | Returns: 160 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 161 | """ 162 | 163 | done = False 164 | if self.reward_signal == "dense": 165 | angle_deltas = self.shortest_angular_distance_vectorized( 166 | self.goal_observation, achieved_state 167 | ) 168 | error = np.sum(np.abs(angle_deltas)) 169 | reward = -error / 100 170 | if error < np.mean(self.goal_thresholds): 171 | done = True 172 | elif self.reward_signal == "sparse": 173 | angle_deltas = self.shortest_angular_distance_vectorized( 174 | self.goal_observation, achieved_state 175 | ) 176 | error = np.sum(np.abs(angle_deltas)) 177 | if np.all(error <= self.goal_thresholds): 178 | reward = 1 179 | done = True 180 | 181 | else: 182 | reward = 0 183 | else: 184 | raise ValueError("Reward signal must be dense or sparse.") 185 | 186 | return reward, done, error 187 | 188 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 189 | """ """ 190 | # Send action to hub to receive next state 191 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) 192 | time.sleep( 193 | self.sleep_time 194 | ) # we need to wait some time for sensors to read and to 195 | 196 | # receive the next state 197 | next_observation = self.read_from_hub() 198 | 199 | # calc reward and done 200 | reward, done, error = self.reward( 201 | achieved_state=next_observation, 202 | ) 203 | 204 | next_tensordict = TensorDict( 205 | { 206 | self.observation_key: torch.tensor( 207 | next_observation, dtype=torch.float32 208 | ), 209 | self.goal_observation_key: torch.tensor( 210 | self.goal_observation, dtype=torch.float32 211 | ), 212 | "reward": torch.tensor([reward]).float(), 213 | "done": torch.tensor([done]).bool(), 214 | "error": torch.tensor([error]).float(), 215 | }, 216 | batch_size=[1], 217 | ) 218 | 219 | # increment episode step counter 220 | self.episode_step_iter += 1 221 | if self.episode_step_iter >= self.max_episode_steps: 222 | next_tensordict.set("done", torch.tensor([True])) 223 | return next_tensordict 224 | -------------------------------------------------------------------------------- /bricksrl/environments/roboarm_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | 12 | hub = InventorHub() 13 | 14 | # Initialize and set the motors 15 | grab_motor_range = (-148, -45) 16 | grab_motor = Motor(Port.E) 17 | grab_motor.run_target(speed=400, target_angle=-95) # start 
roughly in the middle 18 | 19 | high_motor_range = (-150, 10) 20 | high_motor = Motor(Port.A) 21 | high_motor.run_target(speed=400, target_angle=-70) 22 | 23 | low_motor_range = (10, 70) 24 | low_motor = Motor(Port.D) 25 | low_motor.control.limits(500, 1000, 900) 26 | low_motor.run_target(speed=400, target_angle=40) 27 | 28 | rotation_motor = Motor(Port.B, gears=[20, 60]) 29 | 30 | motors = {"GM": grab_motor, "HM": high_motor, "LM": low_motor, "RM": rotation_motor} 31 | 32 | 33 | def get_current_motor_angles(): 34 | angles = {} 35 | for k, v in motors.items(): 36 | angle = normalize_angle(get_angle(v)) 37 | angles.update({k: angle}) 38 | return angles 39 | 40 | 41 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 42 | # Normalize angle to be within -179 to 179 degrees 43 | while angle <= low_angle: 44 | angle += original_one_round 45 | while angle > high_angle: 46 | angle -= original_one_round 47 | return angle 48 | 49 | 50 | def run_angle(motor, angle, speed=300): 51 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False) 52 | 53 | 54 | def get_angle(motor): 55 | return motor.angle() 56 | 57 | 58 | def transform_range(value, old_min, old_max, new_min, new_max): 59 | """ 60 | Transform a value from one range to another. 61 | 62 | Parameters: 63 | value (float): The value to transform. 64 | old_min (float): The minimum value of the old range. 65 | old_max (float): The maximum value of the old range. 66 | new_min (float): The minimum value of the new range. 67 | new_max (float): The maximum value of the new range. 68 | 69 | Returns: 70 | float: The transformed value. 71 | """ 72 | # Compute the scale factor between the old and new ranges 73 | scale = (new_max - new_min) / (old_max - old_min) 74 | # Apply the transformation 75 | return new_min + (value - old_min) * scale 76 | 77 | 78 | keyboard = poll() 79 | keyboard.register(stdin) 80 | motor_speed = 250 81 | 82 | while True: 83 | 84 | while not keyboard.poll(0): 85 | wait(1) 86 | 87 | # Read action values for the motors 88 | data = stdin.buffer.read(16) # Reading 4 bytes (4 floats) 89 | rotation_action, low_action, high_action, grab_action = ustruct.unpack( 90 | "!ffff", data 91 | ) 92 | 93 | # Transform action range for motors 94 | grab_action = transform_range(grab_action, -1, 1, -25, 25) 95 | high_action = transform_range(high_action, -1, 1, -60, 60) 96 | low_action = transform_range(low_action, -1, 1, -30, 30) 97 | rotation_action = transform_range(rotation_action, -1, 1, -100, 100) 98 | 99 | angles = get_current_motor_angles() 100 | 101 | # Adjust grab action to ensure it stays within range after being applied 102 | if angles["GM"] + grab_action > max(grab_motor_range): 103 | grab_action = max(grab_motor_range) - angles["GM"] 104 | elif angles["GM"] + grab_action < min(grab_motor_range): 105 | grab_action = min(grab_motor_range) - angles["GM"] 106 | grab_motor.run_angle(speed=motor_speed, rotation_angle=grab_action, wait=False) 107 | 108 | # Adjust high action to ensure it stays within range after being applied 109 | if angles["HM"] + high_action > max(high_motor_range): 110 | high_action = max(high_motor_range) - angles["HM"] 111 | elif angles["HM"] + high_action < min(high_motor_range): 112 | high_action = min(high_motor_range) - angles["HM"] 113 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False) 114 | 115 | # Adjust low action to ensure it stays within range after being applied 116 | if angles["LM"] + low_action > max(low_motor_range): 117 | low_action = 
max(low_motor_range) - angles["LM"] 118 | elif angles["LM"] + low_action < min(low_motor_range): 119 | low_action = min(low_motor_range) - angles["LM"] 120 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False) 121 | rotation_motor.run_angle( 122 | speed=motor_speed, rotation_angle=rotation_action, wait=False 123 | ) 124 | 125 | # Small delay to let motors arrive target angle 126 | wait(250) 127 | 128 | # Sometimes low angle jumps out of range and cant move back this corrects those cases 129 | if low_angle < 10: 130 | low_motor.run_target(speed=200, target_angle=10) 131 | 132 | # Read sensors to get current state of the robot 133 | rotation_angle = rotation_motor.angle() 134 | high_angle = high_motor.angle() 135 | grab_angle = grab_motor.angle() 136 | low_angle = low_motor.angle() 137 | 138 | # Send current state back to environment 139 | out_msg = ustruct.pack( 140 | "!ffff", grab_angle, high_angle, low_angle, normalize_angle(rotation_angle) 141 | ) 142 | stdout.buffer.write(out_msg) 143 | -------------------------------------------------------------------------------- /bricksrl/environments/runaway_v0/RunAwayEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | from bricksrl.environments.base.base_env import BaseEnv 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | 10 | 11 | class RunAwayEnv_v0(BaseEnv): 12 | """ 13 | A reinforcement learning environment for training agents to get away from a wall. 14 | 15 | The goal of the agent is to increase the distance measured by an ultrasonic sensor and get away from the wall as fast as possible. 16 | The environment provides a state consisting of 4 sensor readings (left, right, pitch, roll) and the distance to the wall. 17 | The agent can take a continuous action in the range [-1, 1] to control the movement of the robot. 18 | The environment returns a reward based on the change in distance to the wall and terminates the episode if the robot gets too close to the wall or the maximum number of steps is reached. 19 | 20 | Args: 21 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 10. 22 | min_distance (float): The minimum distance to the wall. Defaults to 40. 23 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2. 24 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False. 
25 | 26 | """ 27 | 28 | action_dim = 1 # control the wheel motors together 29 | # 5 sensors (left motor angle, right motor angle, pitch, roll, distance) 30 | state_dim = 5 31 | 32 | observation_ranges = { 33 | "left_motor_angles": [0, 360], 34 | "right_motor_angles": [0, 360], 35 | "roll_angle": [-90, 90], 36 | "pitch_angle": [-90, 90], 37 | "distance": [0, 2000], 38 | } 39 | 40 | observation_key = "observation" 41 | 42 | def __init__( 43 | self, 44 | max_episode_steps: int = 10, 45 | min_distance: float = 40, 46 | sleep_time: float = 0.2, 47 | verbose: bool = False, 48 | pretrain: bool = False, 49 | ): 50 | self.sleep_time = sleep_time 51 | self.min_distance = min_distance 52 | self.max_episode_steps = max_episode_steps 53 | self._batch_size = torch.Size([1]) 54 | 55 | # Define action spec 56 | self.action_spec = BoundedTensorSpec( 57 | low=-1, 58 | high=1, 59 | shape=(1, self.action_dim), 60 | ) 61 | 62 | # Define observation spec 63 | bounds = torch.tensor( 64 | [ 65 | self.observation_ranges["left_motor_angles"], 66 | self.observation_ranges["right_motor_angles"], 67 | self.observation_ranges["roll_angle"], 68 | self.observation_ranges["pitch_angle"], 69 | self.observation_ranges["distance"], 70 | ] 71 | ) 72 | 73 | low_bounds = bounds[:, 0].unsqueeze(0) 74 | high_bounds = bounds[:, 1].unsqueeze(0) 75 | 76 | observation_spec = BoundedTensorSpec( 77 | low=low_bounds, 78 | high=high_bounds, 79 | ) 80 | self.observation_spec = CompositeSpec( 81 | {self.observation_key: observation_spec}, shape=(1,) 82 | ) 83 | self.verbose = verbose 84 | super().__init__( 85 | action_dim=self.action_dim, 86 | state_dim=self.state_dim, 87 | verbose=verbose, 88 | use_hub=1 - pretrain, 89 | ) 90 | 91 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 92 | """ 93 | Reset the environment and return the initial state. 94 | 95 | Returns: 96 | TensorDictBase: The initial state of the environment. 97 | """ 98 | # TODO solve this fake action sending before to receive first state 99 | self.episode_step_iter = 0 100 | if tensordict is not None: 101 | action = tensordict.get("action").cpu().numpy().squeeze(0) 102 | else: 103 | action = np.zeros(self.action_dim) 104 | self.send_to_hub(action) 105 | time.sleep(self.sleep_time) 106 | observation = self.read_from_hub() 107 | self.distance = observation[:, -1] 108 | return TensorDict( 109 | { 110 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 111 | "distance": torch.tensor([self.distance]).float(), 112 | }, 113 | batch_size=[1], 114 | ) 115 | 116 | def reward(self, next_observation: np.array) -> Tuple[float, bool]: 117 | """Reward function of RunAwayEnv. 118 | 119 | Goal: Increase distance measured by ultrasonic sensor aka. 120 | get away from the wall as fast as possible. 
121 | 122 | """ 123 | done = False 124 | 125 | current_distance = next_observation[:, -1] 126 | if current_distance <= self.min_distance: # too close to the wall break episode 127 | done = True 128 | reward = 0.0 129 | elif current_distance < self.distance: 130 | reward = -1.0 131 | elif current_distance > self.distance: 132 | reward = 1.0 133 | else: 134 | reward = 0.0 135 | if self.distance >= 2000: 136 | done = True 137 | self.distance = current_distance 138 | return reward, done 139 | 140 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 141 | """ """ 142 | # Send action to hub to receive next state 143 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze(0)) 144 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 145 | 146 | # receive the next state 147 | next_observation = self.read_from_hub() 148 | 149 | # calc reward and done 150 | reward, done = self.reward( 151 | next_observation=next_observation, 152 | ) 153 | 154 | next_tensordict = TensorDict( 155 | { 156 | self.observation_key: torch.tensor( 157 | next_observation, dtype=torch.float32 158 | ), 159 | "reward": torch.tensor([reward]).float(), 160 | "done": torch.tensor([done]).bool(), 161 | "distance": torch.tensor([self.distance]).float(), 162 | }, 163 | batch_size=[1], 164 | ) 165 | 166 | # increment episode step counter 167 | self.episode_step_iter += 1 168 | if self.episode_step_iter >= self.max_episode_steps: 169 | next_tensordict.set("done", torch.tensor([True])) 170 | return next_tensordict 171 | -------------------------------------------------------------------------------- /bricksrl/environments/runaway_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Direction, Port 5 | from pybricks.pupdevices import Motor, UltrasonicSensor 6 | from pybricks.robotics import DriveBase 7 | from pybricks.tools import wait 8 | from uselect import poll 9 | from usys import stdin, stdout 10 | 11 | kbd_intr(-1) 12 | 13 | 14 | def normalize_angle(angle): 15 | # Normalize angle to be within 0 and 360 16 | while angle <= 0: 17 | angle += 360 18 | while angle > 360: 19 | angle -= 360 20 | return angle 21 | 22 | 23 | def transform_range(value, old_min, old_max, new_min, new_max): 24 | """ 25 | Transform a value from one range to another. 26 | 27 | Parameters: 28 | value (float): The value to transform. 29 | old_min (float): The minimum value of the old range. 30 | old_max (float): The maximum value of the old range. 31 | new_min (float): The minimum value of the new range. 32 | new_max (float): The maximum value of the new range. 33 | 34 | Returns: 35 | float: The transformed value. 36 | """ 37 | # Compute the scale factor between the old and new ranges 38 | scale = (new_max - new_min) / (old_max - old_min) 39 | # Apply the transformation 40 | return new_min + (value - old_min) * scale 41 | 42 | 43 | kbd_intr(-1) 44 | hub = InventorHub() 45 | 46 | # Initialize the drive base. 47 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE) 48 | right_motor = Motor(Port.A) 49 | drive_base = DriveBase(left_motor, right_motor, wheel_diameter=56, axle_track=130) 50 | # Initialize the distance sensor. 51 | sensor = UltrasonicSensor(Port.C) 52 | 53 | keyboard = poll() 54 | keyboard.register(stdin) 55 | 56 | while True: 57 | 58 | # Optional: Check available input. 
59 | while not keyboard.poll(0): 60 | wait(1) 61 | 62 | # Read action values for the motors 63 | action_value = ustruct.unpack("!f", stdin.buffer.read(4))[0] 64 | action = transform_range(action_value, -1, 1, -100, 100) 65 | 66 | drive_base.straight(action, wait=True) 67 | 68 | # Read sensors to get current state of the robot 69 | (left, right) = (left_motor.angle(), right_motor.angle()) 70 | (pitch, roll) = hub.imu.tilt() 71 | dist = sensor.distance() 72 | 73 | # Send current state back to environment 74 | out_msg = ustruct.pack( 75 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, dist 76 | ) 77 | stdout.buffer.write(out_msg) 78 | -------------------------------------------------------------------------------- /bricksrl/environments/spinning_v0/SpinningEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | import torch 6 | from bricksrl.environments.base.base_env import BaseEnv 7 | from tensordict import TensorDict, TensorDictBase 8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 9 | 10 | 11 | class SpinningEnv_v0(BaseEnv): 12 | """ 13 | SpinningEnv_v0 is a custom gym environment for a spinning robot. 14 | The robot has to learn to spin in a circle around its own axis given a random goal direction (left or right, 0 or 1). 15 | 16 | Args: 17 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 50. 18 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2. 19 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False. 20 | 21 | """ 22 | 23 | action_dim = 2 # to control the wheel motors independently 24 | state_dim = 5 # 5 sensors (left, right, pitch, roll, rotation_velocity) + 1 direction (left or right) 25 | 26 | observation_ranges = { 27 | "left_motor_angle": [0, 360], 28 | "right_motor_angle": [0, 360], 29 | "pitch_angle": [-90, 90], 30 | "roll_angle": [-90, 90], 31 | "rotation_velocity": [-100, 100], 32 | "direction": [0, 1], 33 | } 34 | 35 | observation_key = "observation" 36 | 37 | def __init__( 38 | self, 39 | max_episode_steps: int = 50, 40 | sleep_time: float = 0.2, 41 | verbose: bool = False, 42 | pretrain: bool = False, 43 | ): 44 | self.sleep_time = sleep_time 45 | self._batch_size = torch.Size([1]) 46 | self.max_episode_steps = max_episode_steps 47 | 48 | # Define action spec 49 | self.action_spec = BoundedTensorSpec( 50 | low=-1, 51 | high=1, 52 | shape=(1, self.action_dim), 53 | ) 54 | 55 | # Define observation spec 56 | bounds = torch.tensor( 57 | [ 58 | self.observation_ranges["left_motor_angle"], 59 | self.observation_ranges["right_motor_angle"], 60 | self.observation_ranges["pitch_angle"], 61 | self.observation_ranges["roll_angle"], 62 | self.observation_ranges["rotation_velocity"], 63 | self.observation_ranges["direction"], 64 | ] 65 | ) 66 | low_bounds = bounds[:, 0].unsqueeze(0) 67 | high_bounds = bounds[:, 1].unsqueeze(0) 68 | 69 | observation_spec = BoundedTensorSpec( 70 | low=low_bounds, 71 | high=high_bounds, 72 | ) 73 | self.observation_spec = CompositeSpec( 74 | {self.observation_key: observation_spec}, shape=(1,) 75 | ) 76 | 77 | super().__init__( 78 | action_dim=self.action_dim, 79 | state_dim=self.state_dim, 80 | verbose=verbose, 81 | use_hub=1 - pretrain, 82 | ) 83 | 84 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 85 | """ 86 | Reset the 
environment and return the initial state. 87 | 88 | Returns: 89 | TensorDictBase: The initial state of the environment. 90 | """ 91 | # TODO solve this fake action sending before to receive first state 92 | self.episode_step_iter = 0 93 | if tensordict is not None: 94 | action = tensordict.get("action").cpu().numpy().squeeze() 95 | else: 96 | action = np.zeros(self.action_dim) 97 | self.send_to_hub(action) 98 | time.sleep(self.sleep_time) 99 | 100 | state = self.read_from_hub() 101 | self.direction = np.random.randint(0, 2) # (0,1) left or right 102 | full_original_state = np.concatenate( 103 | (state, np.array([[self.direction]])), axis=1, dtype=np.float32 104 | ) 105 | 106 | return TensorDict( 107 | { 108 | self.observation_key: torch.tensor(full_original_state), 109 | }, 110 | batch_size=[1], 111 | ) 112 | 113 | def reward(self, next_observation: np.array) -> Tuple[float, bool]: 114 | """Reward function of Spinning environment. 115 | If the self.direction is 0, the robot is spinning left, otherwise right. 116 | We want to maximise in those cases the angular velocity (last element of the state vector). 117 | If the robot is spinning in the wrong direction, we want to minimize the angular velocity. 118 | """ 119 | done = False 120 | velocity = next_observation[:, -2] 121 | 122 | if self.direction == 0: 123 | reward = velocity 124 | else: 125 | reward = -velocity 126 | 127 | return reward, done 128 | 129 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 130 | """ """ 131 | # Send action to hub to receive next state 132 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze()) 133 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 134 | # receive the next state 135 | next_observation = self.read_from_hub() 136 | full_original_next_observation = np.concatenate( 137 | (next_observation, np.array([[self.direction]])), axis=1, dtype=np.float32 138 | ) 139 | # calc reward and done 140 | reward, done = self.reward(full_original_next_observation) 141 | 142 | next_tensordict = TensorDict( 143 | { 144 | self.observation_key: torch.tensor(full_original_next_observation), 145 | "reward": torch.tensor([reward]).float(), 146 | "done": torch.tensor([done]).bool(), 147 | }, 148 | batch_size=[1], 149 | ) 150 | # increment episode step counter 151 | self.episode_step_iter += 1 152 | if self.episode_step_iter >= self.max_episode_steps: 153 | next_tensordict.set("done", torch.tensor([True]).bool()) 154 | 155 | return next_tensordict 156 | -------------------------------------------------------------------------------- /bricksrl/environments/spinning_v0/client.py: -------------------------------------------------------------------------------- 1 | import ustruct 2 | from micropython import kbd_intr 3 | from pybricks.hubs import InventorHub 4 | from pybricks.parameters import Axis, Direction, Port 5 | from pybricks.pupdevices import Motor 6 | from pybricks.tools import wait 7 | from uselect import poll 8 | from usys import stdin, stdout 9 | 10 | kbd_intr(-1) 11 | hub = InventorHub() 12 | 13 | # Initialize and set the motors 14 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE) 15 | right_motor = Motor(Port.A) 16 | 17 | keyboard = poll() 18 | keyboard.register(stdin) 19 | 20 | 21 | def normalize_angle(angle): 22 | # Normalize angle to be within 0 and 360 23 | while angle <= 0: 24 | angle += 360 25 | while angle > 360: 26 | angle -= 360 27 | return angle 28 | 29 | 30 | def transform_range(value, old_min, old_max, new_min, new_max): 31 | """ 32 | Transform a 
value from one range to another. 33 | 34 | Parameters: 35 | value (float): The value to transform. 36 | old_min (float): The minimum value of the old range. 37 | old_max (float): The maximum value of the old range. 38 | new_min (float): The minimum value of the new range. 39 | new_max (float): The maximum value of the new range. 40 | 41 | Returns: 42 | float: The transformed value. 43 | """ 44 | # Compute the scale factor between the old and new ranges 45 | scale = (new_max - new_min) / (old_max - old_min) 46 | # Apply the transformation 47 | return new_min + (value - old_min) * scale 48 | 49 | 50 | while True: 51 | 52 | while not keyboard.poll(0): 53 | wait(1) 54 | 55 | # Read action values for both motors 56 | data = stdin.buffer.read(8) # Reading 8 bytes (two floats) 57 | left_action_value, right_action_value = ustruct.unpack("!ff", data) 58 | 59 | # Apply action to each motor 60 | left_motor.run_angle( 61 | speed=400, 62 | rotation_angle=transform_range(left_action_value, -1, 1, -100, 100), 63 | wait=False, 64 | ) 65 | right_motor.run_angle( 66 | speed=400, 67 | rotation_angle=transform_range(right_action_value, -1, 1, -100, 100), 68 | wait=False, 69 | ) 70 | 71 | wait(100) # Small delay 72 | 73 | # Read sensors to get current state of the robot 74 | (left, right) = (left_motor.angle(), right_motor.angle()) 75 | (pitch, roll) = hub.imu.tilt() 76 | z_angl_vel = hub.imu.angular_velocity(Axis.Z) 77 | 78 | # Send current state back to environment 79 | out_msg = ustruct.pack( 80 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, z_angl_vel 81 | ) 82 | stdout.buffer.write(out_msg) 83 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/WalkerEnv.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Tuple 3 | 4 | import numpy as np 5 | 6 | import torch 7 | 8 | from bricksrl.environments.base.base_env import BaseEnv 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 11 | 12 | 13 | class WalkerEnv_v0(BaseEnv): 14 | """ 15 | A reinforcement learning environment for the robodog to learn to walk. 16 | 17 | Specific to the walker_v0 environment is, that the reward function is hard coded to learn a gait routine. 18 | In contrast to the walker_v1 environment, the reward function is not based on the acceleration of the robot. 19 | 20 | Args: 21 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 10. 22 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.0. 23 | verbose (bool): Whether to print additional information. Defaults to False. 
24 | 25 | """ 26 | 27 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value) 28 | # angles are in range [-179, 179] 29 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x) 30 | 31 | observation_ranges = { 32 | "lf_angle": [-179, 179], 33 | "rf_angle": [-179, 179], 34 | "lb_angle": [-179, 179], 35 | "rb_angle": [-179, 179], 36 | "pitch": [-50, 50], 37 | "roll": [-50, 50], 38 | "acc_x": [-3000, 3000], 39 | } 40 | 41 | observation_key = "observation" 42 | 43 | def __init__( 44 | self, 45 | max_episode_steps: int = 50, 46 | sleep_time: float = 0.0, 47 | verbose: bool = False, 48 | pretrain: bool = False, 49 | ): 50 | self.sleep_time = sleep_time 51 | self._batch_size = torch.Size([1]) 52 | self.max_episode_steps = max_episode_steps 53 | 54 | # Define action spec 55 | self.action_spec = BoundedTensorSpec( 56 | low=-1, 57 | high=1, 58 | shape=(1, self.action_dim), 59 | ) 60 | 61 | # Define observation spec 62 | bounds = torch.tensor( 63 | [ 64 | self.observation_ranges["lf_angle"], 65 | self.observation_ranges["rf_angle"], 66 | self.observation_ranges["lb_angle"], 67 | self.observation_ranges["rb_angle"], 68 | self.observation_ranges["pitch"], 69 | self.observation_ranges["roll"], 70 | self.observation_ranges["acc_x"], 71 | ] 72 | ) 73 | # Reshape bounds to (1, 7) 74 | low_bounds = bounds[:, 0].unsqueeze(0) 75 | high_bounds = bounds[:, 1].unsqueeze(0) 76 | 77 | observation_spec = BoundedTensorSpec( 78 | low=low_bounds, 79 | high=high_bounds, 80 | shape=(1, self.state_dim), 81 | ) 82 | 83 | self.observation_spec = CompositeSpec( 84 | {self.observation_key: observation_spec}, shape=(1,) 85 | ) 86 | super().__init__( 87 | action_dim=self.action_dim, 88 | state_dim=self.state_dim, 89 | verbose=verbose, 90 | use_hub=1 - pretrain, 91 | ) 92 | 93 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 94 | """ 95 | Reset the environment and return the initial state. 96 | 97 | Returns: 98 | TensorDictBase: The initial state of the environment. 99 | """ 100 | # TODO solve this fake action sending before to receive first state 101 | self.episode_step_iter = 0 102 | if tensordict is not None: 103 | action = tensordict.get("action").cpu().numpy().squeeze() 104 | else: 105 | action = np.zeros(self.action_dim) 106 | self.send_to_hub(action) 107 | time.sleep(self.sleep_time) 108 | observation = self.read_from_hub() 109 | 110 | return TensorDict( 111 | { 112 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 113 | }, 114 | batch_size=[1], 115 | ) 116 | 117 | def reward( 118 | self, 119 | action: np.ndarray, 120 | next_state: np.ndarray, 121 | ) -> Tuple[float, bool]: 122 | """Reward function of walker. 123 | 124 | Args: 125 | action (np.ndarray): The action taken. 126 | next_state (np.ndarray): The next state. 127 | 128 | Returns: 129 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 
130 | """ 131 | 132 | done = False 133 | # pitch and roll need to stay in range [-75, 75] outside done = True 134 | pitch, roll = next_state[:, -3], next_state[:, -2] 135 | if np.abs(pitch) > 100 or np.abs(roll) > 100: 136 | done = True 137 | reward = 0 138 | return reward, done 139 | 140 | ( 141 | lf_angle, 142 | rf_angle, 143 | lb_angle, 144 | rb_angle, 145 | pitch, 146 | roll, 147 | acc_x, 148 | ) = next_state.squeeze() 149 | 150 | # we want actions to be negative and high 151 | # action is in range [-1, 1] over 4 dims -> sum is in range [-4, 4] -> divide by 4 to get in range [-1, 1] 152 | action_reward = -np.sum(action) / 4 / 10 153 | # Take this off we dont want them to be similar otherwise we cant adapt for noise in the system 154 | # actions should ideally be similar something like [-0.75, -0.75, -0.75, -0.75] 155 | # action_std_reward = -np.std(action) 156 | 157 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized 158 | # divide by 180 to get in range [-1, 0] 159 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180 160 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180 161 | 162 | # we want lf_rb and rf_lb to be 180° apart 163 | # divide by 180 to get in range [-1, 0] 164 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180 165 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180 166 | 167 | if self.verbose: 168 | print("action_reward", action_reward) 169 | # print("action_std_reward", action_std_reward) 170 | print("lf_rb_diff_reward", lf_rb_diff_reward) 171 | print("rf_lb_diff_reward", rf_lb_diff_reward) 172 | print("lf_rf_180_reward", lf_rf_180_reward) 173 | 174 | reward = ( 175 | action_reward 176 | # + action_std_reward 177 | + lf_rb_diff_reward 178 | + rf_lb_diff_reward 179 | + lf_rf_180_reward 180 | + lb_rb_180_reward 181 | ) 182 | 183 | return reward.item(), done 184 | 185 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 186 | """ """ 187 | # Send action to hub to receive next state 188 | action = tensordict.get("action").cpu().numpy().squeeze() 189 | self.send_to_hub(action) 190 | time.sleep(self.sleep_time) # wait some time for sensors to read and to 191 | # receive the next state 192 | next_observation = self.read_from_hub() 193 | 194 | # calc reward and done 195 | reward, done = self.reward( 196 | action=action, 197 | next_state=next_observation, 198 | ) 199 | next_tensordict = TensorDict( 200 | { 201 | self.observation_key: torch.tensor( 202 | next_observation, dtype=torch.float32 203 | ), 204 | "reward": torch.tensor([reward]).float(), 205 | "done": torch.tensor([done]).bool(), 206 | }, 207 | batch_size=[1], 208 | ) 209 | 210 | # increment episode step counter 211 | self.episode_step_iter += 1 212 | if self.episode_step_iter >= self.max_episode_steps: 213 | next_tensordict.set("done", torch.tensor([True])) 214 | return next_tensordict 215 | 216 | 217 | def angular_difference(angle1, angle2): 218 | # Calculate the difference in angles, wrapped between -180 and 180 219 | difference = (angle2 - angle1 + 180) % 360 - 180 220 | return abs(difference) # Return the absolute value of the difference 221 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/WalkerEnvSim.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env 
import BaseSimEnv 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 10 | 11 | 12 | class WalkerEnvSim_v0(BaseSimEnv): 13 | """ """ 14 | 15 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value) 16 | # angles are in range [-179, 179] 17 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x) 18 | 19 | observation_ranges = { 20 | "lf_angle": [-179, 179], 21 | "rf_angle": [-179, 179], 22 | "lb_angle": [-179, 179], 23 | "rb_angle": [-179, 179], 24 | "pitch": [-75, 75], 25 | "roll": [-75, 75], 26 | "acc_x": [-3000, 3000], 27 | } 28 | 29 | observation_key = "observation" 30 | 31 | def __init__( 32 | self, 33 | max_episode_steps: int = 50, 34 | noise: float = 0.1, 35 | low_action_angle: int = -100, 36 | high_action_angle: int = 0, 37 | verbose: bool = False, 38 | ): 39 | self._batch_size = torch.Size([1]) 40 | self.max_episode_steps = max_episode_steps 41 | self.noise = noise 42 | self.low_action_angle = low_action_angle 43 | self.high_action_angle = high_action_angle 44 | self.current_leg_angles = None 45 | 46 | # Define action spec 47 | self.action_spec = BoundedTensorSpec( 48 | low=-1, 49 | high=1, 50 | shape=(1, self.action_dim), 51 | ) 52 | 53 | # Define observation spec 54 | bounds = torch.tensor( 55 | [ 56 | self.observation_ranges["lf_angle"], 57 | self.observation_ranges["rf_angle"], 58 | self.observation_ranges["lb_angle"], 59 | self.observation_ranges["rb_angle"], 60 | self.observation_ranges["pitch"], 61 | self.observation_ranges["roll"], 62 | self.observation_ranges["acc_x"], 63 | ] 64 | ) 65 | # Reshape bounds to (1, 7) 66 | low_bounds = bounds[:, 0].unsqueeze(0) 67 | high_bounds = bounds[:, 1].unsqueeze(0) 68 | observation_spec = BoundedTensorSpec( 69 | low=low_bounds, 70 | high=high_bounds, 71 | ) 72 | 73 | self.observation_spec = CompositeSpec( 74 | {self.observation_key: observation_spec}, shape=(1,) 75 | ) 76 | super().__init__( 77 | action_dim=self.action_dim, 78 | state_dim=self.state_dim, 79 | verbose=verbose, 80 | use_hub=False, 81 | ) 82 | 83 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 84 | """ 85 | Reset the environment and return the initial state. 86 | 87 | Returns: 88 | TensorDictBase: The initial state of the environment. 89 | """ 90 | # TODO solve this fake action sending before to receive first state 91 | self.episode_step_iter = 0 92 | 93 | observation = self.observation_spec[self.observation_key].rand() 94 | self.current_leg_angles = observation[0, :4] 95 | return TensorDict( 96 | { 97 | self.observation_key: observation, 98 | }, 99 | batch_size=[1], 100 | ) 101 | 102 | def reward( 103 | self, 104 | action: np.ndarray, 105 | next_state: np.ndarray, 106 | ) -> Tuple[float, bool]: 107 | """Reward function of walker. 108 | 109 | Args: 110 | action (np.ndarray): The action taken. 111 | next_state (np.ndarray): The next state. 112 | 113 | Returns: 114 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done. 
115 | """ 116 | 117 | done = False 118 | # pitch and roll need to stay in range [-75, 75] outside done = True 119 | pitch, roll = next_state[:, -3], next_state[:, -2] 120 | if np.abs(pitch) > 100 or np.abs(roll) > 100: 121 | done = True 122 | reward = 0 123 | return reward, done 124 | 125 | ( 126 | lf_angle, 127 | rf_angle, 128 | lb_angle, 129 | rb_angle, 130 | pitch, 131 | roll, 132 | acc_x, 133 | ) = next_state.squeeze() 134 | 135 | # we want actions to be negative and high 136 | # action is in range [-1, 1] over 4 dims -> sum is in range [-4, 4] -> divide by 4 to get in range [-1, 1] 137 | action_reward = -np.sum(action) / 4 / 10 138 | # Take this off we dont want them to be similar otherwise we cant adapt for noise in the system 139 | # actions should ideally be similar something like [-0.75, -0.75, -0.75, -0.75] 140 | # action_std_reward = -np.std(action) 141 | 142 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized 143 | # divide by 180 to get in range [-1, 0] 144 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180 145 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180 146 | 147 | # we want lf_rb and rf_lb to be 180° apart 148 | # divide by 180 to get in range [-1, 0] 149 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180 150 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180 151 | 152 | if self.verbose: 153 | print("action_reward", action_reward) 154 | # print("action_std_reward", action_std_reward) 155 | print("lf_rb_diff_reward", lf_rb_diff_reward) 156 | print("rf_lb_diff_reward", rf_lb_diff_reward) 157 | print("lf_rf_180_reward", lf_rf_180_reward) 158 | 159 | reward = ( 160 | action_reward 161 | # + action_std_reward 162 | + lf_rb_diff_reward 163 | + rf_lb_diff_reward 164 | + lf_rf_180_reward 165 | + lb_rb_180_reward 166 | ) 167 | 168 | return reward.item(), done 169 | 170 | @staticmethod 171 | def transform_range(value, old_min, old_max, new_min, new_max): 172 | """ 173 | Transform a value from one range to another. 174 | 175 | Parameters: 176 | value (float): The value to transform. 177 | old_min (float): The minimum value of the old range. 178 | old_max (float): The maximum value of the old range. 179 | new_min (float): The minimum value of the new range. 180 | new_max (float): The maximum value of the new range. 181 | 182 | Returns: 183 | float: The transformed value. 
184 | """ 185 | # Compute the scale factor between the old and new ranges 186 | scale = (new_max - new_min) / (old_max - old_min) 187 | # Apply the transformation 188 | return new_min + (value - old_min) * scale 189 | 190 | @staticmethod 191 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360): 192 | # Normalize angle to be within -179 to 179 degrees 193 | while angle <= low_angle: 194 | angle += original_one_round 195 | while angle > high_angle: 196 | angle -= original_one_round 197 | return angle 198 | 199 | def apply_action(self, action: np.ndarray) -> np.ndarray: 200 | 201 | noise = np.random.normal(0, self.noise, size=4) 202 | action += noise 203 | 204 | lf_value, lb_value, rf_value, rb_value = action 205 | # transform action range for motors 206 | lf_action = self.transform_range( 207 | lf_value, -1, 1, self.low_action_angle, self.high_action_angle 208 | ) 209 | lb_action = self.transform_range( 210 | lb_value, -1, 1, self.low_action_angle, self.high_action_angle 211 | ) 212 | rf_actopm = self.transform_range( 213 | rf_value, -1, 1, self.low_action_angle, self.high_action_angle 214 | ) 215 | rb_action = self.transform_range( 216 | rb_value, -1, 1, self.low_action_angle, self.high_action_angle 217 | ) 218 | 219 | ( 220 | lf_angle, 221 | rf_angle, 222 | lb_angle, 223 | rb_angle, 224 | ) = self.current_leg_angles.squeeze() 225 | 226 | new_lf_angle = self.normalize_angle(lf_angle + lf_action) 227 | new_lb_angle = self.normalize_angle(lb_angle + lb_action) 228 | new_rf_angle = self.normalize_angle(rf_angle + rf_actopm) 229 | new_rb_angle = self.normalize_angle(rb_angle + rb_action) 230 | 231 | self.current_leg_angles = np.array( 232 | [ 233 | [ 234 | new_lf_angle, 235 | new_rf_angle, 236 | new_lb_angle, 237 | new_rb_angle, 238 | ] 239 | ], 240 | dtype=np.float32, 241 | ) 242 | return self.current_leg_angles 243 | 244 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 245 | """ """ 246 | # Send action to hub to receive next state 247 | action = tensordict.get("action").cpu().numpy().squeeze() 248 | 249 | # receive the next state 250 | next_observation = self.apply_action(action) 251 | 252 | # add zeros for pitch, roll and acc_x 253 | next_observation = np.concatenate( 254 | (next_observation, np.zeros((1, 3))), axis=1, dtype=np.float32 255 | ) 256 | 257 | # calc reward and done 258 | reward, done = self.reward( 259 | action=action, 260 | next_state=next_observation, 261 | ) 262 | next_tensordict = TensorDict( 263 | { 264 | self.observation_key: next_observation, 265 | "reward": torch.tensor([reward]).float(), 266 | "done": torch.tensor([done]).bool(), 267 | }, 268 | batch_size=[1], 269 | ) 270 | 271 | # increment episode step counter 272 | self.episode_step_iter += 1 273 | if self.episode_step_iter >= self.max_episode_steps: 274 | next_tensordict.set("done", torch.tensor([True])) 275 | return next_tensordict 276 | 277 | 278 | def angular_difference(angle1, angle2): 279 | # Calculate the difference in angles, wrapped between -180 and 180 280 | difference = (angle2 - angle1 + 180) % 360 - 180 281 | return abs(difference) # Return the absolute value of the difference 282 | -------------------------------------------------------------------------------- /bricksrl/environments/walker_v0/client.py: -------------------------------------------------------------------------------- 1 | # NOTE: Run this program with the latest 2 | # firmware provided via https://beta.pybricks.com/ 3 | 4 | import umath 5 | import ustruct 6 | from micropython import 
kbd_intr 7 | from pybricks.hubs import InventorHub 8 | from pybricks.parameters import Axis, Direction, Port 9 | from pybricks.pupdevices import Motor, UltrasonicSensor 10 | from pybricks.tools import wait 11 | from uselect import poll 12 | 13 | # Standard MicroPython modules 14 | from usys import stdin, stdout 15 | 16 | kbd_intr(-1) 17 | hub = InventorHub() 18 | 19 | # Initialize and set the motors 20 | lf_motor = Motor(Port.D, Direction.COUNTERCLOCKWISE) 21 | lb_motor = Motor(Port.B, Direction.COUNTERCLOCKWISE) 22 | rf_motor = Motor(Port.C) 23 | rb_motor = Motor(Port.A) 24 | 25 | # Init additional sensor 26 | eyes = UltrasonicSensor(Port.E) 27 | 28 | # Setup poll 29 | keyboard = poll() 30 | keyboard.register(stdin) 31 | 32 | 33 | def normalize_angle(angle): 34 | # Normalize angle to be within -179 to 179 degrees 35 | while angle <= -180: 36 | angle += 360 37 | while angle > 179: 38 | angle -= 360 39 | return angle 40 | 41 | 42 | def transform_range(value, old_min, old_max, new_min, new_max): 43 | """ 44 | Transform a value from one range to another. 45 | 46 | Parameters: 47 | value (float): The value to transform. 48 | old_min (float): The minimum value of the old range. 49 | old_max (float): The maximum value of the old range. 50 | new_min (float): The minimum value of the new range. 51 | new_max (float): The maximum value of the new range. 52 | 53 | Returns: 54 | float: The transformed value. 55 | """ 56 | # Compute the scale factor between the old and new ranges 57 | scale = (new_max - new_min) / (old_max - old_min) 58 | # Apply the transformation 59 | return new_min + (value - old_min) * scale 60 | 61 | 62 | # Setting default values and ranges 63 | low_angle = -100 # 270 64 | high_angle = 0 65 | speed = 600 66 | 67 | while True: 68 | 69 | while not keyboard.poll(0): 70 | wait(1) 71 | 72 | # Read action values for the motors 73 | data = stdin.buffer.read(16) # Reading 16 bytes (4 floats) 74 | lf_value, lb_value, rf_value, rb_value = ustruct.unpack("!ffff", data) 75 | 76 | # Apply actions. Motor angle range is [-180, 179] action outputs are [-1, 1] we transform the actions first. 
77 | lb_motor.run_angle( 78 | speed=speed, 79 | rotation_angle=transform_range(lb_value, -1, 1, low_angle, high_angle), 80 | wait=False, 81 | ) 82 | lf_motor.run_angle( 83 | speed=speed, 84 | rotation_angle=transform_range(lf_value, -1, 1, low_angle, high_angle), 85 | wait=False, 86 | ) 87 | rb_motor.run_angle( 88 | speed=speed, 89 | rotation_angle=transform_range(rb_value, -1, 1, low_angle, high_angle), 90 | wait=False, 91 | ) 92 | rf_motor.run_angle( 93 | speed=speed, 94 | rotation_angle=transform_range(rf_value, -1, 1, low_angle, high_angle), 95 | wait=False, 96 | ) 97 | 98 | # Small delay to let motors arrive target angle 99 | wait(250) # 250 100 | 101 | # Read sensors to get current state of the robot 102 | a_x = hub.imu.acceleration(Axis.X) 103 | (lf_angle, rf_angle) = (lf_motor.angle(), rf_motor.angle()) 104 | (lb_angle, rb_angle) = (lb_motor.angle(), rb_motor.angle()) 105 | (pitch, roll) = hub.imu.tilt() 106 | dist = eyes.distance() 107 | 108 | if umath.fabs(pitch) > 90 or umath.fabs(roll) > 120 or dist <= 40: 109 | hub.display.text(text="Help", on=500, off=50) 110 | 111 | # Send current state back to environment 112 | out_msg = ustruct.pack( 113 | "!fffffff", 114 | normalize_angle(lf_angle), 115 | normalize_angle(rf_angle), 116 | normalize_angle(lb_angle), 117 | normalize_angle(rb_angle), 118 | pitch, 119 | roll, 120 | a_x, 121 | ) 122 | stdout.buffer.write(out_msg) 123 | -------------------------------------------------------------------------------- /conf/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Details for BricksRL Experiments 2 | 3 | ## Overview 4 | This directory contains all the necessary configuration files to tailor your experiments using BricksRL. Configurations are managed using [Hydra](https://hydra.cc/), a powerful tool for configuring complex applications that allows for easy modification of parameters directly from the command line. 5 | 6 | ## Configuration Files 7 | - **config.yaml**: The base configuration for all experiments including what agent and environment to run. 8 | - **env/**: Contains environment-specific configurations. 9 | - **runaway-v0.yaml**: Settings for the *RunAway-v0* environment for the 2wheeler robot. 10 | - **spinning-v0.yaml**: Settings for the *Spinning-v0* environment for the 2wheeler robot. 11 | - **walker-v0.yaml**: Settings for the *Walker-v0* environment for the walker robot. 12 | - **walker_sim-v0.yaml**: Settings for the *WalkerSim-v0* environment for the walker robot. 13 | - **roboarm-v0.yaml**: Settings for the *RoboArm-v0* environment for the roboarm robot. 14 | - **roboarm_sim-v0.yaml**: Settings for the *RoboArmSim-v0* environment for the roboarm robot. 15 | - **roboarm_mixed-v0.yaml**: Settings for the *RoboArmMixed-v0* environment for the roboarm robot. 16 | - **agent/**: Contains agent-specific configurations. 17 | - **sac.yaml**: Configuration for the SAC agent. 18 | - **td3.yaml**: Configuration for the TD3 agent. 19 | - **droq.yaml**: Configuration for the DroQ agent. 20 | 21 | ## Using Hydra for Configuration Overrides 22 | Hydra allows you to override any configuration parameter directly from the terminal when you run your experiments. This makes it easy to test different configurations without altering your configuration files. 
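Hydra can also sweep over several parameter values in one call via its multirun mode; a minimal sketch, assuming the same walker training entry point and override keys shown in the examples below:

```bash
# Launch one run per agent/batch-size combination (2 x 2 = 4 runs) using Hydra's --multirun flag.
python experiments/walker/train.py --multirun agent=sac,td3 agent.batch_size=32,256
```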
23 | 24 | ### Example Usage 25 | To run an experiment with the walker environment using the SAC agent and specify the number of episodes directly from the command line, you can use the following command: 26 | 27 | ```bash 28 | python experiments/walker/train.py episodes=200 agent=sac 29 | ``` 30 | This command temporarily overrides the episodes and agent parameters for this specific run without needing to change the configuration files. 31 | 32 | You can further override agent or environment specific parameter like: 33 | 34 | ```bash 35 | python experiments/walker/train.py agent=sac agent.batch_size=32 36 | ``` 37 | 38 | or 39 | 40 | ```bash 41 | python experiments/walker/train.py env.max_episode_steps=200 env.frame_stack=4 42 | ``` -------------------------------------------------------------------------------- /conf/agent/bc.yaml: -------------------------------------------------------------------------------- 1 | name: bc 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 0 6 | 7 | 8 | policy_type: deterministic # stochastic or deterministic 9 | num_cells: 256 10 | dropout: 0.01 11 | normalization: LayerNorm 12 | -------------------------------------------------------------------------------- /conf/agent/cql.yaml: -------------------------------------------------------------------------------- 1 | name: cql 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | bc_steps: 1000 8 | 9 | # CQL specific 10 | num_cells: 256 11 | gamma: 0.99 12 | soft_update_eps: 0.995 13 | loss_function: l2 14 | temperature: 1.0 15 | min_q_weight: 1.0 16 | max_q_backup: False 17 | deterministic_backup: False 18 | num_random: 10 19 | with_lagrange: True 20 | lagrange_thresh: 5.0 # tau 21 | 22 | normalization: None 23 | dropout: 0.0 24 | 25 | prb: 0 26 | buffer_size: 1000000 27 | pretrain: False 28 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/droq.yaml: -------------------------------------------------------------------------------- 1 | name: sac 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 20 5 | prefill_episodes: 10 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | alpha_init: 1 11 | fixed_alpha: False 12 | loss_function: l2 13 | 14 | normalization: LayerNorm 15 | dropout: 0.01 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | reset_params: False 20 | -------------------------------------------------------------------------------- /conf/agent/iql.yaml: -------------------------------------------------------------------------------- 1 | name: iql 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 0 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | loss_function: l2 11 | temperature: 1.0 12 | expectile: 0.5 13 | 14 | normalization: None 15 | dropout: 0.0 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | pretrain: False 20 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/random.yaml: -------------------------------------------------------------------------------- 1 | name: random 2 | # not used for random agent 3 | batch_size: 256 4 | num_updates: 2500 5 | prefill_episodes: 0 -------------------------------------------------------------------------------- /conf/agent/sac.yaml: -------------------------------------------------------------------------------- 1 | name: sac 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | 
num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | alpha_init: 1 11 | fixed_alpha: False 12 | loss_function: l2 13 | 14 | normalization: None 15 | dropout: 0.0 16 | 17 | prb: 0 18 | buffer_size: 1000000 19 | reset_params: False -------------------------------------------------------------------------------- /conf/agent/td3.yaml: -------------------------------------------------------------------------------- 1 | name: td3 2 | lr: 3e-4 3 | batch_size: 256 4 | num_updates: 1 5 | prefill_episodes: 10 6 | 7 | num_cells: 256 8 | gamma: 0.99 9 | soft_update_eps: 0.995 10 | loss_function: smooth_l1 11 | exploration_noise: 0.1 # 0.01 12 | 13 | normalization: None 14 | dropout: 0.0 15 | 16 | prb: 0 17 | buffer_size: 1000000 18 | reset_params: False 19 | use_bc: False 20 | alpha: 1.0 -------------------------------------------------------------------------------- /conf/config.yaml: -------------------------------------------------------------------------------- 1 | # Base Config to run all examples 2 | 3 | run_name: "" 4 | verbose: 0 5 | 6 | device: "cuda" 7 | episodes: 250 8 | 9 | defaults: 10 | - _self_ 11 | # random, sac, td3, droq 12 | - agent: sac 13 | - env: walker_sim-v0 -------------------------------------------------------------------------------- /conf/env/roboarm-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | reward_signal: dense 10 | -------------------------------------------------------------------------------- /conf/env/roboarm_mixed-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm_mixed-v0" 2 | max_episode_steps: 30 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | reward_signal: dense 10 | camera_id: 2 11 | goal_radius: 25 12 | -------------------------------------------------------------------------------- /conf/env/roboarm_sim-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "roboarm_sim-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | noise: 0.05 9 | reward_signal: dense 10 | -------------------------------------------------------------------------------- /conf/env/runaway-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "runaway-v0" 2 | max_episode_steps: 20 3 | # env specific params 4 | min_distance: 40. 
5 | verbose: 0 6 | # env wrapper 7 | frame_stack: 1 8 | action_filter: 1 9 | 10 | -------------------------------------------------------------------------------- /conf/env/spinning-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "spinning-v0" 2 | max_episode_steps: 50 3 | verbose: 0 4 | # env wrapper 5 | frame_stack: 1 6 | action_filter: 1 7 | sleep_time: 0.0 8 | 9 | -------------------------------------------------------------------------------- /conf/env/walker-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "walker-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | verbose: 0 5 | # env wrapper 6 | frame_stack: 1 7 | action_filter: 1 8 | sleep_time: 0.0 9 | 10 | -------------------------------------------------------------------------------- /conf/env/walker_sim-v0.yaml: -------------------------------------------------------------------------------- 1 | name: "walker_sim-v0" 2 | max_episode_steps: 100 3 | # env specific params 4 | noise: 0.1 5 | low_action_angle: -100 6 | high_action_angle: 0 7 | verbose: 0 8 | # env wrapper 9 | frame_stack: 1 10 | action_filter: 1 11 | 12 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | ## TorchRL SOTA Example 4 | 5 | In the [torchrl_sac](./torchrl_sac/) folder you will find a training script to train LEGO robots with Bricksrl similar to the state-of-the-art implementations in [TorchRL](https://github.com/pytorch/rl/tree/main/sota-implementations). This allows you to basically plug-and-play with any TorchRL sota-implementation or do custom adaptations. 6 | 7 | [Example results](https://wandb.ai/sebastian-dittert/bricksrl_torchrl_sac_example?nw=nwusersebastiandittert) 8 | 9 | ### TorchRL sota-example diff 10 | 11 | Only change made to the TorchRL sota-implementations is the make_env function: 12 | 13 | ``` 14 | # Environment import from BricksRL 15 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0 16 | 17 | # ==================================================================== 18 | # Make BricksRL Environment 19 | # ----------------- 20 | 21 | 22 | def env_maker(cfg, device="cpu", from_pixels=False): 23 | # We use the WalkerEnvSim_v0 environment from BricksRL as an example 24 | # as it is easy to test as it does not require a robot at hand or to connect to the hub. 25 | # Users can replace this with any other environment from BricksRL or custom environments. 
26 | env = WalkerEnvSim_v0(max_episode_steps=cfg.env.max_episode_steps) 27 | observation_keys = [key for key in env.observation_spec.keys()] 28 | 29 | transforms = [] 30 | if cfg.env.frame_stack > 1: 31 | transforms.append( 32 | CatFrames( 33 | N=cfg.env.frame_stack, 34 | in_keys=observation_keys, 35 | out_key=observation_keys, 36 | ) 37 | ) 38 | normalize_keys = [key for key in observation_keys if key != "pixels"] 39 | obs_ranges = np.array(list(env.observation_ranges.values())) 40 | obs_mean = obs_ranges.mean(axis=-1) 41 | obs_std = obs_ranges.std(axis=-1) 42 | transforms.append( 43 | ObservationNorm( 44 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True 45 | ) 46 | ) 47 | transforms.append(DeviceCastTransform(device)) 48 | return TransformedEnv(env, Compose(*transforms)) 49 | 50 | ``` 51 | 52 | 53 | 54 | 55 | ## Custom Environment 56 | A template to create your own custom environments can be found [here](./custom_env.py). With an environment created like this, you can update the [TorchRL example](./torchrl_sac) to train your own TorchRL agent on your custom environment. 57 | 58 | > **Attention!** For each custom environment, you need a custom client script that must be loaded on the HUB! 59 | 60 | ## High Level Examples 61 | In the [example notebook](./example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** environment. 62 | The examples are based on the experiments for our paper. Standalone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](./torchrl_sac). -------------------------------------------------------------------------------- /examples/custom_env.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import numpy as np 4 | 5 | import torch 6 | 7 | from bricksrl.environments.base.base_env import BaseEnv 8 | from tensordict import TensorDict, TensorDictBase 9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec 10 | 11 | 12 | class CustomEnv(BaseEnv): 13 | """ 14 | Environment template for creating your own custom environment for BricksRL. 15 | 16 | Args: 17 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 50. 18 | verbose (bool): Whether to print additional information. Defaults to False. 19 | 20 | """ 21 | 22 | def __init__( 23 | self, 24 | max_episode_steps: int = 50, 25 | verbose: bool = False, 26 | ): 27 | self._batch_size = torch.Size([1]) 28 | self.max_episode_steps = max_episode_steps 29 | 30 | # Define action spec 31 | self.action_spec = BoundedTensorSpec( 32 | low=-1, 33 | high=1, 34 | shape=(1, self.action_dim), 35 | ) 36 | 37 | # Define observation spec 38 | 39 | observation_spec = BoundedTensorSpec( 40 | low=-1, 41 | high=1, 42 | shape=(1, self.state_dim), 43 | ) 44 | 45 | self.observation_spec = CompositeSpec( 46 | {self.observation_key: observation_spec}, shape=(1,) 47 | ) 48 | super().__init__( 49 | action_dim=self.action_dim, 50 | state_dim=self.state_dim, 51 | verbose=verbose, 52 | ) 53 | 54 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: 55 | """ 56 | Reset the environment and return the initial state. 57 | 58 | Returns: 59 | TensorDictBase: The initial state of the environment. 
60 | """ 61 | # TODO solve this fake action sending before to receive first state 62 | self.episode_step_iter = 0 63 | if tensordict is not None: 64 | action = tensordict.get("action").cpu().numpy().squeeze() 65 | else: 66 | action = np.zeros(self.action_dim) 67 | self.send_to_hub(action) 68 | # Get current observation 69 | observation = self.read_from_hub() 70 | 71 | return TensorDict( 72 | { 73 | self.observation_key: torch.tensor(observation, dtype=torch.float32), 74 | }, 75 | batch_size=[1], 76 | ) 77 | 78 | def reward( 79 | self, 80 | action: np.ndarray, 81 | next_state: np.ndarray, 82 | ) -> Tuple[float, bool]: 83 | """Your custom reward function""" 84 | return 1.0, False 85 | 86 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase: 87 | """Custom step function""" 88 | # Send action to hub to receive next state 89 | action = tensordict.get("action").cpu().numpy().squeeze() 90 | self.send_to_hub(action) 91 | # receive the next state 92 | next_observation = self.read_from_hub() 93 | 94 | # calc reward and done 95 | reward, done = self.reward( 96 | action=action, 97 | next_state=next_observation, 98 | ) 99 | next_tensordict = TensorDict( 100 | { 101 | self.observation_key: torch.tensor( 102 | next_observation, dtype=torch.float32 103 | ), 104 | "reward": torch.tensor([reward]).float(), 105 | "done": torch.tensor([done]).bool(), 106 | }, 107 | batch_size=[1], 108 | ) 109 | 110 | # increment episode step counter 111 | self.episode_step_iter += 1 112 | if self.episode_step_iter >= self.max_episode_steps: 113 | next_tensordict.set("done", torch.tensor([True])) 114 | return next_tensordict 115 | -------------------------------------------------------------------------------- /examples/torchrl_sac/config.yaml: -------------------------------------------------------------------------------- 1 | # environment and task 2 | env: 3 | max_episode_steps: 100 4 | seed: 41 5 | frame_stack: 1 6 | 7 | # collector 8 | collector: 9 | total_frames: 10_000 10 | init_random_frames: 1000 11 | frames_per_batch: 1000 12 | init_env_steps: 1000 13 | device: cpu 14 | env_per_collector: 1 15 | reset_at_each_iter: False 16 | 17 | # replay buffer 18 | replay_buffer: 19 | size: 1000000 20 | prb: 0 # use prioritized experience replay 21 | scratch_dir: null 22 | 23 | # optim 24 | optim: 25 | utd_ratio: 1.0 26 | gamma: 0.99 27 | loss_function: l2 28 | lr: 3.0e-4 29 | weight_decay: 0.0 30 | batch_size: 256 31 | target_update_polyak: 0.995 32 | alpha_init: 1.0 33 | adam_eps: 1.0e-8 34 | 35 | # network 36 | network: 37 | hidden_sizes: [256, 256] 38 | activation: relu 39 | default_policy_scale: 1.0 40 | scale_lb: 0.1 41 | device: 42 | 43 | # logging 44 | logger: 45 | backend: wandb 46 | project_name: bricksrl_torchrl_sac_example 47 | group_name: null 48 | exp_name: Walkersim-v0_SAC 49 | mode: online 50 | eval_iter: 1000 51 | video: False -------------------------------------------------------------------------------- /examples/torchrl_sac/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | """SAC Example. 6 | 7 | This is a simple self-contained example of a SAC training script. 8 | 9 | It supports state environments like MuJoCo. 10 | 11 | The helper functions are coded in the utils.py associated with this script. 
12 | """ 13 | import time 14 | 15 | import hydra 16 | 17 | import numpy as np 18 | import torch 19 | import torch.cuda 20 | import tqdm 21 | from tensordict import TensorDict 22 | from torchrl._utils import logger as torchrl_logger 23 | from torchrl.envs.utils import ExplorationType, set_exploration_type 24 | 25 | from torchrl.record.loggers import generate_exp_name, get_logger 26 | from utils import ( 27 | dump_video, 28 | log_metrics, 29 | make_collector, 30 | make_environment, 31 | make_loss_module, 32 | make_replay_buffer, 33 | make_sac_agent, 34 | make_sac_optimizer, 35 | ) 36 | 37 | 38 | @hydra.main(version_base="1.1", config_path="", config_name="config") 39 | def main(cfg: "DictConfig"): # noqa: F821 40 | device = cfg.network.device 41 | if device in ("", None): 42 | if torch.cuda.is_available(): 43 | device = torch.device("cuda:0") 44 | else: 45 | device = torch.device("cpu") 46 | device = torch.device(device) 47 | 48 | # Create logger 49 | exp_name = generate_exp_name("SAC", cfg.logger.exp_name) 50 | logger = None 51 | if cfg.logger.backend: 52 | logger = get_logger( 53 | logger_type=cfg.logger.backend, 54 | logger_name="sac_logging", 55 | experiment_name=exp_name, 56 | wandb_kwargs={ 57 | "mode": cfg.logger.mode, 58 | "config": dict(cfg), 59 | "project": cfg.logger.project_name, 60 | "group": cfg.logger.group_name, 61 | }, 62 | ) 63 | 64 | torch.manual_seed(cfg.env.seed) 65 | np.random.seed(cfg.env.seed) 66 | 67 | # Create environments 68 | train_env, eval_env = make_environment(cfg, logger=logger) 69 | 70 | # Create agent 71 | model, exploration_policy = make_sac_agent(cfg, train_env, eval_env, device) 72 | 73 | # Create SAC loss 74 | loss_module, target_net_updater = make_loss_module(cfg, model) 75 | 76 | # Create off-policy collector 77 | collector = make_collector(cfg, train_env, exploration_policy) 78 | 79 | # Create replay buffer 80 | replay_buffer = make_replay_buffer( 81 | batch_size=cfg.optim.batch_size, 82 | prb=cfg.replay_buffer.prb, 83 | buffer_size=cfg.replay_buffer.size, 84 | scratch_dir=cfg.replay_buffer.scratch_dir, 85 | device="cpu", 86 | ) 87 | 88 | # Create optimizers 89 | ( 90 | optimizer_actor, 91 | optimizer_critic, 92 | optimizer_alpha, 93 | ) = make_sac_optimizer(cfg, loss_module) 94 | 95 | # Main loop 96 | start_time = time.time() 97 | collected_frames = 0 98 | pbar = tqdm.tqdm(total=cfg.collector.total_frames) 99 | 100 | init_random_frames = cfg.collector.init_random_frames 101 | num_updates = int( 102 | cfg.collector.env_per_collector 103 | * cfg.collector.frames_per_batch 104 | * cfg.optim.utd_ratio 105 | ) 106 | prb = cfg.replay_buffer.prb 107 | eval_iter = cfg.logger.eval_iter 108 | frames_per_batch = cfg.collector.frames_per_batch 109 | eval_rollout_steps = cfg.env.max_episode_steps 110 | 111 | sampling_start = time.time() 112 | for i, tensordict in enumerate(collector): 113 | sampling_time = time.time() - sampling_start 114 | 115 | # Update weights of the inference policy 116 | collector.update_policy_weights_() 117 | 118 | pbar.update(tensordict.numel()) 119 | 120 | tensordict = tensordict.reshape(-1) 121 | current_frames = tensordict.numel() 122 | # Add to replay buffer 123 | replay_buffer.extend(tensordict.cpu()) 124 | collected_frames += current_frames 125 | 126 | # Optimization steps 127 | training_start = time.time() 128 | if collected_frames >= init_random_frames: 129 | losses = TensorDict({}, batch_size=[num_updates]) 130 | for i in range(num_updates): 131 | # Sample from replay buffer 132 | sampled_tensordict = replay_buffer.sample() 133 
| if sampled_tensordict.device != device: 134 | sampled_tensordict = sampled_tensordict.to( 135 | device, non_blocking=True 136 | ) 137 | else: 138 | sampled_tensordict = sampled_tensordict.clone() 139 | 140 | # Compute loss 141 | loss_td = loss_module(sampled_tensordict) 142 | 143 | actor_loss = loss_td["loss_actor"] 144 | q_loss = loss_td["loss_qvalue"] 145 | alpha_loss = loss_td["loss_alpha"] 146 | 147 | # Update actor 148 | optimizer_actor.zero_grad() 149 | actor_loss.backward() 150 | optimizer_actor.step() 151 | 152 | # Update critic 153 | optimizer_critic.zero_grad() 154 | q_loss.backward() 155 | optimizer_critic.step() 156 | 157 | # Update alpha 158 | optimizer_alpha.zero_grad() 159 | alpha_loss.backward() 160 | optimizer_alpha.step() 161 | 162 | losses[i] = loss_td.select( 163 | "loss_actor", "loss_qvalue", "loss_alpha" 164 | ).detach() 165 | 166 | # Update qnet_target params 167 | target_net_updater.step() 168 | 169 | # Update priority 170 | if prb: 171 | replay_buffer.update_priority(sampled_tensordict) 172 | 173 | training_time = time.time() - training_start 174 | episode_end = ( 175 | tensordict["next", "done"] 176 | if tensordict["next", "done"].any() 177 | else tensordict["next", "truncated"] 178 | ) 179 | episode_rewards = tensordict["next", "episode_reward"][episode_end] 180 | 181 | # Logging 182 | metrics_to_log = {} 183 | if len(episode_rewards) > 0: 184 | episode_length = tensordict["next", "step_count"][episode_end] 185 | metrics_to_log["train/reward"] = episode_rewards.mean().item() 186 | metrics_to_log["train/episode_length"] = episode_length.sum().item() / len( 187 | episode_length 188 | ) 189 | if collected_frames >= init_random_frames: 190 | metrics_to_log["train/q_loss"] = losses.get("loss_qvalue").mean().item() 191 | metrics_to_log["train/actor_loss"] = losses.get("loss_actor").mean().item() 192 | metrics_to_log["train/alpha_loss"] = losses.get("loss_alpha").mean().item() 193 | metrics_to_log["train/alpha"] = loss_td["alpha"].item() 194 | metrics_to_log["train/entropy"] = loss_td["entropy"].item() 195 | metrics_to_log["train/sampling_time"] = sampling_time 196 | metrics_to_log["train/training_time"] = training_time 197 | 198 | # Evaluation 199 | if abs(collected_frames % eval_iter) < frames_per_batch: 200 | with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad(): 201 | eval_start = time.time() 202 | eval_rollout = eval_env.rollout( 203 | eval_rollout_steps, 204 | model[0], 205 | auto_cast_to_device=True, 206 | break_when_any_done=True, 207 | ) 208 | eval_env.apply(dump_video) 209 | eval_time = time.time() - eval_start 210 | eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item() 211 | metrics_to_log["eval/reward"] = eval_reward 212 | metrics_to_log["eval/time"] = eval_time 213 | if logger is not None: 214 | log_metrics(logger, metrics_to_log, collected_frames) 215 | sampling_start = time.time() 216 | 217 | collector.shutdown() 218 | if not eval_env.is_closed: 219 | eval_env.close() 220 | if not train_env.is_closed: 221 | train_env.close() 222 | end_time = time.time() 223 | execution_time = end_time - start_time 224 | torchrl_logger.info(f"Training took {execution_time:.2f} seconds to finish") 225 | 226 | 227 | if __name__ == "__main__": 228 | main() 229 | -------------------------------------------------------------------------------- /experiments/2wheeler/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 
| import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import login, logout, setup_check 20 | 21 | 22 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 23 | def run(cfg: DictConfig) -> None: 24 | print(OmegaConf.to_yaml(cfg)) 25 | 26 | # make environment. 27 | setup_check(robot="2wheeler", config=cfg) 28 | env, action_space, state_space = make_env(cfg) 29 | 30 | # make agent 31 | agent, project_name = get_agent(action_space, state_space, cfg) 32 | login(agent) 33 | agent.eval() 34 | 35 | # initialize wandb 36 | wandb.init(project=project_name + "_eval") 37 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 38 | 39 | eval_episodes = cfg.episodes 40 | quit = False 41 | _ = input("Press Enter to start evaluation...") 42 | try: 43 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 44 | td = env.reset() 45 | done = td.get("done", False) 46 | truncated = td.get("truncated", False) 47 | ep_return = 0 48 | ep_steps = 0 49 | total_step_times = [] 50 | actions = [] 51 | print("Start new evaluation...", flush=True) 52 | while not done and not truncated: 53 | ep_steps += 1 54 | step_start_time = time.time() 55 | td = agent.get_eval_action(td) 56 | actions.append(td.get("action").cpu().numpy()) 57 | td = env.step(td) 58 | agent.add_experience(td) 59 | total_agent_step_time = time.time() - step_start_time 60 | total_step_times.append(total_agent_step_time) 61 | done = td.get(("next", "done"), False) 62 | ep_return += td.get(("next", "reward"), 0) 63 | if done: 64 | if cfg.env.name == "runaway-v0": 65 | inpt = input( 66 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 67 | ) 68 | if inpt == "q": 69 | quit = True 70 | break 71 | td = step_mdp(td) 72 | 73 | if quit: 74 | break 75 | 76 | # Metrics Logging 77 | log_dict = { 78 | "epoch": e, 79 | "reward": ep_return, 80 | "steps": ep_steps, 81 | "total_step_time": np.mean(total_step_times), 82 | "buffer_size": agent.replay_buffer.__len__(), 83 | "done": done.float(), 84 | "mean_action": np.mean(actions), 85 | } 86 | if cfg.env.name == "runaway-v0": 87 | log_dict.update({"distance": td.get("distance")}) 88 | 89 | wandb.log(log_dict) 90 | 91 | except KeyboardInterrupt: 92 | print("Evaluation interrupted by user.") 93 | logout(agent) 94 | env.close() 95 | 96 | 97 | if __name__ == "__main__": 98 | run() 99 | -------------------------------------------------------------------------------- /experiments/2wheeler/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, 
logout, setup_check, tensordict2dict 17 | 18 | 19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="2wheeler", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/2wheeler/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | login, 21 | logout, 22 | prefill_buffer, 23 | setup_check, 24 | tensordict2dict, 25 | ) 26 | 27 | 28 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 29 | def run(cfg: DictConfig) -> None: 30 | print(OmegaConf.to_yaml(cfg)) 31 | 32 | # make environment. 
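# setup_check() asserts that the configured env (cfg.env.name) belongs to the 2wheeler environments, and make_env(cfg) returns the environment together with its action and state specs, which are passed to get_agent() below to build the agent.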
33 | setup_check(robot="2wheeler", config=cfg) 34 | env, action_space, state_space = make_env(cfg) 35 | 36 | # make agent 37 | agent, project_name = get_agent(action_space, state_space, cfg) 38 | login(agent) 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name) 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 44 | 45 | # prefill buffer with random actions 46 | prefill_buffer( 47 | env=env, 48 | agent=agent, 49 | num_episodes=cfg.agent.prefill_episodes, 50 | stop_on_done=True if cfg.env.name == "runaway-v0" else False, 51 | ) 52 | 53 | batch_size = cfg.agent.batch_size 54 | num_updates = cfg.agent.num_updates 55 | train_episodes = cfg.episodes 56 | print("Start training...") 57 | quit = False 58 | try: 59 | for e in tqdm(range(train_episodes), desc="Training"): 60 | td = env.reset() 61 | done = td.get("done", False) 62 | truncated = td.get("truncated", False) 63 | ep_return = 0 64 | ep_steps = 0 65 | total_step_times = [] 66 | actions = [] 67 | print("Start new data collection...", flush=True) 68 | while not done and not truncated: 69 | ep_steps += 1 70 | step_start_time = time.time() 71 | td = agent.get_action(td) 72 | actions.append(td.get("action").cpu().numpy()) 73 | td = env.step(td) 74 | agent.add_experience(td) 75 | total_agent_step_time = time.time() - step_start_time 76 | total_step_times.append(total_agent_step_time) 77 | done = td.get(("next", "done"), False) 78 | ep_return += td.get(("next", "reward"), 0) 79 | if done: 80 | if cfg.env.name == "runaway-v0": 81 | inpt = input( 82 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 83 | ) 84 | if inpt == "q": 85 | quit = True 86 | break 87 | td = step_mdp(td) 88 | loss_info = agent.train( 89 | batch_size=batch_size, num_updates=num_updates * ep_steps 90 | ) 91 | if quit: 92 | break 93 | 94 | # Metrics Logging 95 | log_dict = { 96 | "epoch": e, 97 | "reward": ep_return, 98 | "steps": ep_steps, 99 | "total_step_time": np.mean(total_step_times), 100 | "buffer_size": agent.replay_buffer.__len__(), 101 | "done": done.float(), 102 | "mean_action": np.mean(actions), 103 | } 104 | if cfg.env.name == "runaway-v0": 105 | log_dict.update({"distance": td.get("distance")}) 106 | 107 | log_dict.update(tensordict2dict(loss_info)) 108 | wandb.log(log_dict) 109 | 110 | except KeyboardInterrupt: 111 | print("Training interrupted by user.") 112 | 113 | logout(agent) 114 | env.close() 115 | 116 | 117 | if __name__ == "__main__": 118 | run() 119 | -------------------------------------------------------------------------------- /experiments/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/__init__.py -------------------------------------------------------------------------------- /experiments/helper/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from experiments.helper.agents.behavior_cloning import BehavioralCloningAgent 2 | from experiments.helper.agents.cql import CQLAgent 3 | from experiments.helper.agents.iql import IQLAgent 4 | from experiments.helper.agents.random import RandomAgent 5 | from experiments.helper.agents.sac import SACAgent 6 | from experiments.helper.agents.td3 import TD3Agent 7 | 8 | all_agents = ["td3", "sac", "iql", "cql", "bc", "random"] 9 | 10 | 11 | def 
get_agent(action_spec, state_spec, cfg): 12 | if cfg.agent.name == "td3": 13 | agent = TD3Agent( 14 | action_spec=action_spec, 15 | state_spec=state_spec, 16 | agent_config=cfg.agent, 17 | device=cfg.device, 18 | ) 19 | elif cfg.agent.name == "sac": 20 | agent = SACAgent( 21 | action_spec=action_spec, 22 | state_spec=state_spec, 23 | agent_config=cfg.agent, 24 | device=cfg.device, 25 | ) 26 | elif cfg.agent.name == "bc": 27 | agent = BehavioralCloningAgent( 28 | action_spec=action_spec, 29 | state_spec=state_spec, 30 | agent_config=cfg.agent, 31 | device=cfg.device, 32 | ) 33 | elif cfg.agent.name == "random": 34 | agent = RandomAgent( 35 | action_spec=action_spec, 36 | state_spec=state_spec, 37 | agent_config=cfg.agent, 38 | device=cfg.device, 39 | ) 40 | elif cfg.agent.name == "iql": 41 | agent = IQLAgent( 42 | action_spec=action_spec, 43 | state_spec=state_spec, 44 | agent_config=cfg.agent, 45 | device=cfg.device, 46 | ) 47 | elif cfg.agent.name == "cql": 48 | agent = CQLAgent( 49 | action_spec=action_spec, 50 | state_spec=state_spec, 51 | agent_config=cfg.agent, 52 | device=cfg.device, 53 | ) 54 | else: 55 | raise NotImplementedError( 56 | f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}" 57 | ) 58 | 59 | project_name = f"lego-{cfg.agent.name}-{cfg.env.name}" 60 | print("--- Agent initialized ---", flush=True) 61 | 62 | return agent, project_name 63 | -------------------------------------------------------------------------------- /experiments/helper/agents/base.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.init as init 7 | from tensordict import TensorDictBase 8 | from torchrl.data.tensor_specs import TensorSpec 9 | from torchrl.envs.utils import set_exploration_mode 10 | 11 | 12 | class BaseAgent: 13 | """Implements a base agent used to interact with the lego robots. 14 | 15 | Args: 16 | state_spec (TensorSpec): The state specification of the environment. 17 | action_spec (TensorSpec): The action specification of the environment. 18 | agent_name (str): The name of the agent. 19 | device (str): The device to use for computation. 20 | 21 | Attributes: 22 | name (str): The name of the agent. 23 | observation_spec (TensorSpec): The state specification of the environment. 24 | action_spec (TensorSpec): The action specification of the environment. 25 | device (str): The device to use for computation. 26 | observation_keys (List[str]): The keys used to access the observation in the tensor dictionary. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | state_spec: TensorSpec, 32 | action_spec: TensorSpec, 33 | agent_name: str, 34 | device: str = "cpu", 35 | ): 36 | self.name = agent_name 37 | self.observation_spec = state_spec 38 | self.action_spec = action_spec 39 | self.device = device 40 | self.observation_keys = [key for key in self.observation_spec.keys()] 41 | 42 | def init_nets(self, model: nn.Module): 43 | """Initializes the networks with random data. 44 | 45 | Args: 46 | model (list): A list of PyTorch models to initialize. 
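Example (as used by the SAC agent): `self.init_nets([self.actor, self.critic])` draws a random observation tensordict from the observation spec and passes it once through each network.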
47 | """ 48 | with torch.no_grad(), set_exploration_mode("random"): 49 | td = self.observation_spec.rand() 50 | td = td.to(self.device) 51 | for net in model: 52 | net(td) 53 | del td 54 | 55 | def eval(self): 56 | """Sets the agent to evaluation mode.""" 57 | raise NotImplementedError 58 | 59 | @staticmethod 60 | def reset_parameter(param): 61 | if param.data.ndimension() == 2: # Weights 62 | init.kaiming_uniform_(param.data, a=math.sqrt(5)) 63 | else: # Biases and others 64 | # Adjust based on your specific needs 65 | init.uniform_(param.data, -1, 1) 66 | 67 | def get_action(self, tensordict: TensorDictBase) -> TensorDictBase: 68 | """Returns a sampled action given a tensordict to collect data. 69 | 70 | Args: 71 | tensordict (TensorDictBase): Tensordict containing the current state of the environment. 72 | 73 | Returns: 74 | TensorDictBase: TensorDict containing the sampled action to take in the environment. 75 | """ 76 | raise NotImplementedError 77 | 78 | def get_eval_action(self, tensordict: TensorDictBase) -> TensorDictBase: 79 | """Returns an action given a tensordict to evaluate the agent. 80 | 81 | Args: 82 | tensordict (TensorDictBase): Tensordict containing the current state of the environment. 83 | 84 | Returns: 85 | TensorDictBase: TensorDict containing the eval action to take in the environment. 86 | """ 87 | raise NotImplementedError 88 | 89 | def train( 90 | self, 91 | ): 92 | """Trains the agent. 93 | 94 | Raises: 95 | NotImplementedError: This method must be implemented by a subclass. 96 | """ 97 | raise NotImplementedError 98 | -------------------------------------------------------------------------------- /experiments/helper/agents/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensordict as td 3 | import torch 4 | 5 | from experiments.helper.agents.base import BaseAgent 6 | from experiments.helper.networks.networks import ( 7 | get_deterministic_actor, 8 | get_stochastic_actor, 9 | ) 10 | from tensordict import TensorDictBase 11 | from torch import nn, optim 12 | from torchrl.data import BoundedTensorSpec, TensorDictReplayBuffer 13 | 14 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage 15 | from torchrl.envs import RenameTransform, ToTensorImage 16 | from torchrl.envs.utils import ExplorationType, set_exploration_type 17 | 18 | 19 | def initialize(net, std=0.02): 20 | for p, n in net.named_parameters(): 21 | if "weight" in p: 22 | # nn.init.xavier_uniform_(n) 23 | nn.init.normal_(n, mean=0, std=std) 24 | elif "bias" in p: 25 | nn.init.zeros_(n) 26 | 27 | 28 | class BehavioralCloningAgent(BaseAgent): 29 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 30 | super(BehavioralCloningAgent, self).__init__( 31 | state_spec, action_spec, agent_config.name, device 32 | ) 33 | 34 | if agent_config.policy_type == "deterministic": 35 | self.actor = get_deterministic_actor(state_spec, action_spec, agent_config) 36 | elif agent_config.policy_type == "stochastic": 37 | raise NotImplementedError( 38 | "Stochastic actor training is not implemented yet" 39 | ) 40 | # TODO: Implement stochastic actor training 41 | # self.actor = get_stochastic_actor( 42 | # state_spec, action_spec, agent_config 43 | # ) 44 | else: 45 | raise ValueError( 46 | "policy_type not recognized, choose deterministic or stochastic" 47 | ) 48 | self.actor.to(device) 49 | # initialize networks 50 | self.init_nets([self.actor]) 51 | 52 | self.optimizer = optim.Adam( 53 | 
self.actor.parameters(), lr=agent_config.lr, weight_decay=0.0 54 | ) 55 | 56 | # create replay buffer 57 | self.batch_size = agent_config.batch_size 58 | self.replay_buffer = self.create_replay_buffer() 59 | 60 | # general stats 61 | self.collected_transitions = 0 62 | self.do_pretrain = False 63 | self.episodes = 0 64 | 65 | def get_agent_statedict(self): 66 | """Save agent""" 67 | act_statedict = self.actor.state_dict() 68 | return {"actor": act_statedict} 69 | 70 | def load_model(self, path): 71 | """load model""" 72 | try: 73 | statedict = torch.load(path) 74 | self.actor.load_state_dict(statedict["actor"]) 75 | print("Model loaded") 76 | except: 77 | raise ValueError("Model not loaded") 78 | 79 | def load_replaybuffer(self, path): 80 | """load replay buffer""" 81 | try: 82 | loaded_data = TensorDictBase.load_memmap(path) 83 | self.replay_buffer.extend(loaded_data) 84 | if self.replay_buffer._batch_size != self.batch_size: 85 | Warning( 86 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 87 | ) 88 | self.replay_buffer._batch_size = self.batch_size 89 | print("Replay Buffer loaded") 90 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 91 | except: 92 | raise ValueError("Replay Buffer not loaded") 93 | 94 | def eval(self): 95 | """Sets the agent to evaluation mode.""" 96 | self.actor.eval() 97 | 98 | @torch.no_grad() 99 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 100 | """Get eval action from actor network""" 101 | with set_exploration_type(ExplorationType.MODE): 102 | out_td = self.actor(td.to(self.device)) 103 | return out_td 104 | 105 | def create_replay_buffer( 106 | self, 107 | buffer_size=1000000, 108 | buffer_scratch_dir="./tmp", 109 | device="cpu", 110 | prefetch=3, 111 | ): 112 | """Create replay buffer""" 113 | 114 | replay_buffer = TensorDictReplayBuffer( 115 | pin_memory=False, 116 | prefetch=prefetch, 117 | storage=LazyMemmapStorage( 118 | buffer_size, 119 | scratch_dir=buffer_scratch_dir, 120 | ), 121 | batch_size=self.batch_size, 122 | ) 123 | replay_buffer.append_transform(lambda x: x.to(device)) 124 | # TODO: check if we have image in observation space if so add this transform 125 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 126 | 127 | return replay_buffer 128 | 129 | @torch.no_grad() 130 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 131 | """Get action from actor network""" 132 | with set_exploration_type(ExplorationType.RANDOM): 133 | out_td = self.actor(td.to(self.device)) 134 | return out_td 135 | 136 | def add_experience(self, transition: td.TensorDict): 137 | """Add experience to replay buffer""" 138 | """Add experience to replay buffer""" 139 | self.replay_buffer.extend(transition) 140 | self.collected_transitions += 1 141 | 142 | def train(self, batch_size=64, num_updates=1): 143 | """Train the agent""" 144 | log_data = {} 145 | 146 | for i in range(num_updates): 147 | batch = self.replay_buffer.sample(batch_size).to(self.device) 148 | orig_action = batch.get("action").clone() 149 | 150 | out_dict = self.actor(batch) 151 | loss = torch.mean((out_dict.get("action") - orig_action) ** 2) 152 | self.optimizer.zero_grad() 153 | loss.backward() 154 | self.optimizer.step() 155 | log_data.update({"loss": loss}) 156 | return log_data 157 | -------------------------------------------------------------------------------- /experiments/helper/agents/cql.py: 
-------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor 6 | from tensordict import TensorDictBase 7 | from torch import optim 8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 10 | from torchrl.envs.utils import ExplorationType, set_exploration_type 11 | from torchrl.objectives import SoftUpdate 12 | 13 | from torchrl.objectives.cql import CQLLoss 14 | 15 | 16 | class CQLAgent(BaseAgent): 17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 18 | super(CQLAgent, self).__init__( 19 | state_spec, action_spec, agent_config.name, device 20 | ) 21 | 22 | with_lagrange = agent_config.with_lagrange 23 | 24 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 25 | self.critic = get_critic(state_spec, agent_config) 26 | 27 | self.actor.to(device) 28 | self.critic.to(device) 29 | 30 | # initialize networks 31 | self.init_nets([self.actor, self.critic]) 32 | 33 | # define loss function 34 | self.loss_module = CQLLoss( 35 | actor_network=self.actor, 36 | qvalue_network=self.critic, 37 | loss_function=agent_config.loss_function, 38 | temperature=agent_config.temperature, 39 | min_q_weight=agent_config.min_q_weight, 40 | max_q_backup=agent_config.max_q_backup, 41 | deterministic_backup=agent_config.deterministic_backup, 42 | num_random=agent_config.num_random, 43 | with_lagrange=agent_config.with_lagrange, 44 | lagrange_thresh=agent_config.lagrange_thresh, 45 | ) 46 | # Define Target Network Updater 47 | self.target_net_updater = SoftUpdate( 48 | self.loss_module, eps=agent_config.soft_update_eps 49 | ) 50 | self.target_net_updater.init_() 51 | 52 | # Reset weights 53 | self.reset_params = agent_config.reset_params 54 | 55 | # Define Replay Buffer 56 | self.batch_size = agent_config.batch_size 57 | self.replay_buffer = self.create_replay_buffer( 58 | prb=agent_config.prb, 59 | buffer_size=agent_config.buffer_size, 60 | device=device, 61 | ) 62 | 63 | # Define Optimizer 64 | critic_params = list( 65 | self.loss_module.qvalue_network_params.flatten_keys().values() 66 | ) 67 | actor_params = list( 68 | self.loss_module.actor_network_params.flatten_keys().values() 69 | ) 70 | self.optimizer_actor = optim.Adam( 71 | actor_params, lr=agent_config.lr, weight_decay=0.0 72 | ) 73 | self.optimizer_critic = optim.Adam( 74 | critic_params, lr=agent_config.lr, weight_decay=0.0 75 | ) 76 | self.optimizer_alpha = optim.Adam( 77 | [self.loss_module.log_alpha], 78 | lr=3.0e-4, 79 | ) 80 | if with_lagrange: 81 | self.alpha_prime_optim = torch.optim.Adam( 82 | [self.loss_module.log_alpha_prime], 83 | lr=agent_config.lr, 84 | ) 85 | else: 86 | self.alpha_prime_optim = None 87 | # general stats 88 | self.collected_transitions = 0 89 | self.total_updates = 0 90 | self.do_pretrain = agent_config.pretrain 91 | self.bc_steps = agent_config.bc_steps 92 | 93 | def get_agent_statedict(self): 94 | """Save agent""" 95 | act_statedict = self.actor.state_dict() 96 | critic_statedict = self.critic.state_dict() 97 | return {"actor": act_statedict, "critic": critic_statedict} 98 | 99 | def load_model(self, path): 100 | """load model""" 101 | try: 102 | statedict = torch.load(path) 103 | self.actor.load_state_dict(statedict["actor"]) 104 | 
self.critic.load_state_dict(statedict["critic"]) 105 | print("Model loaded") 106 | except: 107 | raise ValueError("Model not loaded") 108 | 109 | def load_replaybuffer(self, path): 110 | """load replay buffer""" 111 | try: 112 | loaded_data = TensorDictBase.load_memmap(path) 113 | self.replay_buffer.extend(loaded_data) 114 | if self.replay_buffer._batch_size != self.batch_size: 115 | Warning( 116 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 117 | ) 118 | self.replay_buffer._batch_size = self.batch_size 119 | print("Replay Buffer loaded") 120 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 121 | except: 122 | raise ValueError("Replay Buffer not loaded") 123 | 124 | def reset_networks(self): 125 | """reset network parameters""" 126 | print("Resetting Networks!") 127 | self.loss_module.actor_network_params.apply(self.reset_parameter) 128 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 129 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 130 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 131 | 132 | def eval(self): 133 | """Sets the agent to evaluation mode.""" 134 | self.actor.eval() 135 | 136 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 137 | # TODO not ideal to have this here 138 | td.pop("scale") 139 | td.pop("loc") 140 | td.pop("params") 141 | if "vector_obs_embedding" in td.keys(): 142 | td.pop("vector_obs_embedding") 143 | if "image_embedding" in td.keys(): 144 | td.pop("image_embedding") 145 | 146 | def create_replay_buffer( 147 | self, 148 | prb=False, 149 | buffer_size=100000, 150 | buffer_scratch_dir=None, 151 | device="cpu", 152 | prefetch=3, 153 | ): 154 | """Create replay buffer""" 155 | # TODO: make this part of base off policy agent 156 | if prb: 157 | replay_buffer = TensorDictPrioritizedReplayBuffer( 158 | alpha=0.7, 159 | beta=0.5, 160 | pin_memory=False, 161 | prefetch=1, 162 | storage=LazyTensorStorage( 163 | buffer_size, 164 | ), 165 | ) 166 | else: 167 | replay_buffer = TensorDictReplayBuffer( 168 | pin_memory=False, 169 | prefetch=prefetch, 170 | storage=LazyMemmapStorage( 171 | buffer_size, 172 | scratch_dir=buffer_scratch_dir, 173 | ), 174 | batch_size=self.batch_size, 175 | ) 176 | replay_buffer.append_transform(lambda x: x.to(device)) 177 | # TODO: check if we have image in observation space if so add this transform 178 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 179 | return replay_buffer 180 | 181 | @torch.no_grad() 182 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 183 | """Get action from actor network""" 184 | with set_exploration_type(ExplorationType.RANDOM): 185 | out_td = self.actor(td.to(self.device)) 186 | self.td_preprocessing(out_td) 187 | return out_td 188 | 189 | @torch.no_grad() 190 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 191 | """Get eval action from actor network""" 192 | with set_exploration_type(ExplorationType.MODE): 193 | out_td = self.actor(td.to(self.device)) 194 | self.td_preprocessing(out_td) 195 | return out_td 196 | 197 | def add_experience(self, transition: td.TensorDict): 198 | """Add experience to replay buffer""" 199 | self.replay_buffer.extend(transition) 200 | self.collected_transitions += 1 201 | 202 | def train(self, batch_size=64, num_updates=1): 203 | """Train the agent""" 204 | self.actor.train() 205 | for i in 
range(num_updates): 206 | self.total_updates += 1 207 | # Sample a batch from the replay buffer 208 | batch = self.replay_buffer.sample(batch_size) 209 | # Compute CQL Loss 210 | loss = self.loss_module(batch) 211 | 212 | # Update alpha 213 | alpha_loss = loss["loss_alpha"] 214 | alpha_prime_loss = loss["loss_alpha_prime"] 215 | self.optimizer_alpha.zero_grad() 216 | alpha_loss.backward() 217 | self.optimizer_alpha.step() 218 | 219 | # Update Actpr Network 220 | # official cql implementation uses behavior cloning loss for first few updating steps as it helps for some tasks 221 | if self.total_updates >= self.bc_steps: 222 | actor_loss = loss["loss_actor"] 223 | else: 224 | actor_loss = loss["loss_actor_bc"] 225 | self.optimizer_actor.zero_grad() 226 | actor_loss.backward() 227 | self.optimizer_actor.step() 228 | 229 | if self.alpha_prime_optim is not None: 230 | self.alpha_prime_optim.zero_grad() 231 | alpha_prime_loss.backward(retain_graph=True) 232 | self.alpha_prime_optim.step() 233 | 234 | # Update Critic Network 235 | q_loss = loss["loss_qvalue"] 236 | cql_loss = loss["loss_cql"] 237 | 238 | q_loss = q_loss + cql_loss 239 | self.optimizer_critic.zero_grad() 240 | q_loss.backward(retain_graph=False) 241 | self.optimizer_critic.step() 242 | 243 | # Update Target Networks 244 | self.target_net_updater.step() 245 | # Update Prioritized Replay Buffer 246 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 247 | self.replay_buffer.update_priorities( 248 | batch["indices"], 249 | loss["critic_loss"].detach().cpu().numpy(), 250 | ) 251 | self.actor.eval() 252 | return loss 253 | -------------------------------------------------------------------------------- /experiments/helper/agents/iql.py: -------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import ( 6 | get_critic, 7 | get_stochastic_actor, 8 | get_value_operator, 9 | ) 10 | from tensordict import TensorDictBase 11 | from torch import optim 12 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 13 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 14 | from torchrl.envs.transforms import ToTensorImage 15 | from torchrl.envs.utils import ExplorationType, set_exploration_type 16 | from torchrl.objectives import SoftUpdate 17 | 18 | from torchrl.objectives.iql import IQLLoss 19 | 20 | 21 | class IQLAgent(BaseAgent): 22 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 23 | super(IQLAgent, self).__init__( 24 | state_spec, action_spec, agent_config.name, device 25 | ) 26 | 27 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 28 | self.critic = get_critic(state_spec, agent_config) 29 | 30 | self.value = get_value_operator(state_spec, agent_config) 31 | 32 | self.actor.to(device) 33 | self.critic.to(device) 34 | self.value.to(device) 35 | 36 | # initialize networks 37 | self.init_nets([self.actor, self.critic, self.value]) 38 | 39 | # define loss function 40 | self.loss_module = IQLLoss( 41 | actor_network=self.actor, 42 | qvalue_network=self.critic, 43 | value_network=self.value, 44 | num_qvalue_nets=2, 45 | temperature=agent_config.temperature, 46 | expectile=agent_config.expectile, 47 | loss_function=agent_config.loss_function, 48 | ) 49 | # Define Target Network Updater 50 | self.target_net_updater = SoftUpdate( 51 | 
self.loss_module, eps=agent_config.soft_update_eps 52 | ) 53 | self.target_net_updater.init_() 54 | 55 | # Reset weights 56 | self.reset_params = agent_config.reset_params 57 | 58 | # Define Replay Buffer 59 | self.batch_size = agent_config.batch_size 60 | 61 | self.replay_buffer = self.create_replay_buffer( 62 | prb=agent_config.prb, 63 | buffer_size=agent_config.buffer_size, 64 | device=device, 65 | ) 66 | 67 | # Define Optimizer 68 | critic_params = list( 69 | self.loss_module.qvalue_network_params.flatten_keys().values() 70 | ) 71 | value_params = list( 72 | self.loss_module.value_network_params.flatten_keys().values() 73 | ) 74 | actor_params = list( 75 | self.loss_module.actor_network_params.flatten_keys().values() 76 | ) 77 | self.optimizer_actor = optim.Adam( 78 | actor_params, lr=agent_config.lr, weight_decay=0.0 79 | ) 80 | self.optimizer_critic = optim.Adam( 81 | critic_params, lr=agent_config.lr, weight_decay=0.0 82 | ) 83 | self.optimizer_value = optim.Adam( 84 | value_params, lr=agent_config.lr, weight_decay=0.0 85 | ) 86 | 87 | # general stats 88 | self.collected_transitions = 0 89 | self.total_updates = 0 90 | self.do_pretrain = agent_config.pretrain 91 | 92 | def get_agent_statedict(self): 93 | """Save agent""" 94 | act_statedict = self.actor.state_dict() 95 | critic_statedict = self.critic.state_dict() 96 | value_statedict = self.value.state_dict() 97 | return { 98 | "actor": act_statedict, 99 | "critic": critic_statedict, 100 | "value": value_statedict, 101 | } 102 | 103 | def load_model(self, path): 104 | """load model""" 105 | 106 | try: 107 | statedict = torch.load(path) 108 | self.actor.load_state_dict(statedict["actor"]) 109 | self.critic.load_state_dict(statedict["critic"]) 110 | self.value.load_state_dict(statedict["value"]) 111 | print("Model loaded") 112 | except: 113 | raise ValueError("Model not loaded") 114 | 115 | def load_replaybuffer(self, path): 116 | """load replay buffer""" 117 | try: 118 | loaded_data = TensorDictBase.load_memmap(path) 119 | self.replay_buffer.extend(loaded_data) 120 | if self.replay_buffer._batch_size != self.batch_size: 121 | Warning( 122 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 
123 | ) 124 | self.replay_buffer._batch_size = self.batch_size 125 | print("Replay Buffer loaded") 126 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 127 | except: 128 | raise ValueError("Replay Buffer not loaded") 129 | 130 | def reset_networks(self): 131 | """reset network parameters""" 132 | print("Resetting Networks!") 133 | self.loss_module.actor_network_params.apply(self.reset_parameter) 134 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 135 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 136 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 137 | self.loss_module.value_network_params.apply(self.reset_parameter) 138 | 139 | def eval(self): 140 | """Sets the agent to evaluation mode.""" 141 | self.actor.eval() 142 | 143 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 144 | # TODO not ideal to have this here 145 | td.pop("scale") 146 | td.pop("loc") 147 | td.pop("params") 148 | if "vector_obs_embedding" in td.keys(): 149 | td.pop("vector_obs_embedding") 150 | if "image_embedding" in td.keys(): 151 | td.pop("image_embedding") 152 | 153 | def create_replay_buffer( 154 | self, 155 | prb=False, 156 | buffer_size=100000, 157 | buffer_scratch_dir=None, 158 | device="cpu", 159 | prefetch=3, 160 | ): 161 | """Create replay buffer""" 162 | # TODO: make this part of base off policy agent 163 | if prb: 164 | replay_buffer = TensorDictPrioritizedReplayBuffer( 165 | alpha=0.7, 166 | beta=0.5, 167 | pin_memory=False, 168 | prefetch=1, 169 | storage=LazyTensorStorage( 170 | buffer_size, 171 | device=device, 172 | ), 173 | ) 174 | else: 175 | replay_buffer = TensorDictReplayBuffer( 176 | pin_memory=False, 177 | prefetch=prefetch, 178 | storage=LazyMemmapStorage( 179 | buffer_size, 180 | scratch_dir=buffer_scratch_dir, 181 | ), 182 | batch_size=self.batch_size, 183 | ) 184 | replay_buffer.append_transform(lambda x: x.to(device)) 185 | # TODO: check if we have image in observation space if so add this transform 186 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 187 | 188 | return replay_buffer 189 | 190 | @torch.no_grad() 191 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 192 | """Get action from actor network""" 193 | with set_exploration_type(ExplorationType.RANDOM): 194 | out_td = self.actor(td.to(self.device)) 195 | self.td_preprocessing(out_td) 196 | return out_td 197 | 198 | @torch.no_grad() 199 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 200 | """Get eval action from actor network""" 201 | with set_exploration_type(ExplorationType.MODE): 202 | out_td = self.actor(td.to(self.device)) 203 | self.td_preprocessing(out_td) 204 | return out_td 205 | 206 | def add_experience(self, transition: td.TensorDict): 207 | """Add experience to replay buffer""" 208 | self.replay_buffer.extend(transition) 209 | self.collected_transitions += 1 210 | 211 | def pretrain(self, wandb, batch_size=64, num_updates=1): 212 | """Pretrain the agent with simple behavioral cloning""" 213 | # TODO: implement pretrain for testing 214 | # for i in range(num_updates): 215 | # batch = self.replay_buffer.sample(batch_size) 216 | # pred, _ = self.actor(batch["observations"].float()) 217 | # loss = torch.mean((pred - batch["actions"]) ** 2) 218 | # self.optimizer.zero_grad() 219 | # loss.backward() 220 | # self.optimizer.step() 221 | # wandb.log({"pretrain/loss": loss.item()}) 222 | 223 | def train(self, batch_size=64, num_updates=1): 224 | 
"""Train the agent""" 225 | self.actor.train() 226 | for i in range(num_updates): 227 | self.total_updates += 1 228 | if self.reset_params and self.total_updates % self.reset_params == 0: 229 | self.reset_networks() 230 | # Sample a batch from the replay buffer 231 | batch = self.replay_buffer.sample(batch_size) 232 | # Compute IQL Loss 233 | loss = self.loss_module(batch) 234 | 235 | # Update Actpr Network 236 | self.optimizer_actor.zero_grad() 237 | loss["loss_actor"].backward() 238 | self.optimizer_actor.step() 239 | # Update Critic Network 240 | self.optimizer_critic.zero_grad() 241 | loss["loss_qvalue"].backward() 242 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 243 | self.optimizer_critic.step() 244 | # Update Value Network 245 | self.optimizer_value.zero_grad() 246 | loss["loss_value"].backward() 247 | self.optimizer_value.step() 248 | 249 | # Update Target Networks 250 | self.target_net_updater.step() 251 | # Update Prioritized Replay Buffer 252 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 253 | self.replay_buffer.update_priorities( 254 | batch["indices"], 255 | loss["critic_loss"].detach().cpu().numpy(), 256 | ) 257 | self.actor.eval() 258 | return loss 259 | -------------------------------------------------------------------------------- /experiments/helper/agents/random.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from experiments.helper.agents.base import BaseAgent 4 | from tensordict import TensorDictBase 5 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 6 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 7 | 8 | 9 | class RandomAgent(BaseAgent): 10 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 11 | super(RandomAgent, self).__init__( 12 | state_spec, action_spec, agent_config.name, device 13 | ) 14 | 15 | self.actor = None 16 | self.replay_buffer = self.create_replay_buffer( 17 | batch_size=256, 18 | prb=False, 19 | buffer_size=1000000, 20 | device=device, 21 | buffer_scratch_dir="/tmp", 22 | ) 23 | 24 | def eval(self): 25 | """Sets the agent to evaluation mode.""" 26 | 27 | @torch.no_grad() 28 | def get_action(self, tensordict: TensorDictBase): 29 | """Sample random actions from a uniform distribution""" 30 | tensordict.set("action", self.action_spec.rand()) 31 | return tensordict 32 | 33 | @torch.no_grad() 34 | def get_eval_action(self, tensordict: TensorDictBase): 35 | """Sample random actions from a uniform distribution""" 36 | tensordict.set("action", self.action_spec.rand()) 37 | return tensordict 38 | 39 | def add_experience(self, transition: TensorDictBase): 40 | """Add experience to replay buffer""" 41 | self.replay_buffer.extend(transition) 42 | 43 | def train(self, batch_size=64, num_updates=1): 44 | """Train the agent""" 45 | return {} 46 | 47 | def create_replay_buffer( 48 | self, 49 | batch_size=256, 50 | prb=False, 51 | buffer_size=100000, 52 | buffer_scratch_dir=None, 53 | device="cpu", 54 | prefetch=3, 55 | ): 56 | """Create replay buffer""" 57 | # TODO: make this part of base off policy agent 58 | if prb: 59 | replay_buffer = TensorDictPrioritizedReplayBuffer( 60 | alpha=0.7, 61 | beta=0.5, 62 | pin_memory=False, 63 | prefetch=1, 64 | storage=LazyTensorStorage( 65 | buffer_size, 66 | ), 67 | ) 68 | else: 69 | replay_buffer = TensorDictReplayBuffer( 70 | pin_memory=False, 71 | prefetch=prefetch, 72 | storage=LazyMemmapStorage( 73 | buffer_size, 74 | 
scratch_dir=buffer_scratch_dir, 75 | ), 76 | batch_size=batch_size, 77 | ) 78 | return replay_buffer 79 | -------------------------------------------------------------------------------- /experiments/helper/agents/sac.py: -------------------------------------------------------------------------------- 1 | import tensordict as td 2 | import torch 3 | 4 | from experiments.helper.agents.base import BaseAgent 5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor 6 | from tensordict import TensorDictBase 7 | from torch import optim 8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer 9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage 10 | from torchrl.envs.utils import ExplorationType, set_exploration_type 11 | from torchrl.objectives import SoftUpdate 12 | 13 | from torchrl.objectives.sac import SACLoss 14 | 15 | 16 | class SACAgent(BaseAgent): 17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"): 18 | super(SACAgent, self).__init__( 19 | state_spec, action_spec, agent_config.name, device 20 | ) 21 | 22 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config) 23 | self.critic = get_critic(state_spec, agent_config) 24 | 25 | self.actor.to(device) 26 | self.critic.to(device) 27 | 28 | # initialize networks 29 | self.init_nets([self.actor, self.critic]) 30 | 31 | # define loss function 32 | self.loss_module = SACLoss( 33 | actor_network=self.actor, 34 | qvalue_network=self.critic, 35 | delay_qvalue=True, 36 | value_network=None, # None to use SAC version 2 37 | num_qvalue_nets=2, 38 | fixed_alpha=agent_config.fixed_alpha, 39 | alpha_init=agent_config.alpha_init, 40 | loss_function=agent_config.loss_function, 41 | ) 42 | # Define Target Network Updater 43 | self.target_net_updater = SoftUpdate( 44 | self.loss_module, eps=agent_config.soft_update_eps 45 | ) 46 | self.target_net_updater.init_() 47 | 48 | # Reset weights 49 | self.reset_params = agent_config.reset_params 50 | 51 | self.batch_size = agent_config.batch_size 52 | # Define Replay Buffer 53 | self.buffer_batch_size = agent_config.batch_size 54 | self.replay_buffer = self.create_replay_buffer( 55 | prb=agent_config.prb, 56 | buffer_size=agent_config.buffer_size, 57 | buffer_scratch_dir="/tmp", 58 | device=device, 59 | ) 60 | # Define Optimizer 61 | critic_params = list( 62 | self.loss_module.qvalue_network_params.flatten_keys().values() 63 | ) 64 | actor_params = list( 65 | self.loss_module.actor_network_params.flatten_keys().values() 66 | ) 67 | self.optimizer_actor = optim.Adam( 68 | actor_params, lr=agent_config.lr, weight_decay=0.0 69 | ) 70 | self.optimizer_critic = optim.Adam( 71 | critic_params, lr=agent_config.lr, weight_decay=0.0 72 | ) 73 | self.optimizer_alpha = optim.Adam( 74 | [self.loss_module.log_alpha], 75 | lr=3.0e-4, 76 | ) 77 | 78 | # general stats 79 | self.collected_transitions = 0 80 | self.total_updates = 0 81 | 82 | def get_agent_statedict(self): 83 | """Save agent""" 84 | act_statedict = self.actor.state_dict() 85 | critic_statedict = self.critic.state_dict() 86 | return {"actor": act_statedict, "critic": critic_statedict} 87 | 88 | def load_model(self, path): 89 | """load model""" 90 | try: 91 | statedict = torch.load(path) 92 | self.actor.load_state_dict(statedict["actor"]) 93 | self.critic.load_state_dict(statedict["critic"]) 94 | print("Model loaded") 95 | except: 96 | raise ValueError("Model not loaded") 97 | 98 | def load_replaybuffer(self, path): 99 | """load 
replay buffer""" 100 | try: 101 | loaded_data = TensorDictBase.load_memmap(path) 102 | self.replay_buffer.extend(loaded_data) 103 | if self.replay_buffer._batch_size != self.batch_size: 104 | Warning( 105 | "Batch size of the loaded replay buffer is different from the agent's config batch size! Rewriting the batch size to match the agent's config batch size." 106 | ) 107 | self.replay_buffer._batch_size = self.batch_size 108 | print("Replay Buffer loaded") 109 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n") 110 | except: 111 | raise ValueError("Replay Buffer not loaded") 112 | 113 | def reset_networks(self): 114 | """reset network parameters""" 115 | print("Resetting Networks!") 116 | self.loss_module.actor_network_params.apply(self.reset_parameter) 117 | self.loss_module.target_actor_network_params.apply(self.reset_parameter) 118 | self.loss_module.qvalue_network_params.apply(self.reset_parameter) 119 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter) 120 | 121 | def eval(self): 122 | """Sets the agent to evaluation mode.""" 123 | self.actor.eval() 124 | 125 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase: 126 | # TODO not ideal to have this here 127 | td.pop("scale") 128 | td.pop("loc") 129 | td.pop("params") 130 | if "obs_embedding" in td.keys(): 131 | td.pop("obs_embedding") 132 | if "pixel_embedding" in td.keys(): 133 | td.pop("pixel_embedding") 134 | 135 | def create_replay_buffer( 136 | self, 137 | prb=False, 138 | buffer_size=100000, 139 | buffer_scratch_dir=".", 140 | device="cpu", 141 | prefetch=3, 142 | ): 143 | """Create replay buffer""" 144 | # TODO: make this part of base off policy agent 145 | if prb: 146 | replay_buffer = TensorDictPrioritizedReplayBuffer( 147 | alpha=0.7, 148 | beta=0.5, 149 | pin_memory=False, 150 | prefetch=1, 151 | storage=LazyTensorStorage( 152 | buffer_size, 153 | ), 154 | ) 155 | else: 156 | replay_buffer = TensorDictReplayBuffer( 157 | pin_memory=False, 158 | prefetch=prefetch, 159 | storage=LazyMemmapStorage( 160 | buffer_size, 161 | scratch_dir=buffer_scratch_dir, 162 | ), 163 | batch_size=self.batch_size, 164 | ) 165 | replay_buffer.append_transform(lambda x: x.to(device)) 166 | # TODO: check if we have image in observation space if so add this transform 167 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True)) 168 | 169 | return replay_buffer 170 | 171 | @torch.no_grad() 172 | def get_action(self, td: TensorDictBase) -> TensorDictBase: 173 | """Get action from actor network""" 174 | with set_exploration_type(ExplorationType.RANDOM): 175 | out_td = self.actor(td.to(self.device)) 176 | self.td_preprocessing(out_td) 177 | return out_td 178 | 179 | @torch.no_grad() 180 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase: 181 | """Get eval action from actor network""" 182 | with set_exploration_type(ExplorationType.MODE): 183 | out_td = self.actor(td.to(self.device)) 184 | self.td_preprocessing(out_td) 185 | return out_td 186 | 187 | def add_experience(self, transition: td.TensorDict): 188 | """Add experience to replay buffer""" 189 | self.replay_buffer.extend(transition) 190 | self.collected_transitions += 1 191 | 192 | def train(self, batch_size=64, num_updates=1): 193 | """Train the agent""" 194 | self.actor.train() 195 | for i in range(num_updates): 196 | self.total_updates += 1 197 | if self.reset_params and self.total_updates % self.reset_params == 0: 198 | self.reset_networks() 199 | # Sample a batch from the replay buffer 200 | batch 
= self.replay_buffer.sample(batch_size) 201 | # Compute SAC Loss 202 | loss = self.loss_module(batch) 203 | 204 | # Update Actpr Network 205 | self.optimizer_actor.zero_grad() 206 | loss["loss_actor"].backward() 207 | self.optimizer_actor.step() 208 | # Update Critic Network 209 | self.optimizer_critic.zero_grad() 210 | loss["loss_qvalue"].backward() 211 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5) 212 | self.optimizer_critic.step() 213 | 214 | # Update alpha 215 | self.optimizer_alpha.zero_grad() 216 | loss["loss_alpha"].backward() 217 | self.optimizer_alpha.step() 218 | 219 | # Update Target Networks 220 | self.target_net_updater.step() 221 | # Update Prioritized Replay Buffer 222 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer): 223 | self.replay_buffer.update_priorities( 224 | batch["indices"], 225 | loss["critic_loss"].detach().cpu().numpy(), 226 | ) 227 | self.actor.eval() 228 | return loss 229 | -------------------------------------------------------------------------------- /experiments/helper/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/networks/__init__.py -------------------------------------------------------------------------------- /experiments/helper/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import tensordict as td 5 | import torch 6 | from bricksrl.environments import ALL_2WHEELER_ENVS, ALL_ROBOARM_ENVS, ALL_WALKER_ENVS 7 | from moviepy.editor import concatenate_videoclips, ImageClip 8 | from omegaconf import DictConfig 9 | from tensordict import TensorDict, TensorDictBase 10 | from torchrl.envs.utils import step_mdp 11 | from tqdm import tqdm 12 | 13 | 14 | def setup_check(robot: str, config: DictConfig): 15 | if robot == "2wheeler": 16 | assert ( 17 | config.env.name in ALL_2WHEELER_ENVS 18 | ), f"You are trying to run a 2wheeler experiment but are using the env {config.env.name}, select one of {ALL_2WHEELER_ENVS}" 19 | elif robot == "walker": 20 | assert ( 21 | config.env.name in ALL_WALKER_ENVS 22 | ), f"You are trying to run a walker experiment but are using the env {config.env.name}, select one of {ALL_WALKER_ENVS}" 23 | elif robot == "roboarm": 24 | assert ( 25 | config.env.name in ALL_ROBOARM_ENVS 26 | ), f"You are trying to run a roboarm experiment but are using the env {config.env.name}, select one of {ALL_ROBOARM_ENVS}" 27 | 28 | 29 | def data2numpy(data: list): 30 | """Convert a list of bytes to a numpy array.""" 31 | return np.array(data)[None, :] 32 | 33 | 34 | def handle_disconnect(_): 35 | print("Hub was disconnected.") 36 | 37 | 38 | def tensordict2dict(td: TensorDictBase) -> dict: 39 | """Convert a TensorDict to a dictionary.""" 40 | return {k: v.item() for k, v in td.items()} 41 | 42 | 43 | def logout(agent): 44 | # TODO save model or training data 45 | x = input("Do you want to save the model? (y/n)") 46 | if x == "y": 47 | save_name = input("Enter the name of the file to save: ") 48 | torch.save(agent.get_agent_statedict(), save_name + ".pth") 49 | x = input("Do you want to save the replay buffer? 
(y/n)") 50 | if x == "y": 51 | save_name = input("Enter the name of the file to save: ") 52 | # agent.replay_buffer.dump(save_name) 53 | batched_data = agent.replay_buffer.storage._storage[ 54 | : agent.replay_buffer.__len__() 55 | ] 56 | batched_data.save(save_name, copy_existing=True) 57 | 58 | 59 | def login(agent): 60 | x = input("Do you want to load the model? (y/n)") 61 | if x == "y": 62 | save_name = input("Enter the name of the file to load: ") 63 | agent.load_model(save_name) 64 | else: 65 | print("Model not loaded!") 66 | x = input("Do you want to load the replay buffer? (y/n)") 67 | if x == "y": 68 | save_name = input("Enter the name of the file to load: ") 69 | agent.load_replaybuffer(save_name) 70 | else: 71 | print("Buffer not loaded!") 72 | 73 | 74 | def prefill_buffer(env, agent, num_episodes=10, stop_on_done=False): 75 | """ 76 | Prefills the agent's replay buffer with experiences by running the environment for a specified number of episodes. 77 | 78 | Args: 79 | - env: gym.Env object representing the environment 80 | - agent: Agent object with an add_experience method to add experiences to the replay buffer 81 | - num_episodes: int, number of episodes to run the environment for 82 | 83 | Returns: None 84 | """ 85 | if agent.name in ["sac", "td3"]: 86 | inpt = input("Press Enter to start prefilling episode: ") 87 | for e in tqdm(range(num_episodes), desc="Prefilling buffer"): 88 | print("Prefill episode: ", e) 89 | td = env.reset() 90 | done = False 91 | truncated = False 92 | while not done and not truncated: 93 | td = env.sample_random_action(td) 94 | td = env.step(td) 95 | agent.add_experience(td) 96 | done = td.get(("next", "done")) 97 | 98 | if done and stop_on_done: 99 | inpt = input( 100 | "Please reset the robot to the starting position and press Enter to continue or q to quit:" 101 | ) 102 | if inpt == "q": 103 | break 104 | td = step_mdp(td) 105 | print("Prefill done! Buffer size: ", agent.replay_buffer.__len__()) 106 | 107 | 108 | def convert_bgr_to_rgb(bgr_image: np.array) -> np.array: 109 | return bgr_image[:, :, ::-1] # Reverses the third dimension (color channels) 110 | 111 | 112 | def create_video_from_images( 113 | images: List[np.array], video_name: str = "episode_1", fps: int = 20 114 | ): 115 | # Convert each NumPy array image to an ImageClip 116 | clips = [ImageClip(convert_bgr_to_rgb(np_img.squeeze(0))) for np_img in images] 117 | 118 | # Set the duration of each clip to match the desired FPS 119 | # Note: This assumes all images should be displayed for an equal amount of time. 
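# With clip.duration = 1 / fps, a list of N captured frames yields a video of roughly N / fps seconds (e.g. 100 frames at fps=5 give a 20 second clip).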
120 | for clip in clips: 121 | clip.duration = 1 / fps 122 | 123 | # Concatenate the ImageClips into a single video 124 | final_clip = concatenate_videoclips(clips, method="compose") 125 | 126 | # Write the result to a video file 127 | final_clip.write_videofile(video_name, fps=fps) 128 | -------------------------------------------------------------------------------- /experiments/roboarm/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | setup_check, 24 | ) 25 | 26 | 27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 28 | def run(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | # make environment. 32 | setup_check(robot="roboarm", config=cfg) 33 | env, action_space, state_space = make_env(cfg) 34 | 35 | # make agent 36 | agent, project_name = get_agent(action_space, state_space, cfg) 37 | login(agent) 38 | agent.eval() 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name + "_eval") 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | 44 | eval_episodes = cfg.episodes 45 | env_name = cfg.env.name 46 | quit = False 47 | _ = input("Press Enter to start evaluation...") 48 | try: 49 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 50 | td = env.reset() 51 | done = td.get("done", False) 52 | truncated = td.get("truncated", False) 53 | ep_return = 0 54 | ep_steps = 0 55 | total_step_times = [] 56 | if env_name in VIDEO_LOGGING_ENVS: 57 | image_captures = [td.get("original_pixels").numpy()] 58 | print("Start new evaluation...", flush=True) 59 | while not done and not truncated: 60 | ep_steps += 1 61 | step_start_time = time.time() 62 | td = agent.get_eval_action(td) 63 | td = env.step(td) 64 | agent.add_experience(td) 65 | if env_name in VIDEO_LOGGING_ENVS: 66 | image_captures.append( 67 | td.get(("next", "original_pixels")).cpu().numpy() 68 | ) 69 | 70 | total_agent_step_time = time.time() - step_start_time 71 | total_step_times.append(total_agent_step_time) 72 | done = td.get(("next", "done"), False) 73 | ep_return += td.get(("next", "reward"), 0) 74 | 75 | if done: 76 | break 77 | td = step_mdp(td) 78 | 79 | if quit: 80 | break 81 | 82 | # Metrics Logging 83 | log_dict = { 84 | "epoch": e, 85 | "reward": ep_return, 86 | "steps": ep_steps, 87 | "total_step_time": np.mean(total_step_times), 88 | "done": done.float(), 89 | } 90 | if env_name == "roboarm-v0" or env_name == "roboarm_sim-v0": 91 | final_error = td.get(("error")).item() 92 | log_dict.update({"final_error": final_error}) 93 | 94 | wandb.log(log_dict) 95 | if env_name in VIDEO_LOGGING_ENVS: 96 | video_name = "episode_{}.mp4".format(e) 97 | create_video_from_images(image_captures, video_name, fps=5) 98 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 99 | 100 | except 
KeyboardInterrupt: 101 | print("Evaluation interrupted by user.") 102 | 103 | logout(agent) 104 | env.close() 105 | 106 | 107 | if __name__ == "__main__": 108 | run() 109 | -------------------------------------------------------------------------------- /experiments/roboarm/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict 17 | 18 | 19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="roboarm", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/roboarm/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | prefill_buffer, 24 | setup_check, 25 | tensordict2dict, 26 | ) 27 | 28 | 29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 30 | def run(cfg: DictConfig) -> None: 31 | print(OmegaConf.to_yaml(cfg)) 32 | 33 | # make environment. 
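# For the roboarm experiments, setup_check() asserts that cfg.env.name is one of the roboarm environments (e.g. the real roboarm-v0 or the simulated roboarm_sim-v0), so the same training script can drive either the physical or the simulated arm.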
34 | setup_check(robot="roboarm", config=cfg) 35 | env, action_space, state_space = make_env(cfg) 36 | 37 | # make agent 38 | agent, project_name = get_agent(action_space, state_space, cfg) 39 | login(agent) 40 | 41 | # initialize wandb 42 | wandb.init(project=project_name) 43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 45 | 46 | # prefill buffer with random actions 47 | prefill_buffer( 48 | env=env, 49 | agent=agent, 50 | num_episodes=cfg.agent.prefill_episodes, 51 | ) 52 | 53 | batch_size = cfg.agent.batch_size 54 | num_updates = cfg.agent.num_updates 55 | env_name = cfg.env.name 56 | train_episodes = cfg.episodes 57 | max_episode_steps = cfg.env.max_episode_steps 58 | 59 | print("Start training...") 60 | quit = False 61 | try: 62 | for e in tqdm(range(train_episodes), desc="Training"): 63 | td = env.reset() 64 | done = td.get("done", False) 65 | truncated = td.get("truncated", False) 66 | ep_return = 0 67 | ep_steps = 0 68 | total_step_times = [] 69 | if env_name in VIDEO_LOGGING_ENVS: 70 | image_caputres = [td.get("original_pixels").numpy()] 71 | print("Start new data collection...", flush=True) 72 | while not done and not truncated: 73 | ep_steps += 1 74 | step_start_time = time.time() 75 | td = agent.get_action(td) 76 | td = env.step(td) 77 | if env_name in VIDEO_LOGGING_ENVS: 78 | image_caputres.append( 79 | td.get(("next", "original_pixels")).cpu().numpy() 80 | ) 81 | agent.add_experience(td) 82 | total_agent_step_time = time.time() - step_start_time 83 | total_step_times.append(total_agent_step_time) 84 | done = td.get(("next", "done"), False) 85 | ep_return += td.get(("next", "reward"), 0) 86 | 87 | td = step_mdp(td) 88 | if done: 89 | break 90 | 91 | loss_info = agent.train( 92 | batch_size=batch_size, num_updates=num_updates * ep_steps 93 | ) 94 | 95 | if quit: 96 | break 97 | 98 | # Metrics Logging 99 | log_dict = { 100 | "epoch": e, 101 | "reward": ep_return, 102 | "steps": ep_steps, 103 | "total_step_time": np.mean(total_step_times), 104 | "buffer_size": agent.replay_buffer.__len__(), 105 | "done": done.float(), 106 | } 107 | if env_name == "roboarm-v0" or env_name == "roboarm_sim-v0": 108 | final_error = td.get(("error")).item() 109 | log_dict.update({"final_error": final_error}) 110 | log_dict.update(tensordict2dict(loss_info)) 111 | wandb.log(log_dict) 112 | if env_name in VIDEO_LOGGING_ENVS and done and ep_steps < max_episode_steps: 113 | video_name = "episode_{}.mp4".format(e) 114 | create_video_from_images(image_caputres, video_name, fps=5) 115 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 116 | 117 | except KeyboardInterrupt: 118 | print("Training interrupted by user.") 119 | 120 | logout(agent) 121 | env.close() 122 | 123 | 124 | if __name__ == "__main__": 125 | run() 126 | -------------------------------------------------------------------------------- /experiments/walker/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import 
make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | setup_check, 24 | ) 25 | 26 | 27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 28 | def run(cfg: DictConfig) -> None: 29 | print(OmegaConf.to_yaml(cfg)) 30 | 31 | # make environment. 32 | setup_check(robot="walker", config=cfg) 33 | env, action_space, state_space = make_env(cfg) 34 | 35 | # make agent 36 | agent, project_name = get_agent(action_space, state_space, cfg) 37 | login(agent) 38 | agent.eval() 39 | 40 | # initialize wandb 41 | wandb.init(project=project_name + "_eval") 42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 43 | 44 | eval_episodes = cfg.episodes 45 | env_name = cfg.env.name 46 | quit = False 47 | _ = input("Press Enter to start evaluation...") 48 | try: 49 | for e in tqdm(range(eval_episodes), desc="Evaluation"): 50 | td = env.reset() 51 | done = td.get("done", False) 52 | truncated = td.get("truncated", False) 53 | ep_return = 0 54 | ep_steps = 0 55 | total_step_times = [] 56 | actions = [] 57 | if env_name in VIDEO_LOGGING_ENVS: 58 | image_caputres = [td.get("original_image").numpy()] 59 | # so we can reset the robot in the camera view 60 | input("Press Enter to start episode...") 61 | print("Start new evaluation...", flush=True) 62 | while not done and not truncated: 63 | ep_steps += 1 64 | step_start_time = time.time() 65 | td = agent.get_eval_action(td) 66 | actions.append(td.get("action").cpu().numpy()) 67 | td = env.step(td) 68 | agent.add_experience(td) 69 | total_agent_step_time = time.time() - step_start_time 70 | total_step_times.append(total_agent_step_time) 71 | done = td.get(("next", "done"), False) 72 | ep_return += td.get(("next", "reward"), 0) 73 | if env_name in VIDEO_LOGGING_ENVS: 74 | image_caputres.append( 75 | td.get(("next", "original_image")).cpu().numpy() 76 | ) 77 | 78 | if done: 79 | break 80 | td = step_mdp(td) 81 | 82 | if quit: 83 | break 84 | 85 | # Metrics Logging 86 | log_dict = { 87 | "epoch": e, 88 | "reward": ep_return, 89 | "steps": ep_steps, 90 | "total_step_time": np.mean(total_step_times), 91 | "buffer_size": agent.replay_buffer.__len__(), 92 | "done": done.float(), 93 | } 94 | 95 | wandb.log(log_dict) 96 | if env_name in VIDEO_LOGGING_ENVS: 97 | video_name = "episode_{}.mp4".format(e) 98 | create_video_from_images(image_caputres, video_name, fps=5) 99 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 100 | 101 | except KeyboardInterrupt: 102 | print("Evaluation interrupted by user.") 103 | 104 | logout(agent) 105 | env.close() 106 | 107 | 108 | if __name__ == "__main__": 109 | run() 110 | -------------------------------------------------------------------------------- /experiments/walker/pretrain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import hydra 5 | import wandb 6 | from omegaconf import DictConfig, OmegaConf 7 | from tqdm import tqdm 8 | 9 | # Add the project root to PYTHONPATH for config 10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 11 | if project_root not in sys.path: 12 | sys.path.insert(0, project_root) 13 | 14 | from bricksrl.environments import make_env 15 | from experiments.helper.agents import get_agent 16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict 17 | 18 | 19 | 
@hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 20 | def run(cfg: DictConfig) -> None: 21 | print(OmegaConf.to_yaml(cfg)) 22 | 23 | # make environment. 24 | setup_check(robot="walker", config=cfg) 25 | env, action_space, state_space = make_env(cfg, pretrain=True) 26 | 27 | # make agent 28 | agent, project_name = get_agent(action_space, state_space, cfg) 29 | login(agent) 30 | 31 | # initialize wandb 32 | wandb.init(project=project_name) 33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 35 | 36 | batch_size = cfg.agent.batch_size 37 | num_updates = cfg.agent.num_updates 38 | train_episodes = cfg.episodes 39 | print("Start training...") 40 | try: 41 | for e in tqdm(range(train_episodes), desc="Training"): 42 | 43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates) 44 | 45 | # Metrics Logging 46 | log_dict = { 47 | "epoch": e, 48 | "buffer_size": agent.replay_buffer.__len__(), 49 | } 50 | log_dict.update(tensordict2dict(loss_info)) 51 | wandb.log(log_dict) 52 | 53 | except KeyboardInterrupt: 54 | print("Training interrupted by user.") 55 | 56 | logout(agent) 57 | env.close() 58 | 59 | 60 | if __name__ == "__main__": 61 | run() 62 | -------------------------------------------------------------------------------- /experiments/walker/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | 5 | import hydra 6 | import numpy as np 7 | import wandb 8 | from omegaconf import DictConfig, OmegaConf 9 | from torchrl.envs.utils import step_mdp 10 | from tqdm import tqdm 11 | 12 | # Add the project root to PYTHONPATH for config 13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS 18 | from experiments.helper.agents import get_agent 19 | from experiments.helper.utils import ( 20 | create_video_from_images, 21 | login, 22 | logout, 23 | prefill_buffer, 24 | setup_check, 25 | tensordict2dict, 26 | ) 27 | 28 | 29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config") 30 | def run(cfg: DictConfig) -> None: 31 | print(OmegaConf.to_yaml(cfg)) 32 | 33 | # make environment. 
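    # As in experiments/roboarm/train.py, the env comes from the Hydra "env"
    # config group. Selecting walker_sim-v0, the simulated variant exercised in
    # tests/test_env_sim.py, lets this same training loop run without the
    # physical robot, e.g. with an illustrative override such as:
    #   python experiments/walker/train.py agent=td3 env=walker_sim-v0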
34 | setup_check(robot="walker", config=cfg) 35 | env, action_space, state_space = make_env(cfg) 36 | 37 | # make agent 38 | agent, project_name = get_agent(action_space, state_space, cfg) 39 | login(agent) 40 | 41 | # initialize wandb 42 | wandb.init(project=project_name) 43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True) 44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None 45 | 46 | # prefill buffer with random actions 47 | prefill_buffer(env=env, agent=agent, num_episodes=cfg.agent.prefill_episodes) 48 | 49 | batch_size = cfg.agent.batch_size 50 | num_updates = cfg.agent.num_updates 51 | env_name = cfg.env.name 52 | train_episodes = cfg.episodes 53 | print("Start training...") 54 | quit = False 55 | try: 56 | for e in tqdm(range(train_episodes), desc="Training"): 57 | td = env.reset() 58 | done = td.get("done", False) 59 | truncated = td.get("truncated", False) 60 | ep_return = 0 61 | ep_steps = 0 62 | total_step_times = [] 63 | agent_actions = [] 64 | if env_name in VIDEO_LOGGING_ENVS: 65 | image_caputres = [td.get("original_image").numpy()] 66 | # so we can reset the robot in the camera view 67 | input("Press Enter to start episode...") 68 | 69 | print("Start new data collection...", flush=True) 70 | while not done and not truncated: 71 | ep_steps += 1 72 | step_start_time = time.time() 73 | td = agent.get_action(td) 74 | td = env.step(td) 75 | agent.add_experience(td) 76 | done = td.get(("next", "done"), False) 77 | ep_return += td.get(("next", "reward"), 0) 78 | if env_name in VIDEO_LOGGING_ENVS: 79 | image_caputres.append( 80 | td.get(("next", "original_image")).cpu().numpy() 81 | ) 82 | total_agent_step_time = time.time() - step_start_time 83 | total_step_times.append(total_agent_step_time) 84 | if done: 85 | break 86 | td = step_mdp(td) 87 | loss_info = agent.train( 88 | batch_size=batch_size, num_updates=num_updates * ep_steps 89 | ) 90 | action = td.get("action").cpu().numpy() 91 | agent_actions.append(action) 92 | 93 | if quit: 94 | break 95 | 96 | # Metrics Logging 97 | log_dict = { 98 | "epoch": e, 99 | "reward": ep_return, 100 | "steps": ep_steps, 101 | "total_step_time": np.mean(total_step_times), 102 | "buffer_size": agent.replay_buffer.__len__(), 103 | "action": wandb.Histogram(action), 104 | "done": done, 105 | "action_mean": wandb.Histogram(np.mean(agent_actions, axis=0)), 106 | } 107 | log_dict.update(tensordict2dict(loss_info)) 108 | wandb.log(log_dict) 109 | if env_name in VIDEO_LOGGING_ENVS: 110 | video_name = "episode_{}.mp4".format(e) 111 | create_video_from_images(image_caputres, video_name, fps=5) 112 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")}) 113 | 114 | except KeyboardInterrupt: 115 | print("Training interrupted by user.") 116 | 117 | logout(agent) 118 | env.close() 119 | 120 | 121 | if __name__ == "__main__": 122 | run() 123 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=64", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="bricksrl", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | install_requires=[ 8 | "pybricksdev", 9 | "tensordict==0.5.0", 10 | 
"torchrl==0.5.0", 11 | "hydra-core==1.3.2", 12 | "wandb==0.16.1", 13 | "opencv-python==4.9.0.80", 14 | "moviepy==1.0.3", 15 | "tqdm==4.66.1", 16 | "numpy==1.24.1", 17 | "pynput", 18 | ], 19 | extras_require={ 20 | "dev": [ 21 | "pytest==8.0.2", 22 | "ufmt", 23 | "pre-commit", 24 | ], 25 | }, 26 | author="Sebastian Dittert", 27 | description="BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO", 28 | url="https://github.com/BricksRL/bricksrl", 29 | classifiers=[ 30 | "Programming Language :: Python :: 3", 31 | "License :: OSI Approved :: MIT License", 32 | "Operating System :: OS Independent", 33 | ], 34 | python_requires=">=3.8", 35 | ) 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_agents.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from bricksrl.environments.dummy.mixed_obs_dummy import MixedObsDummyEnv 4 | from bricksrl.environments.dummy.vec_obs_dummy import VecGoalObsDummyEnv, VecObsDummyEnv 5 | from experiments.helper.agents import get_agent 6 | from hydra import compose, initialize 7 | from torchrl.envs import Compose, ToTensorImage, TransformedEnv 8 | from torchrl.envs.utils import step_mdp 9 | 10 | 11 | def collection_round(env, agent, max_steps=1000): 12 | td = env.reset() 13 | for _ in range(max_steps): 14 | td = agent.get_action(td) 15 | td = env.step(td) 16 | agent.add_experience(td) 17 | td = step_mdp(td) 18 | 19 | 20 | def get_env(env, img_shape=(64, 64, 3)): 21 | if env == "mixed": 22 | env = MixedObsDummyEnv(img_shape=img_shape) 23 | env = TransformedEnv( 24 | env, Compose(ToTensorImage(in_keys=["pixels"], from_int=True)) 25 | ) 26 | elif env == "vec": 27 | env = VecObsDummyEnv() 28 | elif env == "vec_goal": 29 | env = VecGoalObsDummyEnv() 30 | else: 31 | raise ValueError("Invalid environment") 32 | return env 33 | 34 | 35 | @pytest.mark.parametrize( 36 | "env", 37 | ["mixed", "vec", "vec_goal"], 38 | ) 39 | @pytest.mark.parametrize( 40 | "device", 41 | ["cpu", "cuda"], 42 | ) 43 | def test_random_agent(env, device): 44 | with initialize(config_path="../conf"): 45 | cfg = compose(config_name="config") 46 | 47 | if torch.cuda.is_available() and device == "cuda": 48 | device = "cuda" 49 | else: 50 | device = "cpu" 51 | with initialize(config_path="../conf"): 52 | cfg = compose( 53 | config_name="config", overrides=["device=" + device, "agent=random"] 54 | ) 55 | # Test data collection 56 | env = get_env(env) 57 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 58 | collection_round(env, agent, max_steps=10) 59 | 60 | 61 | @pytest.mark.parametrize( 62 | "env", 63 | ["mixed", "vec", "vec_goal"], 64 | ) 65 | @pytest.mark.parametrize( 66 | "device", 67 | ["cpu", "cuda"], 68 | ) 69 | def test_sac_agent(env, device): 70 | if torch.cuda.is_available() and device == "cuda": 71 | device = "cuda" 72 | else: 73 | device = "cpu" 74 | with initialize(config_path="../conf"): 75 | cfg = compose(config_name="config", overrides=["agent=sac", "device=" + device]) 76 | 77 | # Test data collection 78 | env = get_env(env) 79 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 80 | 
collection_round(env, agent, max_steps=10) 81 | # Test training 82 | agent.train(batch_size=1, num_updates=1) 83 | 84 | # Test evaluation 85 | td = env.reset() 86 | td1 = agent.get_action(td) 87 | td2 = agent.get_action(td) 88 | 89 | assert not torch.allclose(td1["action"], td2["action"]) 90 | 91 | agent.eval() 92 | td = env.reset() 93 | eval_td1 = agent.get_eval_action(td) 94 | eval_td2 = agent.get_eval_action(td) 95 | 96 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 97 | 98 | 99 | @pytest.mark.parametrize( 100 | "env", 101 | ["mixed", "vec", "vec_goal"], 102 | ) 103 | @pytest.mark.parametrize( 104 | "device", 105 | ["cpu", "cuda"], 106 | ) 107 | def test_td3_agent(env, device): 108 | if torch.cuda.is_available() and device == "cuda": 109 | device = "cuda" 110 | else: 111 | device = "cpu" 112 | with initialize(config_path="../conf"): 113 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device]) 114 | 115 | # Test data collection 116 | env = get_env(env) 117 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 118 | collection_round(env, agent, max_steps=10) 119 | 120 | # Test training 121 | agent.train(batch_size=1, num_updates=1) 122 | 123 | # Test evaluation 124 | td = env.reset() 125 | td1 = agent.get_action(td) 126 | td2 = agent.get_action(td) 127 | 128 | assert not torch.allclose(td1["action"], td2["action"]) 129 | 130 | agent.eval() 131 | td = env.reset() 132 | eval_td1 = agent.get_eval_action(td) 133 | eval_td2 = agent.get_eval_action(td) 134 | 135 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 136 | 137 | 138 | @pytest.mark.parametrize( 139 | "env", 140 | ["mixed", "vec", "vec_goal"], 141 | ) 142 | @pytest.mark.parametrize( 143 | "device", 144 | ["cpu", "cuda"], 145 | ) 146 | def test_drq_agent(env, device): 147 | if torch.cuda.is_available() and device == "cuda": 148 | device = "cuda" 149 | else: 150 | device = "cpu" 151 | with initialize(config_path="../conf"): 152 | cfg = compose( 153 | config_name="config", overrides=["agent=droq", "device=" + device] 154 | ) 155 | 156 | # Test data collection 157 | env = get_env(env) 158 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 159 | collection_round(env, agent, max_steps=10) 160 | # Test training 161 | agent.train(batch_size=1, num_updates=1) 162 | 163 | # Test evaluation 164 | td = env.reset() 165 | td1 = agent.get_action(td) 166 | td2 = agent.get_action(td) 167 | 168 | assert not torch.allclose(td1["action"], td2["action"]) 169 | 170 | agent.eval() 171 | td = env.reset() 172 | eval_td1 = agent.get_eval_action(td) 173 | eval_td2 = agent.get_eval_action(td) 174 | 175 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 176 | 177 | 178 | @pytest.mark.parametrize( 179 | "env", 180 | ["mixed", "vec", "vec_goal"], 181 | ) 182 | @pytest.mark.parametrize( 183 | "device", 184 | ["cpu", "cuda"], 185 | ) 186 | def test_iql_agent(env, device): 187 | if torch.cuda.is_available() and device == "cuda": 188 | device = "cuda" 189 | else: 190 | device = "cpu" 191 | with initialize(config_path="../conf"): 192 | cfg = compose(config_name="config", overrides=["agent=iql", "device=" + device]) 193 | 194 | # Test data collection 195 | env = get_env(env) 196 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 197 | collection_round(env, agent, max_steps=10) 198 | # Test training 199 | agent.train(batch_size=1, num_updates=1) 200 | 201 | # Test evaluation 202 | td = env.reset() 203 | td1 = agent.get_action(td) 204 | td2 = 
agent.get_action(td) 205 | 206 | assert not torch.allclose(td1["action"], td2["action"]) 207 | 208 | agent.eval() 209 | td = env.reset() 210 | eval_td1 = agent.get_eval_action(td) 211 | eval_td2 = agent.get_eval_action(td) 212 | 213 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 214 | 215 | 216 | @pytest.mark.parametrize( 217 | "env", 218 | ["mixed", "vec", "vec_goal"], 219 | ) 220 | @pytest.mark.parametrize( 221 | "device", 222 | ["cpu", "cuda"], 223 | ) 224 | def test_cql_agent(env, device): 225 | if torch.cuda.is_available() and device == "cuda": 226 | device = "cuda" 227 | else: 228 | device = "cpu" 229 | with initialize(config_path="../conf"): 230 | cfg = compose(config_name="config", overrides=["agent=cql", "device=" + device]) 231 | 232 | # Test data collection 233 | env = get_env(env) 234 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 235 | collection_round(env, agent, max_steps=10) 236 | # Test training 237 | agent.train(batch_size=1, num_updates=1) 238 | 239 | # Test evaluation 240 | td = env.reset() 241 | td1 = agent.get_action(td) 242 | td2 = agent.get_action(td) 243 | 244 | assert not torch.allclose(td1["action"], td2["action"]) 245 | 246 | agent.eval() 247 | td = env.reset() 248 | eval_td1 = agent.get_eval_action(td) 249 | eval_td2 = agent.get_eval_action(td) 250 | 251 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 252 | 253 | 254 | @pytest.mark.parametrize( 255 | "env", 256 | ["mixed", "vec", "vec_goal"], 257 | ) 258 | @pytest.mark.parametrize( 259 | "device", 260 | ["cpu", "cuda"], 261 | ) 262 | def test_bc_agent(env, device): 263 | if torch.cuda.is_available() and device == "cuda": 264 | device = "cuda" 265 | else: 266 | device = "cpu" 267 | with initialize(config_path="../conf"): 268 | cfg = compose(config_name="config", overrides=["agent=bc", "device=" + device]) 269 | 270 | # Test data collection 271 | env = get_env(env) 272 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 273 | collection_round(env, agent, max_steps=10) 274 | # Test training 275 | agent.train(batch_size=1, num_updates=1) 276 | 277 | # Test evaluation 278 | agent.eval() 279 | td = env.reset() 280 | eval_td1 = agent.get_eval_action(td) 281 | eval_td2 = agent.get_eval_action(td) 282 | 283 | assert torch.allclose(eval_td1["action"], eval_td2["action"]) 284 | 285 | 286 | @pytest.mark.parametrize( 287 | "env", 288 | ["mixed"], 289 | ) 290 | @pytest.mark.parametrize( 291 | "img_shape", 292 | [(64, 64, 3), (128, 128, 3)], 293 | ) 294 | @pytest.mark.parametrize( 295 | "device", 296 | ["cpu", "cuda"], 297 | ) 298 | def test_mixd_obs_size_agent(env, device, img_shape): 299 | if torch.cuda.is_available() and device == "cuda": 300 | device = "cuda" 301 | else: 302 | device = "cpu" 303 | with initialize(config_path="../conf"): 304 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device]) 305 | 306 | # Test data collection 307 | env = get_env(env, img_shape) 308 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg) 309 | collection_round(env, agent, max_steps=10) 310 | 311 | # Test training 312 | agent.train(batch_size=1, num_updates=1) 313 | 314 | # Test evaluation 315 | td = env.reset() 316 | td1 = agent.get_action(td) 317 | td2 = agent.get_action(td) 318 | 319 | assert not torch.allclose(td1["action"], td2["action"]) 320 | 321 | agent.eval() 322 | td = env.reset() 323 | eval_td1 = agent.get_eval_action(td) 324 | eval_td2 = agent.get_eval_action(td) 325 | 326 | assert torch.allclose(eval_td1["action"], 
eval_td2["action"]) 327 | -------------------------------------------------------------------------------- /tests/test_env_sim.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | from bricksrl.environments import make_env 4 | from experiments.helper.agents import get_agent 5 | from hydra import compose, initialize 6 | 7 | from tests.test_agents import collection_round 8 | 9 | 10 | @pytest.mark.parametrize( 11 | "agent", 12 | ["sac", "td3", "random"], 13 | ) 14 | @pytest.mark.parametrize("env", ["walker_sim-v0", "roboarm_sim-v0"]) 15 | @pytest.mark.parametrize( 16 | "device", 17 | ["cpu", "cuda"], 18 | ) 19 | def test_agent_in_sim_env(agent, env, device): 20 | if torch.cuda.is_available() and device == "cuda": 21 | device = "cuda" 22 | else: 23 | device = "cpu" 24 | with initialize(config_path="../conf"): 25 | cfg = compose( 26 | config_name="config", 27 | overrides=["agent=" + agent, "device=" + device, "env=" + env], 28 | ) 29 | 30 | # Create environment 31 | env, action_space, state_space = make_env(cfg) 32 | # Create agent 33 | agent, _ = get_agent(action_space, state_space, cfg) 34 | # Test data collection 35 | collection_round(env, agent, max_steps=10) 36 | --------------------------------------------------------------------------------
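Read together, the experiment scripts above and tests/test_env_sim.py spell out how the pieces compose: Hydra builds the config, make_env turns it into a TorchRL environment plus action and state specs, get_agent constructs the agent from those specs, and data collection follows the get_action, env.step, add_experience, step_mdp pattern before agent.train draws batches from the replay buffer. The sketch below strings those calls together against the simulated walker env so it runs without hardware. It is only a minimal illustration: it assumes it is saved as a standalone script in the repository root (for example after pip install -e ., so config_path="conf" resolves and the bricksrl and experiments packages are importable), and the step count and single update are arbitrary choices rather than values taken from the repo.

from hydra import compose, initialize
from torchrl.envs.utils import step_mdp

from bricksrl.environments import make_env
from experiments.helper.agents import get_agent


def main():
    # Compose the same Hydra config the experiment scripts load, but pick the
    # simulated walker env so no robot or Bluetooth connection is required.
    with initialize(version_base=None, config_path="conf"):
        cfg = compose(
            config_name="config",
            overrides=["agent=sac", "device=cpu", "env=walker_sim-v0"],
        )

    env, action_space, state_space = make_env(cfg)
    agent, _ = get_agent(action_space, state_space, cfg)

    # Collect a handful of transitions, mirroring collection_round() in
    # tests/test_agents.py: act, step, store the transition, advance the MDP.
    td = env.reset()
    for _ in range(10):
        td = agent.get_action(td)
        td = env.step(td)
        agent.add_experience(td)
        td = step_mdp(td)

    # One small update from the replay buffer, then clean up.
    loss_info = agent.train(batch_size=cfg.agent.batch_size, num_updates=1)
    print(loss_info)
    env.close()


if __name__ == "__main__":
    main()

Swapping the overrides, e.g. agent=td3 or env=roboarm_sim-v0, follows the same pattern the parametrized tests use.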